/*
 * Finish the T-matrix exchange by blocking in shmem_quiet() until the
 * one-sided OpenSHMEM operations previously issued by this PE have completed.
 *
 * comm is not read by this implementation.  The MPI predecessor of this
 * routine instead called MPI_Wait on every
 * comm.tmatFrom[i].communicationRequest[j] (pending receives) and on every
 * comm.tmatTo[i].communicationRequest[j] (pending sends).
 */
void finalizeTmatCommunication(LSMSCommunication &comm)
{
  // TODO: make it non-blocking ... beware of the memcpy & temp_buff
  shmem_quiet();
}
Beispiel #2
0
/*
 * Self-test for shmem_long_get_nbi().
 *
 * PE 0 fetches the 10-element `source` array from every PE (itself included)
 * into `target`, completes the non-blocking get with shmem_quiet(), and
 * compares the result element by element against its own copy of `source`.
 * Any mismatch is reported on stderr and terminates the whole job with exit
 * status 1 through shmem_global_exit().
 */
int
main(int argc, char* argv[])
{
    int failed = 0;

    shmem_init();

    if (shmem_my_pe() == 0) {
        const int num_pes = shmem_n_pes();
        int pe;

        for (pe = 0; pe < num_pes; pe++) {
            int idx;

            /* Start from a clean buffer so stale data cannot pass the check. */
            memset(target, 0, sizeof(long) * 10);
            shmem_long_get_nbi(target, source, 10, pe);
            shmem_quiet();

            for (idx = 0; idx < 10; idx++) {
                if (source[idx] == target[idx])
                    continue;

                fprintf(stderr,"[%d] get_nbi from PE %d: target[%d] = %ld, expected %ld\n",
                        shmem_my_pe(), pe, idx, target[idx], source[idx]);
                failed = 1;
            }

            if (failed)
                shmem_global_exit(1);
        }
    }

    shmem_finalize();

    return 0;
}
Beispiel #3
0
static void
test_prepost(void)
{
    int i, j, k;

    tmp = 0;
    total = 0;

    shmem_barrier_all();

    for (i = 0 ; i < niters - 1 ; ++i) {
        cache_invalidate();

        shmem_barrier_all();

        tmp = timer();
        for (j = 0 ; j < npeers ; ++j) {
            for (k = 0 ; k < nmsgs ; ++k) {
                shmem_putmem(recv_buf + (nbytes * (k + j * nmsgs)), 
                             send_buf + (nbytes * (k + j * nmsgs)), 
                             nbytes, send_peers[npeers - j - 1]);
            }
        }
        shmem_quiet();
        shmem_short_wait((short*) (recv_buf + (nbytes * ((nmsgs - 1) + (npeers - 1) * nmsgs))), 0);
        total += (timer() - tmp);
        memset(recv_buf, 0, npeers * nmsgs * nbytes);
    }

    shmem_double_sum_to_all(&tmp, &total, 1, 0, 0, world_size, reduce_pWrk, reduce_pSync);
    display_result("pre-post", (niters * npeers * nmsgs * 2) / (tmp / world_size));
}
Beispiel #4
0
/*
 * Lock-based fetch-and-op on a remote symmetric object.
 *
 * Under the per-PE lock the current remote value is fetched, optionally
 * returned through prev, combined with `value` by op->o_func.c_fn, and the
 * result is written back and completed with shmem_quiet().
 *
 * target  symmetric address of the object on PE `pe`
 * prev    receives the value fetched before the update; may be NULL
 * value   operand, carried by value in a 64-bit integer
 * size    size in bytes of the object (temp storage is a long long)
 * pe      target PE
 * op      operation descriptor supplying c_fn and dt_size
 *
 * Returns the status of the get, or of the put when the get succeeded.
 * When the get fails, neither *prev nor the remote object is modified.
 */
static inline
int mca_atomic_basic_fop(void *target,
                         void *prev,
                         uint64_t value,
                         size_t size,
                         int pe,
                         struct oshmem_op_t *op)
{
    int rc = OSHMEM_SUCCESS;
    long long temp_value = 0;

    atomic_basic_lock(pe);

    rc = MCA_SPML_CALL(get(target, size, (void*)&temp_value, pe));

    /* Only act on the fetched value if the fetch actually succeeded;
     * otherwise temp_value does not reflect the remote object. */
    if (rc == OSHMEM_SUCCESS) {
        if (prev != NULL) {
            memcpy(prev, (void*) &temp_value, size);
        }

        /* c_fn receives operand buffers plus an element count, so it must be
         * given the ADDRESS of the operand, not the operand reinterpreted as
         * a pointer.
         * NOTE(review): for size < 8 this reads the first `size` bytes of
         * `value`, i.e. the low-order bytes on little-endian hosts only --
         * confirm for big-endian targets. */
        op->o_func.c_fn((void*) &value,
                        (void*) &temp_value,
                        size / op->dt_size);

        rc = MCA_SPML_CALL(put(target, size, (void*)&temp_value, pe));
        shmem_quiet();
    }

    atomic_basic_unlock(pe);

    return rc;
}
Beispiel #5
0
static void
test_one_way(void)
{
    int i, k;
    int pe_size  = world_size;

    tmp = 0;
    total = 0;

    shmem_barrier_all();

    if (world_size % 2 == 1) {
        pe_size = world_size - 1;
    }

    if (!(world_size % 2 == 1 && rank == (world_size - 1))) {
        if (rank < world_size / 2) {
            for (i = 0 ; i < niters ; ++i) {
                cache_invalidate();

                shmem_barrier(0, 0, pe_size, barrier_pSync);

                tmp = timer();
                for (k = 0 ; k < nmsgs ; ++k) {
                    shmem_putmem(recv_buf + (nbytes * k), 
                                 send_buf + (nbytes * k), 
                                 nbytes, rank + (world_size / 2));
                }
                shmem_quiet();
                total += (timer() - tmp);
            }
        } else {
            for (i = 0 ; i < niters ; ++i) {
                cache_invalidate();

                shmem_barrier(0, 0, pe_size, barrier_pSync);

                tmp = timer();
                shmem_short_wait((short*) (recv_buf + (nbytes * (nmsgs - 1))), 0);
                total += (timer() - tmp);
                memset(recv_buf, 0, npeers * nmsgs * nbytes);
            }
        }

        shmem_double_sum_to_all(&tmp, &total, 1, 0, 0, pe_size, reduce_pWrk, reduce_pSync);
        display_result("single direction", (niters * nmsgs) / (tmp / world_size));
    }

    shmem_barrier_all();
}
/*
 * Functional test for the non-blocking put family (shmem_putmem_nb,
 * shmem_put16/32/64/128_nb and the generic shmem_put_nb).
 *
 * PEs are paired (even PE -> even PE + 1), so an even PE count is required.
 * Every sub-test follows the same protocol:
 *   - the even PE fills its source buffer with (my_pe + j); the odd PE fills
 *     its target buffer with its own (my_pe + j), which differs from the
 *     expected data;
 *   - after a global barrier the even PE issues the non-blocking put,
 *     completes it with shmem_quiet() and then raises `flag` on its partner;
 *   - the odd PE waits for `flag` to leave 0 and checks that every element
 *     equals (sender PE + j), i.e. (my_pe + j - 1), printing FAIL otherwise.
 *
 * Returns 0; exits with status 1 when run on an odd number of PEs.
 */
int main(int argc, char **argv)
{
  int j;
  int my_pe,n_pes;
  int *flag,*one;
  size_t max_elements,max_elements_bytes;

  char *srce_char,*targ_char;
  short *srce_short,*targ_short;
  int *srce_int,*targ_int;
  long *srce_long,*targ_long;

  shmem_init();
  my_pe = shmem_my_pe();
  n_pes = shmem_n_pes();
  flag = shmem_malloc((size_t) sizeof(int));
  one  = shmem_malloc((size_t) sizeof(int));
  /* Both are dereferenced immediately below, so fail like the data buffers do. */
  if((flag == NULL) || (one == NULL))
    shmalloc_error();
  *one  = 1;

/*  fail if trying to use odd number of processors  */
  if ( (n_pes % 2) != 0 ){
        fprintf(stderr, "FAIL - test requires even number of PEs\n");
        exit(1);
  }

  if(my_pe == 0)
    fprintf(stderr, "shmem_num_put_nb(%s)\n", argv[0]);

/*  shmem_putmem_nb test   */
  *flag = 0;
  max_elements = (size_t) (MAX_SIZE / sizeof(char));
  max_elements_bytes = (size_t) (sizeof(char)*max_elements);
  if(my_pe == 0)
    fprintf(stderr,"shmem_putmem_nb         max_elements = %zu\n",max_elements);
  srce_char = shmem_malloc(max_elements_bytes);
  targ_char = shmem_malloc(max_elements_bytes);
  if((srce_char == NULL) || (targ_char == NULL))
    shmalloc_error();
  if ( (my_pe % 2) == 0 )
    for(j = 0; j < max_elements; j++)
      srce_char[j] = (char)(my_pe+j);
  else
    for(j = 0; j < max_elements; j++)
      targ_char[j] = (char)(my_pe+j);
  shmem_barrier_all();
  if ( (my_pe % 2) == 0 ) {
    shmem_putmem_nb(targ_char,srce_char,max_elements,my_pe+1,NULL);
    shmem_quiet();
    shmem_int_put(flag,one,(size_t)1,my_pe+1);
  } else {
    shmem_int_wait(flag,0);
    for(j = 0; j < max_elements; j++)
      if ( targ_char[j] != (char)(my_pe+j-1) )
        fprintf(stderr, "FAIL: PE [%d] targ_char[%d]=%d my_pe+j-1=%d\n",
                               my_pe,j,targ_char[j],my_pe+j-1);
  }
  shmem_free(srce_char);  shmem_free(targ_char);

/*  shmem_put16_nb test   */
  *flag = 0;
  max_elements = (size_t) (MAX_SIZE / sizeof(short));
  /* Capped at 20000 elements for the 16-bit variant. */
  if(max_elements > 20000) max_elements=20000;
  max_elements_bytes = (size_t) (sizeof(short)*max_elements);
  if(my_pe == 0)
    fprintf(stderr,"shmem_put16_nb          max_elements = %zu\n",max_elements);
  srce_short = shmem_malloc(max_elements_bytes);
  targ_short = shmem_malloc(max_elements_bytes);
  if((srce_short == NULL) || (targ_short == NULL))
    shmalloc_error();
  if ( (my_pe % 2) == 0 )
    for(j = 0; j < max_elements; j++)
      srce_short[j] = (short)(my_pe+j);
  else
    for(j = 0; j < max_elements; j++)
      targ_short[j] = (short)(my_pe+j);
  shmem_barrier_all();
  if ( (my_pe % 2) == 0 ) {
    shmem_put16_nb(targ_short,srce_short,max_elements,my_pe+1,NULL);
    shmem_quiet();
    shmem_int_put(flag,one,(size_t)1,my_pe+1);
  } else {
    shmem_int_wait(flag,0);
    for(j = 0; j < max_elements; j++)
      if ( targ_short[j] != (short)(my_pe+j-1) )
        fprintf(stderr, "FAIL: PE [%d] targ_short[%d]=%d my_pe+j-1=%d\n",
                               my_pe,j,targ_short[j],my_pe+j-1);
  }
  shmem_free(srce_short);  shmem_free(targ_short);

/*  shmem_put32_nb test   */
  *flag = 0;
  max_elements = (size_t) (MAX_SIZE / sizeof(int));
  max_elements_bytes = (size_t) (sizeof(int)*max_elements);
  if(my_pe == 0)
    fprintf(stderr,"shmem_put32_nb          max_elements = %zu\n",max_elements);
  srce_int = shmem_malloc(max_elements_bytes);
  targ_int = shmem_malloc(max_elements_bytes);
  if((srce_int == NULL) || (targ_int == NULL))
    shmalloc_error();
  if ( (my_pe % 2) == 0 )
    for(j = 0; j < max_elements; j++)
      srce_int[j] = (int)(my_pe+j);
  else
    for(j = 0; j < max_elements; j++)
      targ_int[j] = (int)(my_pe+j);
  shmem_barrier_all();
  if ( (my_pe % 2) == 0 ) {
    shmem_put32_nb(targ_int,srce_int,max_elements,my_pe+1,NULL);
    shmem_quiet();
    shmem_int_put(flag,one,(size_t)1,my_pe+1);
  } else {
    shmem_int_wait(flag,0);
    for(j = 0; j < max_elements; j++)
      if ( targ_int[j] != (int)(my_pe+j-1) )
        fprintf(stderr, "FAIL: PE [%d] targ_int[%d]=%d my_pe+j-1=%d\n",
                               my_pe,j,targ_int[j],my_pe+j-1);
  }
  shmem_free(srce_int);  shmem_free(targ_int);

/*  shmem_put64_nb test   */
  *flag = 0;
  max_elements = (size_t) (MAX_SIZE / sizeof(long));
  max_elements_bytes = (size_t) (sizeof(long)*max_elements);
  if(my_pe == 0)
    fprintf(stderr,"shmem_put64_nb          max_elements = %zu\n",max_elements);
  srce_long = shmem_malloc(max_elements_bytes);
  targ_long = shmem_malloc(max_elements_bytes);
  if((srce_long == NULL) || (targ_long == NULL))
    shmalloc_error();
  if ( (my_pe % 2) == 0 )
    for(j = 0; j < max_elements; j++)
      srce_long[j] = (long)(my_pe+j);
  else
    for(j = 0; j < max_elements; j++)
      targ_long[j] = (long)(my_pe+j);
  shmem_barrier_all();
  if ( (my_pe % 2) == 0 ) {
    shmem_put64_nb(targ_long,srce_long,max_elements,my_pe+1,NULL);
    shmem_quiet();
    shmem_int_put(flag,one,(size_t)1,my_pe+1);
  } else {
    shmem_int_wait(flag,0);
    for(j = 0; j < max_elements; j++)
      if ( targ_long[j] != (long)(my_pe+j-1) )
        fprintf(stderr, "FAIL: PE [%d] targ_long[%d]=%ld my_pe+j-1=%d\n",
                               my_pe,j,targ_long[j],my_pe+j-1);
  }
  shmem_free(srce_long);  shmem_free(targ_long);

/*  shmem_put128_nb test   */
  *flag = 0;
  max_elements = (size_t) (MAX_SIZE / sizeof(long));
  /* Each 128-bit element is stored as two longs: round the long count down
   * to an even number, size the buffers from it, then halve it to get the
   * number of 128-bit elements. */
  if ( (max_elements % 2) != 0)
    max_elements = max_elements-1;
  max_elements_bytes = (size_t) (sizeof(long)*max_elements);
  max_elements = max_elements/2;
  if(my_pe == 0)
    fprintf(stderr,"shmem_put128_nb         max_elements = %zu\n",max_elements);
  srce_long = shmem_malloc(max_elements_bytes);
  targ_long = shmem_malloc(max_elements_bytes);
  if((srce_long == NULL) || (targ_long == NULL))
    shmalloc_error();
  if ( (my_pe % 2) == 0 )
    for(j = 0; j < 2*max_elements; j++)
      srce_long[j] = (long)(my_pe+j);
  else
    for(j = 0; j < 2*max_elements; j++)
      targ_long[j] = (long)(my_pe+j);
  shmem_barrier_all();
  if ( (my_pe % 2) == 0 ) {
    shmem_put128_nb(targ_long,srce_long,max_elements,my_pe+1,NULL);
    shmem_quiet();
    shmem_int_put(flag,one,(size_t)1,my_pe+1);
  } else {
    shmem_int_wait(flag,0);
    for(j = 0; j < 2*max_elements; j++)
      if ( targ_long[j] != (long)(my_pe+j-1) )
        fprintf(stderr, "FAIL: PE [%d] targ_long[%d]=%ld my_pe+j-1=%d\n",
                               my_pe,j,targ_long[j],my_pe+j-1);
  }
  shmem_free(srce_long);  shmem_free(targ_long);

#ifdef SHMEM_C_GENERIC_32

/*  shmem_put_nb (GENERIC 32) test   */
  *flag = 0;
  max_elements = (size_t) (MAX_SIZE / sizeof(int));
  max_elements_bytes = (size_t) (sizeof(int)*max_elements);
  if(my_pe == 0)
    fprintf(stderr,"shmem_put_nb (GENERIC 32)  max_elements = %zu\n",max_elements);
  srce_int = shmem_malloc(max_elements_bytes);
  targ_int = shmem_malloc(max_elements_bytes);
  if((srce_int == NULL) || (targ_int == NULL))
    shmalloc_error();
  if ( (my_pe % 2) == 0 )
    for(j = 0; j < max_elements; j++)
      srce_int[j] = (int)(my_pe+j);
  else
    for(j = 0; j < max_elements; j++)
      targ_int[j] = (int)(my_pe+j);
  shmem_barrier_all();
  if ( (my_pe % 2) == 0 ) {
    shmem_put_nb(targ_int,srce_int,max_elements,my_pe+1,NULL);
    shmem_quiet();
    shmem_int_put(flag,one,(size_t)1,my_pe+1);
  } else {
    shmem_int_wait(flag,0);
    for(j = 0; j < max_elements; j++)
      if ( targ_int[j] != (int)(my_pe+j-1) )
        fprintf(stderr, "FAIL: PE [%d] targ_int[%d]=%d my_pe+j-1=%d\n",
                               my_pe,j,targ_int[j],my_pe+j-1);
  }
  shmem_free(srce_int);  shmem_free(targ_int);

#else

/*  shmem_put_nb (GENERIC 64) test   */
  *flag = 0;
  max_elements = (size_t) (MAX_SIZE / sizeof(long));
  max_elements_bytes = (size_t) (sizeof(long)*max_elements);
  if(my_pe == 0)
    fprintf(stderr,"shmem_put_nb (GENERIC 64)  max_elements = %zu\n",max_elements);
  srce_long = shmem_malloc(max_elements_bytes);
  targ_long = shmem_malloc(max_elements_bytes);
  if((srce_long == NULL) || (targ_long == NULL))
    shmalloc_error();
  if ( (my_pe % 2) == 0 )
    for(j = 0; j < max_elements; j++)
      srce_long[j] = (long)(my_pe+j);
  else
    for(j = 0; j < max_elements; j++)
      targ_long[j] = (long)(my_pe+j);
  shmem_barrier_all();
  if ( (my_pe % 2) == 0 ) {
    shmem_put_nb(targ_long,srce_long,max_elements,my_pe+1,NULL);
    shmem_quiet();
    shmem_int_put(flag,one,(size_t)1,my_pe+1);
  } else {
    shmem_int_wait(flag,0);
    for(j = 0; j < max_elements; j++)
      if ( targ_long[j] != (long)(my_pe+j-1) )
        fprintf(stderr, "FAIL: PE [%d] targ_long[%d]=%ld my_pe+j-1=%d\n",
                               my_pe,j,targ_long[j],my_pe+j-1);
  }
  shmem_free(srce_long);  shmem_free(targ_long);

#endif

#ifdef NEEDS_FINALIZE
  shmem_finalize();
#endif
  return 0;
}
/*
 * Put benchmark between PE 1 and PE 3.
 *
 * For every message size from 1 byte up to MAX_MSG_SIZE (doubling each step)
 * the two PEs run ITER_CNT rounds of: put the payload, signal the partner
 * through `isdone`, wait for the partner's signal, put the payload again.
 * PE 1 prints one "size;time;size/time" line per message size.  All other
 * PEs only take part in the global barriers.
 *
 * The benchmark addresses PE 3 directly, so it cannot run with fewer than
 * four PEs; in that case every PE shuts down cleanly and returns 1.
 * Returns 0 otherwise.
 */
int main(int argc, char **argv)
{
  const int ITER_CNT = 100;
  const long int MAX_MSG_SIZE = 1048576;
  int* source_addr;
  int peer = -1;                /* only PEs 1 and 3 are assigned a partner */
  long int i=0, buff_size;
  int j=0;
  long long int start_time, stop_time, res;
  double time;

  shmem_init();

  int pe_id = shmem_my_pe();
  int n_pes = shmem_n_pes();

  /* Every PE sees the same n_pes, so all of them take this exit together and
   * nobody is left waiting in a collective. */
  if (n_pes < 4) {
      if (pe_id == 0)
          fprintf(stderr,"Num PEs should be ==4\n");
      shmem_finalize();
      return 1;
  }

  source_addr = (int*) shmem_malloc(MAX_MSG_SIZE);

  if(pe_id == 1) {
      if(n_pes!=4)
          fprintf(stderr,"Num PEs should be ==4");
      printf("#Message Cnt;Time(s);MR(msgs/sec)\n");
  }

  if (pe_id==1)
      peer = 3;
  else if(pe_id==3)
      peer = 1;
  get_rtc_res_(&res);

  for (i = 0; i < SHMEM_BARRIER_SYNC_SIZE; i += 1){
      pSync[i] = SHMEM_SYNC_VALUE;
  }

  /* Make sure every PE has initialised pSync before it is used below. */
  shmem_barrier_all();
  if(pe_id == 1 || pe_id == 3) {

      for(buff_size=1; buff_size<=MAX_MSG_SIZE; buff_size*=2) {
          isdone=0;
          /* Active set: start PE 1, stride 2^1, 2 members -> PEs 1 and 3. */
          shmem_barrier(1,1,2,pSync);
          get_rtc_(&start_time);
          for(j=1;j<=ITER_CNT;j++) {
              shmem_putmem(source_addr, source_addr, buff_size, peer);
              shmem_quiet();
              /* Tell the partner that round j's payload is complete ... */
              shmem_int_put(&isdone, &j, 1, peer);
              shmem_quiet();
              /* ... and wait until its own signal moved isdone off j-1. */
              shmem_int_wait(&isdone,j-1);
              shmem_putmem(source_addr, source_addr, buff_size, peer);
              shmem_quiet();
          }
          shmem_barrier(1,1,2,pSync);
          get_rtc_(&stop_time);
          time = (stop_time - start_time)*1.0/(double)res/ITER_CNT;
          if(pe_id == 1) {
              printf("%20ld;%20.12f;%20.12f\n",
                     buff_size, time, (double)buff_size/time);
          }
          fflush(stdout);
      }
  }

  shmem_barrier_all();
  shmem_finalize();

  return 0;
}
Beispiel #8
0
/*
 * Test of the shmem_*_swap() routines for int, float, double, long and
 * long long.
 *
 * Each pass of the outer loop allocates one symmetric array per type holding
 * a "src" half and a "dst" half of num_procs elements each.  PE 0 then, for
 * every other PE j, swaps a zero into element j of that PE's src array,
 * checks that the original value came back, reads the remote element to
 * confirm the zero landed, swaps the original value back and re-reads it.
 * The non-zero PEs wait for their int / long / long long element to become 0
 * and otherwise only take part in the barriers.
 *
 * The barrier calls in the PE 0 branch and in the else branch must stay
 * matched one-to-one (three inside each branch), since shmem_barrier_all()
 * is collective.
 *
 * Any failed check prints a diagnostic and exits with status 1.
 */
int
main(int argc, char* argv[])
{
    int me, num_procs, l, j;
    int Verbose = 0;

    start_pes(0);
    me = _my_pe();
    num_procs = _num_pes();
    if ( num_procs < 2 ) {
        if (me ==0)
            printf("PE[0] requires 2 or more PEs?\n");
        return 1;
    }

    for (l = 0 ; l < loops ; ++l) {

        /* One allocation per type: first num_procs elements are "src",
         * the following num_procs elements are "dst". */
        if ((src_int = shmalloc(2*num_procs*sizeof(int))) == NULL) {
            printf("PE-%d int shmalloc() failed?\n", me);
            exit(1);
        }
        dst_int = &src_int[num_procs];
        for(j=0; j < num_procs; j++) {
            src_int[j] = 4;
            dst_int[j] = 0;
        }

        if ((src_float = shmalloc(2*num_procs*sizeof(float))) == NULL) {
            printf("PE-%d float shmalloc() failed?\n", me);
            exit(1);
        }
        dst_float = &src_float[num_procs];
        for(j=0; j < num_procs; j++) {
            src_float[j] = 4.0;
            dst_float[j] = 0.0;
        }

        if ((src_double = shmalloc(2*num_procs*sizeof(double))) == NULL) {
            printf("PE-%d double shmalloc() failed?\n", me);
            exit(1);
        }
        dst_double = &src_double[num_procs];
        for(j=0; j < num_procs; j++) {
            src_double[j] = 8.0;
            dst_double[j] = 0.0;
        }

        if ((src_long = shmalloc(2*num_procs*sizeof(long))) == NULL) {
            printf("PE-%d long shmalloc() failed?\n", me);
            exit(1);
        }
        dst_long = &src_long[num_procs];
        for(j=0; j < num_procs; j++) {
            src_long[j] = 8;
            dst_long[j] = 0;
        }

        if ((src_llong = shmalloc(2*num_procs*sizeof(long long))) == NULL) {
            printf("PE-%d long shmalloc() failed?\n", me);
            exit(1);
        }
        dst_llong = &src_llong[num_procs];
        for(j=0; j < num_procs; j++) {
            src_llong[j] = 16;
            dst_llong[j] = 0;
        }

        /* All PEs have initialised their arrays before anyone touches them. */
        shmem_barrier_all();

        if ( me != 0 ) {
            /* is 'src_*' accessible from PE0? should be. */
            if (!shmem_addr_accessible(src_int,0)) {
                printf("PE-%d local src_int %p not accessible from PE-%d?\n",
                       me, (void*)src_int, 0);
                exit(1);
            }
            if (!shmem_addr_accessible(src_float,0)) {
                printf("PE-%d local src_float %p not accessible from PE-%d?\n",
                       me, (void*)src_float, 0);
                exit(1);
            }
            if (!shmem_addr_accessible(src_double,0)) {
                printf("PE-%d local src_double %p not accessible from PE-%d?\n",
                       me, (void*)src_double, 0);
                exit(1);
            }
            if (!shmem_addr_accessible(src_long,0)) {
                printf("PE-%d local src_long %p not accessible from PE-%d?\n",
                       me, (void*)src_long, 0);
                exit(1);
            }
            if (!shmem_addr_accessible(src_llong,0)) {
                printf("PE-%d local src_llong %p not accessible from PE-%d?\n",
                       me, (void*)src_llong, 0);
                exit(1);
            }
        }
        shmem_barrier_all();

        if ( me == 0 ) {
            shmem_quiet();
            /* int: swap 0 into element j on PE j, expect the initial 4 back */
            for(j=1; j < num_procs; j++) {
                dst_int[j] = shmem_int_swap(src_int+j,0,j);
                if (dst_int[j] != 4) {
                    printf("PE-%d dst_int[%d] %d != 4?\n",me,j,dst_int[j]);
                    exit(1);
                }
            }
            /* matches the barrier after shmem_int_wait_until() on the other PEs */
            shmem_barrier_all();

            /* verify remote data */
            for(j=1; j < num_procs; j++) {
                itmp = shmem_int_g(src_int+j,j);
                if (itmp != 0) {
                    printf("PE-0 int PE[%d] rem(%d) != 0?\n",j,itmp);
                    exit(1);
                }

                /* swap back */
                dst_int[j] = shmem_int_swap(src_int+j,dst_int[j],j);
                if (dst_int[j] != 0) {
                    printf("PE-0 dst_int[%d] %d != 0?\n",j,dst_int[j]);
                    exit(1);
                }

                itmp = shmem_int_g(src_int+j,j);
                if (itmp != 4) {
                    printf("PE-0 PE[%d] rem %d != 4?\n",j,itmp);
                    exit(1);
                }
            }

            /* float: swap, verify, swap back and re-verify per PE; no barrier
             * is used for this type */
            for(j=1; j < num_procs; j++) {
                dst_float[j] = shmem_float_swap(src_float+j,0.0,j);
                if (dst_float[j] != 4.0) {
                    printf("PE-0 dst_float[%d] %f != 4.0?\n",j,dst_float[j]);
                    exit(1);
                }

                /* verify remote data */
                ftmp = shmem_float_g(src_float+j,j);
                if (ftmp != 0.0) {
                    printf("PE-0 float rem(%f) != 0.0?\n",ftmp);
                    exit(1);
                }
                /* swap back */
                dst_float[j] = shmem_float_swap(src_float+j,dst_float[j],j);
                if (dst_float[j] != 0.0) {
                    printf("PE-0 dst_float[%d] %f != 0.0?\n",j,dst_float[j]);
                    exit(1);
                }
                ftmp = shmem_float_g(src_float+j,j);
                if (ftmp != 4.0) {
                    printf("PE-%d float rem(%f) != 4.0?\n",me,ftmp);
                    exit(1);
                }
            }

            /* double: same sequence as float, initial value 8.0 */
            for(j=1; j < num_procs; j++) {
                dst_double[j] = shmem_double_swap(src_double+j,0.0,j);
                if (dst_double[j] != 8.0) {
                    printf("PE-0 dst_double[%d] %f != 8.0?\n",j,dst_double[j]);
                    exit(1);
                }
                /* verify remote data */
                dtmp = shmem_double_g(src_double+j,j);
                if (dtmp != 0.0) {
                    printf("PE-0 float rem(%f) != 0.0?\n",dtmp);
                    exit(1);
                }
                dst_double[j] = shmem_double_swap(src_double+j,dst_double[j],j);
                if (dst_double[j] != 0.0) {
                    printf("PE-0 dst_double[%d] %f != 0.0?\n",j,dst_double[j]);
                    exit(1);
                }
                dtmp = shmem_double_g(src_double+j,j);
                if (dtmp != 8.0) {
                    printf("PE-0 double rem(%f) != 8.0?\n",dtmp);
                    exit(1);
                }
            }

            /* long: swap 0 in, expect the initial 8 back */
            for(j=1; j < num_procs; j++) {
                dst_long[j] = shmem_long_swap(src_long+j,0,j);
                if (dst_long[j] != 8) {
                    printf("PE-0 dst_long[%d] %ld != 8?\n",j,dst_long[j]);
                    exit(1);
                }
            }
            /* matches the barrier after shmem_long_wait_until() on the other PEs */
            shmem_barrier_all();

            /* verify remote data */
            for(j=1; j < num_procs; j++) {
                ltmp = shmem_long_g(src_long+j,j);
                if (ltmp != 0) {
                    printf("PE-0 PE[%d]long rem(%ld) != 0?\n",j,ltmp);
                    exit(1);
                }
                /* swap back */
                dst_long[j] = shmem_long_swap(src_long+j,dst_long[j],j);
                if (dst_long[j] != 0) {
                    printf("PE-%d dst_long[%d] %ld != 0?\n",me,j,dst_long[j]);
                    exit(1);
                }
                ltmp = shmem_long_g(src_long+j,j);
                if (ltmp != 8) {
                    printf("PE-%d long rem(%ld) != 8?\n",me,ltmp);
                    exit(1);
                }
            }

            /* long long: swap 0 in, expect the initial 16 back */
            for(j=1; j < num_procs; j++) {
                dst_llong[j] = shmem_longlong_swap(src_llong+j,0,j);
                if (dst_llong[j] != 16) {
                    printf("PE-%d dst_llong[%d] %lld != 16?\n",me,j,dst_llong[j]);
                    exit(1);
                }
            }
            /* matches the barrier after shmem_longlong_wait_until() on the other PEs */
            shmem_barrier_all();

            /* verify remote data */
            for(j=1; j < num_procs; j++) {
                lltmp = shmem_longlong_g(src_llong+j,j);
                if (lltmp != 0) {
                    printf("PE-%d long long rem(%lld) != 0?\n",me,lltmp);
                    exit(1);
                }
                /* swap back */
                dst_llong[j] = shmem_longlong_swap(src_llong+j,dst_llong[j],j);
                if (dst_llong[j] != 0) {
                    printf("PE-%d  dst_llong[%d] %lld != 0?\n", me,j,dst_llong[j]);
                    exit(1);
                }
                lltmp = shmem_longlong_g(src_llong+j,j);
                if (lltmp != 16) {
                    printf("PE-%d longlong rem(%lld) != 16?\n",me,lltmp);
                    exit(1);
                }
            }
        }
        else {
            /* Wait for PE 0's swap to zero our own element of each integer
             * type; each wait is followed by the barrier that PE 0 enters
             * after finishing the corresponding swap loop. */
            shmem_int_wait_until(&src_int[me],SHMEM_CMP_EQ,0);
            shmem_barrier_all();

            shmem_long_wait_until(&src_long[me],SHMEM_CMP_EQ,0);
            shmem_barrier_all();

            shmem_longlong_wait_until(&src_llong[me],SHMEM_CMP_EQ,0);
            shmem_barrier_all();
        }

        /* PE 0 has finished all remote accesses before the arrays are freed. */
        shmem_barrier_all();

        shfree(src_int);
        shfree(src_float);
        shfree(src_double);
        shfree(src_long);
        shfree(src_llong);
    }

    if (Verbose)
        fprintf(stderr,"[%d] exit\n",_my_pe());

    return 0;
}
Beispiel #9
0
/*
 * One exchange round of a hypercube quicksort over `npes` PEs.
 *
 * Steps performed by every PE:
 *   1. take A[N/2] as pivot candidate and broadcast PE 0's value to the others;
 *   2. call uplowPartition() to obtain the split index `check` of A;
 *   3. PEs in the lower half (me < npes/2) send A[check..N-1] to PE
 *      me + npes/2; PEs in the upper half send A[0..check-1] to PE
 *      me - npes/2.  The element count travels in nelems_import[0], the
 *      data in temp_arr;
 *   4. the part that was kept locally is appended to the imported elements
 *      in temp_arr, temp_arr is sorted with quicksort() and copied back to A.
 *
 * A     local array (N elements are read); overwritten with the merged,
 *       sorted result
 * N     number of local elements on entry
 * npes  number of PEs taking part
 *
 * Returns the number of elements stored in A after the round.
 *
 * NOTE(review): the returned count can exceed N (imported + kept elements);
 * confirm that A and temp_arr are allocated large enough for that.
 */
int hyperquick(int *A, int N, int npes){
	int i, j;
	int check;	/* split index of A reported by uplowPartition() */
	int size;	/* number of elements in temp_arr after merging */
	const int lower_half = (me < npes/2);

	/* Step 1: pick the local median candidate and broadcast PE 0's choice. */
	next_pivot = A[N/2];
	shmem_broadcast32(&next_pivot,&next_pivot,1,0,0,0,npes,pSync);
	shmem_barrier_all();

	printf("the new pivot of process %d: %d\n",me,next_pivot); // to check the broadcast of new pivots

	/* Step 2: split A around the pivot. */
	shmem_barrier_all();
	check = uplowPartition(A, next_pivot, N*npes, npes);
	shmem_barrier_all();
	printf("(%d)",me);
	for(j=0;j<N;j++){
		printf("%d, ",A[j]);
	}
	printf("new partition: %d",check);

	shmem_barrier_all();
	printf("\n");

	/* Step 3a: lower half ships its part from index check upwards. */
	if(lower_half){
		printf("\n");
		pe = me +npes/2;
		nelems[0] = N - check;
		printf (" process %d pe : %d nelems : %d\n",me,pe,nelems[0]);//to test the value

		/* %p with a void* argument: printing a pointer with %d is undefined. */
		printf("(%d) addr = %p , value = %d , pe = %d\n ",me, (void *)&nelems_import[0],nelems[0],pe);//to test the value

		shmem_int_p(nelems_import,nelems[0],pe);
		shmem_quiet();
		shmem_int_put(temp_arr,&A[check],nelems[0],pe);
	}

	shmem_barrier_all();//check if the entire barrier is needed

	/* Step 3b: upper half ships its part below index check. */
	if(!lower_half){
		pe = me-npes/2;//check if it is synced
		nelems[0]= check;
		printf (" process %d pe : %d nelems : %d\n",me,pe,nelems[0]);//to test the value
		shmem_int_p(nelems_import,nelems[0],pe);
		shmem_quiet();
		shmem_int_put(temp_arr,A,nelems[0],pe);
	}

	shmem_barrier_all();//again sync is required...check it with profiling

	/* Debug output: what this PE received from its partner. */
	printf("(%d) nelems_import = %d\n",me,nelems_import[0]);//to test the value
	printf("(%d) new elements = ",me);
	for(i=0;i<nelems_import[0];i++){
		printf("%d, ",temp_arr[i]);
	}
	printf("\n");

	/* Step 4: append the locally kept part behind the imported elements. */
	if(lower_half){
		i=0;
		for(j=nelems_import[0];j<(nelems_import[0]+check);j++){
			temp_arr[j] = A[i];
			i++;
		}
		size = (nelems_import[0]+check);
	} else {
		i=check;
		for(j=nelems_import[0];j<(nelems_import[0]+N-check);j++){
			temp_arr[j] = A[i];
			i++;
		}
		size = (nelems_import[0]+N-check);
	}

	shmem_barrier_all(); //to test if the arrays are merged properly
	printf("(%d) merged array:",me);
	for(j=0;j<size;j++){
		printf("%d, ",temp_arr[j]);
	}
	printf("\n");

	/* Sort the merged list. */
	quicksort(temp_arr,0,size-1);

	shmem_barrier_all();//test purpose only
	printf("(%d) sorted list: ",me);
	for(i=0;i<size;i++){
		printf("%d, ",temp_arr[i]);
		A[i] = temp_arr[i];
	}
	printf("\n");

	//hyperquick(A,size,npes/2);

	return size;
}
Beispiel #10
0
/*
 * JNI entry point Java_shmem_ShMem_quiet: forwards the call to shmem_quiet().
 * Neither JNI argument is used.
 */
JNIEXPORT void JNICALL Java_shmem_ShMem_quiet(JNIEnv *env, jclass clazz)
{
    (void) env;
    (void) clazz;

    shmem_quiet();
}
/*
 * Transfer one atom's data from rank `from` to rank `to` through the
 * symmetric staging buffer p2p_buf.
 *
 * Sender (comm.comm.rank == from):
 *   packs local_id and the fields of `atom` into p2p_buf, waits until the
 *   receiver has raised sync_send_flag[to], puts the packed bytes into the
 *   receiver's p2p_buf, and only after that put has completed signals the
 *   receiver through sync_recv_flag[rank].
 * Receiver (comm.comm.rank == to):
 *   announces readiness in the sender's sync_send_flag[rank], waits for
 *   sync_recv_flag[from] to become 1, then unpacks p2p_buf into local_id and
 *   `atom`, resizing the potential / core arrays when the transmitted row
 *   counts differ from the current ones.
 *
 * comm      communicator; only comm.comm.rank is read
 * from, to  sending and receiving rank
 * local_id  sent by `from`, overwritten on `to`
 * atom      sent by `from`, overwritten on `to`
 * tag       unused by this implementation
 *
 * The buffers used here are pre-allocated within initializeCommunication().
 */
void communicateSingleAtomData(LSMSCommunication &comm, int from, int to, int &local_id, AtomData &atom, int tag)
{
  int t,i;
  const int ITER_MAX=1;

  if(comm.comm.rank==from)
  {
    for (i=0;i<ITER_MAX;i++)
    {
      int pos=0;

      // Fixed-size header fields.
      memcpy(&p2p_buf[pos],&local_id,int_size); pos+=int_size;
      memcpy(&p2p_buf[pos],&atom.jmt,int_size); pos+=int_size;
      memcpy(&p2p_buf[pos],&atom.jws,int_size); pos+=int_size;
      memcpy(&p2p_buf[pos],&atom.xstart,double_size); pos+=double_size;
      memcpy(&p2p_buf[pos],&atom.rmt,double_size); pos+=double_size;
      memcpy(&p2p_buf[pos],atom.header,80*char_size); pos+=80*char_size;
      memcpy(&p2p_buf[pos],&atom.alat,double_size); pos+=double_size;
      memcpy(&p2p_buf[pos],&atom.efermi,double_size); pos+=double_size;
      memcpy(&p2p_buf[pos],&atom.vdif,double_size); pos+=double_size;
      memcpy(&p2p_buf[pos],&atom.ztotss,double_size); pos+=double_size;
      memcpy(&p2p_buf[pos],&atom.zcorss,double_size); pos+=double_size;
      memcpy(&p2p_buf[pos],atom.evec,3*double_size); pos+=3*double_size;
      memcpy(&p2p_buf[pos],&atom.nspin,int_size); pos+=int_size;
      memcpy(&p2p_buf[pos],&atom.numc,int_size); pos+=int_size;

      // Potential-sized arrays: row count first, then 2*t values each.
      t=atom.vr.n_row();

      memcpy(&p2p_buf[pos],&t,int_size); pos+=int_size;
      memcpy(&p2p_buf[pos],&atom.vr(0,0),2*t*double_size); pos+=2*t*double_size;
      memcpy(&p2p_buf[pos],&atom.rhotot(0,0),2*t*double_size); pos+=2*t*double_size;
      memcpy(&p2p_buf[pos],&atom.corden(0,0),2*t*double_size); pos+=2*t*double_size;

      // Core-state arrays: row count first, then 2*t values each.
      t=atom.ec.n_row();

      memcpy(&p2p_buf[pos],&t,int_size); pos+=int_size;
      memcpy(&p2p_buf[pos],&atom.ec(0,0),2*t*double_size); pos+=2*t*double_size;
      memcpy(&p2p_buf[pos],&atom.nc(0,0),2*t*int_size); pos+=2*t*int_size;
      memcpy(&p2p_buf[pos],&atom.lc(0,0),2*t*int_size); pos+=2*t*int_size;
      memcpy(&p2p_buf[pos],&atom.kc(0,0),2*t*int_size); pos+=2*t*int_size;

      // Wait until the receiver has announced that its buffer may be written.
      shmem_int_wait_until((sync_send_flag+to),_SHMEM_CMP_EQ,1);

      // Ship exactly the bytes that were packed; the receiver consumes the
      // same `pos` bytes, so nothing beyond them needs to be transferred.
      shmem_putmem(p2p_buf, p2p_buf, pos, to);

      // The payload must be complete at the receiver BEFORE it can observe
      // the ready flag, otherwise it may unpack stale buffer contents.
      shmem_quiet();

      shmem_int_add((sync_send_flag+to),-1,comm.comm.rank);
      shmem_int_add((sync_recv_flag+comm.comm.rank),1,to);
      shmem_quiet();
    }
  }

  if(comm.comm.rank==to)
  {
    for(i=0;i<ITER_MAX;i++)
    {
      int pos=0;

      // Arm the receive flag, tell the sender we are ready, wait for data.
      sync_recv_flag[from]=0;
      shmem_int_add((sync_send_flag+comm.comm.rank),1,from);
      shmem_quiet();
      shmem_int_wait_until((sync_recv_flag+from),_SHMEM_CMP_EQ,1);

      // Unpack in exactly the order used by the sender.
      memcpy(&local_id,&p2p_buf[pos],int_size); pos+=int_size;
      memcpy(&atom.jmt,&p2p_buf[pos],int_size); pos+=int_size;
      memcpy(&atom.jws,&p2p_buf[pos],int_size); pos+=int_size;
      memcpy(&atom.xstart,&p2p_buf[pos],double_size); pos+=double_size;
      memcpy(&atom.rmt,&p2p_buf[pos],double_size); pos+=double_size;
      memcpy(atom.header,&p2p_buf[pos],80*char_size); pos+=80*char_size;
      memcpy(&atom.alat,&p2p_buf[pos],double_size); pos+=double_size;
      memcpy(&atom.efermi,&p2p_buf[pos],double_size); pos+=double_size;
      memcpy(&atom.vdif,&p2p_buf[pos],double_size); pos+=double_size;
      memcpy(&atom.ztotss,&p2p_buf[pos],double_size); pos+=double_size;
      memcpy(&atom.zcorss,&p2p_buf[pos],double_size); pos+=double_size;
      memcpy(atom.evec,&p2p_buf[pos],3*double_size); pos+=3*double_size;
      memcpy(&atom.nspin,&p2p_buf[pos],int_size); pos+=int_size;
      memcpy(&atom.numc,&p2p_buf[pos],int_size); pos+=int_size;

      memcpy(&t,&p2p_buf[pos],int_size); pos+=int_size;

      // Match the sender's potential array size before copying into it.
      if(t!=atom.vr.n_row()) atom.resizePotential(t);

      memcpy(&atom.vr(0,0),&p2p_buf[pos],2*t*double_size); pos+=2*t*double_size;
      memcpy(&atom.rhotot(0,0),&p2p_buf[pos],2*t*double_size); pos+=2*t*double_size;
      memcpy(&atom.corden(0,0),&p2p_buf[pos],2*t*double_size); pos+=2*t*double_size;
      memcpy(&t,&p2p_buf[pos],int_size); pos+=int_size;

      // Match the sender's core array size before copying into it.
      if(t!=atom.nc.n_row()) atom.resizeCore(t);

      memcpy(&atom.ec(0,0),&p2p_buf[pos],2*t*double_size); pos+=2*t*double_size;
      memcpy(&atom.nc(0,0),&p2p_buf[pos],2*t*int_size); pos+=2*t*int_size;
      memcpy(&atom.lc(0,0),&p2p_buf[pos],2*t*int_size); pos+=2*t*int_size;
      memcpy(&atom.kc(0,0),&p2p_buf[pos],2*t*int_size); pos+=2*t*int_size;

      // Clear our own receive flag again for the next transfer.
      shmem_int_add((sync_recv_flag+from),-1,comm.comm.rank);
      shmem_quiet();
    }
  }
}
Beispiel #12
0
/*
 * One exchange step of a parallel quicksort on OpenSHMEM.
 *
 * Usage: <prog> N      (N = total number of elements over all PEs)
 *
 * Every PE fills N/npes random values into A, PE 0 broadcasts A[0] as the
 * shared pivot, each PE sorts its slice and splits it at the index returned
 * by uplowPartition().  PEs in the lower half (me < npes/2) send
 * A[check..n) to their partner me+npes/2; PEs in the upper half send
 * A[0..check) to their partner me-npes/2.  Each PE then appends the part it
 * kept behind the part it received in temp_arr and sorts the result.
 *
 * Relies on file-level symbols defined outside this block: me, npes, N, pe,
 * A, temp_arr, nelems, nelems_import, quicksort() and uplowPartition().
 *
 * Returns 0 on success, 1 on bad arguments or allocation failure.
 */
int main(int argc, char *argv[]){

	int i,j,n,pivot,check,size;
	/*
	 * pSync and the broadcast target must be symmetric objects.  Automatic
	 * (stack) variables are not symmetric, so both get static storage.
	 */
	static long pSync[_SHMEM_BCAST_SYNC_SIZE];
	static int next_pivot;

	/* Size the loop from the array itself so it cannot disagree with the declaration. */
	for (i=0; i < (int)(sizeof(pSync)/sizeof(pSync[0])); i++) {
		pSync[i] = _SHMEM_SYNC_VALUE;
	}

	start_pes(0);
	me = shmem_my_pe();
	npes = shmem_n_pes();
	shmem_barrier_all();
	srand (me+time(NULL));

	/* Every PE sees the same argv, so all of them take this exit together. */
	if (argc < 2) {
		if (me == 0)
			fprintf(stderr, "usage: %s <total-number-of-elements>\n", argv[0]);
		shmem_finalize();
		return 1;
	}
	N = atoi(argv[1]);
	n = N/npes;
	/* With fewer than one element per PE there is no A[0] to use as pivot. */
	if (n < 1) {
		if (me == 0)
			fprintf(stderr, "N must be at least the number of PEs (%d)\n", npes);
		shmem_finalize();
		return 1;
	}

	printf("%d: Size = %d with np=%d\n",me,N,npes);
	A = (int *)shmalloc((size_t)n*sizeof(int));
	/*
	 * temp_arr ends up holding the part received from the partner PE plus
	 * the part kept locally.  Each part is at most n elements (as long as
	 * the split index stays within [0, n]), so reserve 2*n.
	 */
	temp_arr = (int *)shmalloc(2*(size_t)n*sizeof(int));
	if(A==NULL || temp_arr==NULL){
		printf("\nOut of memory");
		return 1;
	}

	for(i=0;i<n;i++){
		A[i] = rand()%(10000-0);
	}
	printf("\nprocess %d elements:",me);
	for(i=0;i<n;i++){
		printf("%d, ", A[i]);
	}

	/*
	 * The broadcast does not write the target on the root PE, so seed the
	 * local copy first; on PE 0 this is already the pivot.
	 */
	next_pivot = A[0];

	//the step two of algo.....broadcast the new pivot
	shmem_broadcast32(&next_pivot,A,1,0,0,0,npes,pSync);
	shmem_barrier_all();
	pivot = quicksort(A, 0, n-1);
	printf("Process %d the pivot:%d",me, pivot);

	shmem_barrier_all(); //just for the sake of clear display...can be removed in the end
	printf("\nThe sorted list is of process %d: ",me);
	for(i=0;i<n;i++){
		printf("%d,  ",A[i]);
	}
	printf("\n");
	printf("the new pivot of process %d: %d\n",me,next_pivot); // to check the broadcast of new pivots

	//to check the division of the sorted arrays according to the new pivot.
	shmem_barrier_all();
	check = uplowPartition(next_pivot);
	shmem_barrier_all();
	printf("(%d)",me);
	for(j=0;j<n;j++){
		printf("%d, ",A[j]);
	}
	printf("new partition: %d",check);
	shmem_barrier_all();

	/* Debug display: the part each PE is about to send away. */
	if(me < npes/2){
		printf("\n");
		i=0;
		for(j=check;j<n;j++){
			temp_arr[i] = A[j];
			i++;
		}
		i=0;
		printf("(%d)",me);
		for(j=check;j<n;j++){
			printf("%d, ",temp_arr[i]);
			i++;
		}
	}

	shmem_barrier_all();
	if(me >= npes/2){
		printf("\n");
		for(j=0;j<check;j++){
			temp_arr[j] = A[j];
		}

		printf("(%d)",me);
		for(j=0;j<check;j++){
			printf("%d, ",temp_arr[j]);
		}
	}
	shmem_barrier_all();
	printf("\n");

	/* Lower half: ship A[check..n) to the partner in the upper half. */
	if(me < npes/2){
		printf("\n");
		pe = me +npes/2;
		nelems[0] = n - check;
		printf (" process %d pe : %d nelems : %d\n",me,pe,nelems[0]);//to test the value

		/* A pointer must be printed with %p, not %d. */
		printf("(%d) addr = %p , value = %d , pe = %d\n ",me, (void *)&nelems_import[0],nelems[0],pe);//to test the value

		shmem_int_p(nelems_import,nelems[0],pe);
		shmem_quiet();
		shmem_int_put(temp_arr,&A[check],nelems[0],pe);
	}

	shmem_barrier_all();//check if the entire barrier is needed
	/* Upper half: ship A[0..check) to the partner in the lower half. */
	if(me >= npes/ 2){
		pe = me-npes/2;//check if it is synced
		nelems[0]= check;
		printf (" process %d pe : %d nelems : %d\n",me,pe,nelems[0]);//to test the value
		shmem_int_p(nelems_import,nelems[0],pe);
		shmem_quiet();
		shmem_int_put(temp_arr,A,nelems[0],pe);
	}

	shmem_barrier_all();//again sync is required...check it with profiling
	//this snippet is to check if the processors have got the high and low lists respectively
	printf("(%d) nelems_import = %d\n",me,nelems_import[0]);//to test the value
	printf("(%d) new elements = ",me);
	for(i=0;i<nelems_import[0];i++){
		printf("%d, ",temp_arr[i]);
	}
	printf("\n");

	//----------------------------------merging of arrays begin-------------------------
	/* Append the locally kept part behind the received part. */
	if(me < npes/2){
		i=0;
		for(j=nelems_import[0];j<(nelems_import[0]+check);j++){
			temp_arr[j] = A[i];
			i++;
		}
		size = nelems_import[0]+check;
	} else {
		i=check;
		for(j=nelems_import[0];j<(nelems_import[0]+n-check);j++){
			temp_arr[j] = A[i];
			i++;
		}
		size = nelems_import[0]+n-check;
	}

	shmem_barrier_all(); //to test if the arrays are merged properly
	printf("(%d) merged array:",me);
	for(j=0;j<size;j++){
		printf("%d, ",temp_arr[j]);
	}
	printf("\n");
	//--------------------------------------------------merging finishes------------------------------

	//-----------------------sort again-----------------------------------------------
	quicksort(temp_arr,0,size-1);

	//sorting routine checked...once program is done we can remove this part-------------
	shmem_barrier_all();//test purpose only
	printf("(%d) sorted list: ",me);
	for(i=0;i<size;i++){
		printf("%d, ",temp_arr[i]);
	}
	printf("\n");

	shfree(temp_arr);
	shfree(A);
	shmem_finalize();
	return 0;
}
/*
 * Size sweep for non-blocking puts between PE pairs.
 *
 * Each even PE fills its source buffers and pushes them to PE my_pe+1 with
 * non-blocking int/long/float/double puts plus one blocking short put, then
 * calls shmem_quiet() and raises a flag on the odd PE.  The odd PE waits for
 * the flag and compares every received element against the value the even PE
 * generated, printing a FAIL line to stderr for each mismatch.
 *
 * The sweep runs MAX_ITER times over the transfer sizes in elements[] that
 * do not exceed max_elements.  An even number of PEs is required.
 *
 * Relies on MAX_SIZE, MAX_ITER and shmalloc_error() defined outside this block.
 *
 * Returns 0; exits with status 1 when run on an odd number of PEs.
 */
int main(int argc, char **argv)
{
  int i,j,iter;
  int my_pe,n_pes;
  int *flag,*one;
  size_t max_elements,max_elements_bytes;
  size_t elements[16] = {1,2,4,8,12,16,24,32,64,128,256,512,1024,2048,4096,8192};
  int num_elements = 16;

  short *srce_short,*targ_short;
  int *srce_int,*targ_int;
  long *srce_long,*targ_long;
  float *srce_float,*targ_float;
  double *srce_double,*targ_double;

  shmem_init();
  my_pe = shmem_my_pe();
  n_pes = shmem_n_pes();
  flag = shmem_malloc((size_t) sizeof(int));
  one  = shmem_malloc((size_t) sizeof(int));
  /* Both are dereferenced below, so an allocation failure must be caught first. */
  if((flag == NULL) || (one == NULL))
    shmalloc_error();
  *one  = 1;

/*  fail if trying to use odd number of processors  */
  if ( (n_pes % 2) != 0 ){
        fprintf(stderr, "FAIL - test requires even number of PEs\n");
        exit(1);
  }

  if(my_pe == 0)
    fprintf(stderr, "shmem_both_put_nb_size(%s)\n", argv[0]);

/*  alloc arrays   */
/*  max_elements is a size_t, so it is printed with %zu  */

  max_elements = (size_t) (MAX_SIZE / sizeof(int));
  max_elements_bytes = (size_t) (sizeof(int)*max_elements);
  if(my_pe == 0)
    fprintf(stderr,"shmem_int_put_nb        max_elements = %zu\n",max_elements);
  srce_int = shmem_malloc(max_elements_bytes);
  targ_int = shmem_malloc(max_elements_bytes);
  if((srce_int == NULL) || (targ_int == NULL))
    shmalloc_error();

  max_elements = (size_t) (MAX_SIZE / sizeof(short));
  max_elements_bytes = (size_t) (sizeof(short)*max_elements);
  if(my_pe == 0)
    fprintf(stderr,"shmem_short_put         max_elements = %zu\n",max_elements);
  srce_short = shmem_malloc(max_elements_bytes);
  targ_short = shmem_malloc(max_elements_bytes);
  if((srce_short == NULL) || (targ_short == NULL))
    shmalloc_error();

  max_elements = (size_t) (MAX_SIZE / sizeof(long));
  max_elements_bytes = (size_t) (sizeof(long)*max_elements);
  if(my_pe == 0)
    fprintf(stderr,"shmem_long_put_nb       max_elements = %zu\n",max_elements);
  srce_long = shmem_malloc(max_elements_bytes);
  targ_long = shmem_malloc(max_elements_bytes);
  if((srce_long == NULL) || (targ_long == NULL))
    shmalloc_error();

  max_elements = (size_t) (MAX_SIZE / sizeof(float));
  max_elements_bytes = (size_t) (sizeof(float)*max_elements);
  if(my_pe == 0)
    fprintf(stderr,"shmem_float_put_nb      max_elements = %zu\n",max_elements);
  srce_float = shmem_malloc(max_elements_bytes);
  targ_float = shmem_malloc(max_elements_bytes);
  if((srce_float == NULL) || (targ_float == NULL))
    shmalloc_error();

  /* Computed last, so this is the max_elements value the sweep below uses. */
  max_elements = (size_t) (MAX_SIZE / sizeof(double));
  max_elements_bytes = (size_t) (sizeof(double)*max_elements);
  if(my_pe == 0)
    fprintf(stderr,"shmem_double_put_nb     max_elements = %zu\n",max_elements);
  srce_double = shmem_malloc(max_elements_bytes);
  targ_double = shmem_malloc(max_elements_bytes);
  if((srce_double == NULL) || (targ_double == NULL))
    shmalloc_error();

  if(my_pe == 0)
    fprintf(stderr,"Actual value used for   max_elements = %zu\n",max_elements);
  /* try the different sizes MAX_ITER times */
  for (iter = 0; iter < MAX_ITER; iter++) {
   for (i = 0; i < num_elements; i++) {
    *flag = 0;
    if (elements[i] <= max_elements) {
     /* Even PEs prepare the data to send; odd PEs pre-fill their targets
        with different (own-PE based) values so a missing put is detected. */
     if ( (my_pe % 2) == 0 ) {
       for(j = 0; j < elements[i]; j++) {
         srce_short[j] = (short)(my_pe+j);
         srce_int[j] = (int)(iter*10000+elements[i]*100+my_pe+j);
         srce_long[j] = (long)(iter*10000+elements[i]*100+my_pe+j);
         srce_float[j] = (float)(iter*10000+elements[i]*100+my_pe+j);
         srce_double[j] = (double)(iter*10000+elements[i]*100+my_pe+j);
       }
     } else {
       for(j = 0; j < elements[i]; j++) {
         targ_short[j] = (short)(my_pe+j);
         targ_int[j] = (int)(iter*10000+elements[i]*100+my_pe+j);
         targ_long[j] = (long)(iter*10000+elements[i]*100+my_pe+j);
         targ_float[j] = (float)(iter*10000+elements[i]*100+my_pe+j);
         targ_double[j] = (double)(iter*10000+elements[i]*100+my_pe+j);
       }
     }
     shmem_barrier_all();
     if ( (my_pe % 2) == 0 ) {
#ifndef OPENSHMEM
       shmemx_int_put_nb(targ_int,srce_int,elements[i],my_pe+1,NULL);
       shmemx_long_put_nb(targ_long,srce_long,elements[i],my_pe+1,NULL);
       shmemx_float_put_nb(targ_float,srce_float,elements[i],my_pe+1,NULL);
       shmemx_double_put_nb(targ_double,srce_double,elements[i],my_pe+1,NULL);
#else
       shmem_int_put_nbi(targ_int,srce_int,elements[i],my_pe+1);
       shmem_long_put_nbi(targ_long,srce_long,elements[i],my_pe+1);
       shmem_float_put_nbi(targ_float,srce_float,elements[i],my_pe+1);
       shmem_double_put_nbi(targ_double,srce_double,elements[i],my_pe+1);
#endif
       /* this one is blocking */
       shmem_short_put(targ_short,srce_short,elements[i],my_pe+1);
       /* Complete all puts above before the flag is raised on the partner. */
       shmem_quiet();
       shmem_int_put(flag,one,(size_t)1,my_pe+1);
     } else {
       shmem_int_wait(flag,0);
       for(j = 0; j < elements[i]; j++) {
         /* Value the sender (PE my_pe-1) stored at index j; size_t because
            elements[] is size_t, hence %zu in the messages below. */
         const size_t expect = iter*10000+elements[i]*100+my_pe+j-1;

         if ( targ_short[j] != (short)(my_pe+j-1) )
           fprintf(stderr,
           "FAIL: PE [%d] iter=%d i=%d targ_short[%d]=%d not equal %d\n",
              my_pe,iter,i,j,targ_short[j],my_pe+j-1);
         if ( targ_int[j] != (int)expect )
           fprintf(stderr,
           "FAIL: PE [%d] iter=%d i=%d targ_int[%d]=%d not equal %zu\n",
              my_pe,iter,i,j,targ_int[j],expect);
         if ( targ_long[j] != (long)expect )
           fprintf(stderr,
           "FAIL: PE [%d] iter=%d i=%d targ_long[%d]=%ld not equal %zu\n",
              my_pe,iter,i,j,targ_long[j],expect);
         if ( targ_float[j] != (float)expect )
           fprintf(stderr,
           "FAIL: PE [%d] iter=%d i=%d targ_float[%d]=%f not equal %zu\n",
              my_pe,iter,i,j,targ_float[j],expect);
         if ( targ_double[j] != (double)expect )
           fprintf(stderr,
           "FAIL: PE [%d] iter=%d i=%d targ_double[%d]=%f not equal %zu\n",
              my_pe,iter,i,j,targ_double[j],expect);
         }
     }
    }
   }
  }
  shmem_free(srce_short);  shmem_free(targ_short);
  shmem_free(srce_int);  shmem_free(targ_int);
  shmem_free(srce_long);  shmem_free(targ_long);
  shmem_free(srce_float);  shmem_free(targ_float);
  shmem_free(srce_double);  shmem_free(targ_double);
  shmem_free(one);  shmem_free(flag);
#ifdef NEEDS_FINALIZE
  shmem_finalize(); 
#endif
  return 0;
}