예제 #1
0
파일: shmem.c 프로젝트: carriercomm/ix
void SendRepeat(ArgStruct *p, int rpt)
{
   *pNrepeat= rpt;

   shmem_int_put(pNrepeat,pNrepeat,1,p->prot.nbor);
   shmem_int_put(p->prot.flag,p->prot.flag,1,p->prot.nbor);
}
예제 #2
0
파일: spam.c 프로젝트: stjohnt/sandia-shmem
void
neighbor_put(int *target, int *src, int elements, int me, int npes, int loops)
{
    int i, neighbor_pe;
	double start_time, elapsed_time;
    long total_bytes = loops * elements * sizeof(*src);

    if (me==0 && Verbose) {
        fprintf(stdout, "%s: %d loops of put(%ld bytes) to neighbor, %d PEs: ",
                __FUNCTION__, loops, (elements*sizeof(*src)), npes);
        fflush(stdout);
    }

	shmem_barrier_all();

    neighbor_pe = (me + 1) % npes;

    start_time = shmemx_wtime();
    for(i = 0; i < loops; i++)
        shmem_int_put(target, src, elements, neighbor_pe);
    elapsed_time = shmemx_wtime() - start_time;

    if (me==0 && Verbose) {
        printf("%7.3f secs\n", elapsed_time);
        printf("  %7.5f usecs / put(), %ld Kbytes @ %7.4f MB/sec\n\n",
                (elapsed_time/((double)loops*npes))*1000000.0,
                (total_bytes/1024),
                ((double)total_bytes/(1024.0*1024.0)) / elapsed_time );
    }
	shmem_barrier_all();
}
예제 #3
0
파일: spam.c 프로젝트: stjohnt/sandia-shmem
void
one2many_put(int *target, int *src, int elements, int me, int npes, int loops)
{
    int i, pe;
	double start_time, elapsed_time;
    long total_bytes = loops * elements * sizeof(*src) * (npes - 1);

	if (me == 0) {
        fprintf(stdout,"%s: %d loops of put(%ld bytes) to %d PEs: ",
                __FUNCTION__, loops, (elements*sizeof(*src)), npes-1);
        fflush(stdout);
    }
	shmem_barrier_all();

	if (me == 0) {
	    start_time = shmemx_wtime();
	    for(i = 0; i < loops; i++) {
	        for(pe = 1; pe < npes; pe++)
	            shmem_int_put(target, src, elements, pe);
	    }
	    elapsed_time = shmemx_wtime() - start_time;

	    if (Verbose) {
            printf("%7.3f secs\n", elapsed_time);
            printf("  %7.5f usecs / put(), %ld Kbytes @ %7.4f MB/sec\n\n",
                (elapsed_time/((double)loops*(npes-1)))*1000000.0,
                (total_bytes/1024),
                ((double)total_bytes/(1024.0*1024.0)) / elapsed_time );
        }
    }
	shmem_barrier_all();
}
예제 #4
0
int
main (int argc, char **argv)
{
    int dest;
    int src;
    int me, npes;

    shmem_init ();

    me = shmem_my_pe ();
    npes = shmem_n_pes ();

    src = 42;

    shmem_barrier_all ();

    if (me == 0) {
        shmem_int_put (&dest, &src, 1, 1);
    }

    shmem_barrier_all ();

    shmem_finalize ();

    return 0;
}
int main(void)
{
  long source[10] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
  int src = 99;
  shmem_init();
  if (shmem_my_pe() == 0) {
    shmem_long_put(dest, source, 10, 1);  /*put1*/
    shmem_long_put(dest, source, 10, 2);  /*put2*/
    shmem_fence();
    shmem_int_put(&targ, &src, 1, 1);  /*put3*/
    shmem_int_put(&targ, &src, 1, 2);  /*put4*/
  }
  shmem_barrier_all();  /* sync sender and receiver */
  printf("dest[0] on PE %d is %ld\n", shmem_my_pe(), dest[0]);
  return 1;
}
예제 #6
0
파일: shmem.c 프로젝트: carriercomm/ix
void SendTime(ArgStruct *p, double *t)
{
   *pTime=*t;

   shmem_double_put(pTime,pTime,1,p->prot.nbor);
   shmem_int_put(p->prot.flag,p->prot.flag,1,p->prot.nbor);
}
예제 #7
0
/****************************************************************************
 * Place for Test Item functions
 ***************************************************************************/
static int test_item1(void)
{
    int rc = TC_PASS;
    int myPe = shmem_my_pe();
	int myPeer = myPe + ( myPe % 2 ? -1 : 1 ) ;
    int nPe = shmem_n_pes();
    int remainderPe = nPe - (nPe % 2);

    static int statArray[ARRAY_SIZE];
    int* dynamicArray = shmalloc( ARRAY_SIZE * sizeof(int) );

    int iterate;
    for (iterate = 0; iterate < ARRAY_SIZE; iterate++)
    {
        if (myPe != remainderPe)
        {
            int tryIterate;
            int putNum, getNum;

            for (tryIterate = 0; tryIterate < TRY_SIZE; tryIterate++)
            {
                putNum = iterate + myPe;
                shmem_int_put(&statArray[iterate], &putNum, 1, myPeer);
                shmem_int_put(&dynamicArray[iterate], &putNum, 1, myPeer);
            }

            shmem_fence();
            shmem_int_get(&getNum, &statArray[iterate], 1, myPeer);
            if (getNum != putNum)
            {
                rc = TC_FAIL;
            }

            shmem_int_get(&getNum, &dynamicArray[iterate], 1, myPeer);
            if (getNum != putNum)
            {
                rc = TC_FAIL;
            }
        }

        shmem_barrier_all();
    }

    shfree(dynamicArray);
    return rc;
}
예제 #8
0
void update_ghostcells ( int **buffer, int height, int width, int fhlimit,int localheight, int heightoffset )
{

    int i, j, k;

    shmem_barrier_all() ;

    if ( size < 2 ) {
        /* Nothing to be done */
        return;
    }

    if ( rank != 0 ) {
        if ( shmem_addr_accessible(&buffer[0][0], rank-1)) {
            if ( (rank - 1 ) == 0 )
            {
                for ( i = 0; i < 2; i++ ) shmem_int_get ( &(buffer[i][0]), &(buffer[i+(localheight-2)][0]), width, rank-1);
            }
            else {
                for ( i = 0; i < 2; i++ ) shmem_int_get ( &(buffer[i][0]), &(buffer[i+(localheight)][0]), width, rank-1);
            }
        }
        else {
            printf("Not_Accessible_Error_01");
        }

        if ( shmem_addr_accessible(&buffer[0][0], rank-1)) {
            if ( (rank - 1 ) == 0 )
            {
                for ( i = 0; i < 2; i++ ) shmem_int_put ( &(buffer[i+(localheight)][0]), &(buffer[i+heightoffset][0]), width, rank-1);
            }
            else {
                for ( i = 0; i < 2; i++ ) shmem_int_put ( &(buffer[i+(localheight)+2][0]), &(buffer[i+heightoffset][0]), width, rank-1);
            }
        }
        else {
            printf("Not_Accessible_Error_02");
        }
    }

    shmem_barrier_all ();
    return;
}
예제 #9
0
파일: isx.c 프로젝트: habanero-rice/hclib
static void my_turn_complete()
{
  const int my_rank = shmem_my_pe();
  const int next_rank = my_rank+1;

  if(my_rank < (NUM_PES-1)){ // Last rank updates no one
    shmem_int_put((int *) &whose_turn, &next_rank, 1, next_rank);
  }
  shmem_barrier_all();
}
예제 #10
0
int osh_basic_tc7(const TE_NODE *node, int argc, const char *argv[])
{
    int rc = TC_PASS;
    int me = _my_pe();
    int num_of_pes = _num_pes();
    //static int test_variable = 0;
    int value_to_set = 1;

    UNREFERENCED_PARAMETER(node);
    UNREFERENCED_PARAMETER(argc);
    UNREFERENCED_PARAMETER(argv);

    //just in case
    shmem_barrier_all();

    if (num_of_pes < 2)
    {
        rc = TC_SETUP_FAIL;
        goto FreeMemory;
    }

    //This test is for 2 ranks only
    if (me > 1)
    {
        goto FreeMemory;
    }

    if (0 == me)
    {
        int wait_time = 0;
        //wait for pe #1 to change my variable
        //and do some important work in the while
        while (!test_variable && wait_time < 5000000)
        {
            usleep(1000);
            wait_time += 1000;
        }

        if (!test_variable)
        {
            rc = TC_FAIL;
        }
    }
    else
    {
        shmem_int_put(&test_variable, &value_to_set, 1, 0);
    }

FreeMemory:
    shmem_barrier_all();

    return rc;
}
예제 #11
0
int
main ()
{
    shmem_init ();

    if (shmem_my_pe () == 0) {
        shmem_int_put (NULL, NULL, 1, 1);
    }

    shmem_barrier_all ();

    shmem_finalize ();

    return 0;
}
예제 #12
0
int
main (int argc, char *argv[])
{
    int *sray, *rray;
    int *sdisp, *scounts, *rdisp, *rcounts;
    int ssize, rsize, i, k, j;
    float z;

    init_it (&argc, &argv);
    scounts = (int *) shmem_malloc (sizeof (int) * numnodes);
    rcounts = (int *) shmem_malloc (sizeof (int) * numnodes);
    sdisp = (int *) shmem_malloc (sizeof (int) * numnodes);
    rdisp = (int *) shmem_malloc (sizeof (int) * numnodes);
    /* 
       ! seed the random number generator with a ! different number on each
       processor */
    seed_random (myid);
    /* find data to send */
    for (i = 0; i < numnodes; i++) {
        random_number (&z);
        scounts[i] = (int) (10.0 * z) + 1;
    }
    printf ("-------myid= %d scounts=", myid);
    for (i = 0; i < numnodes; i++) {
        printf ("%d ", scounts[i]);
    }
    printf ("\n");

    /* send the data */
    // mpi_err = MPI_Alltoall(scounts,1,MPI_INT, rcounts,1,MPI_INT,
    // MPI_COMM_WORLD);
    shmem_barrier_all ();
    int other, j1;
    for (j1 = 0; j1 < numnodes; j1++) {
        shmem_int_put (&rcounts[myid], &scounts[j1], 1, j1);
    }
    shmem_barrier_all ();
    printf ("myid= %d rcounts=", myid);
    for (i = 0; i < numnodes; i++) {
        printf ("%d ", rcounts[i]);
    }
    printf ("\n");
    shmem_finalize ();
    return 0;
}
예제 #13
0
int
main (int argc, char **argv)
{
  int nextpe;
  int me, npes;
  int src;
  int *dest;

  start_pes (0);
  me = shmem_my_pe ();
  npes = shmem_n_pes ();

  nextpe = (me + 1) % npes;

  src = nextpe;

  dest = (int *) shmalloc (sizeof (*dest)); /* symmetric */
  assert (dest != NULL);

  *dest = -1;
  shmem_barrier_all ();

  shmem_int_put (dest, & src, 1, nextpe);

  shmem_barrier_all ();

  printf ("%4d: got %4d: ", me, *dest);
  if (*dest == me)
    {
      printf ("CORRECT");
    }
  else
    {
      printf ("WRONG, expected %d", me);
    }
  printf ("\n");

  shmem_barrier_all ();

  shfree (dest);

  return 0;
}
예제 #14
0
int hyperquick(int *A, int N, int npes){
	int pivot;
	int i;
	//the step two of algo.....broadcast the new pivot
	
	//pivot = quicksort(A, 0, n-1);
	next_pivot = A[N/2]; //the median	
	//shmem_barrier_all();
	//printf("(%d) N= %d\n",me,N);
	shmem_broadcast32(&next_pivot,&next_pivot,1,0,0,0,npes,pSync);	
	shmem_barrier_all();	
	
	/*printf("Process %d the pivot:%d",me, pivot);
	shmem_barrier_all(); //just for the sake of clear display...can be removed in the end
	printf("\nThe sorted list is of process %d: ",me);
	for(i=0;i<N/npes;i++){
		printf("%d,  ",A[i]);
		}
	printf("\n");*/
	
	printf("the new pivot of process %d: %d\n",me,next_pivot); // to check the broadcast of new pivots
	
	int check,j; //to check the division of the sorted arrays according to the new pivot.
	shmem_barrier_all();
	check = uplowPartition(A, next_pivot, N*npes, npes);
	shmem_barrier_all();	
	printf("(%d)",me);	
	for(int j=0;j<N;j++){
		printf("%d, ",A[j]);
		}
	printf("new partition: %d",check);
	
	shmem_barrier_all();	
	printf("\n");
	
	if(me < npes/2){
		printf("\n");
		pe = me +npes/2;
		nelems[0] = N - check;
		printf (" process %d pe : %d nelems : %d\n",me,pe,nelems[0]);//to test the value

		printf("(%d) addr = %d , value = %d , pe = %d\n ",me, &nelems_import[0],nelems[0],pe);//to test the value

		shmem_int_p(nelems_import,nelems[0],pe);
		shmem_quiet();
		shmem_int_put(temp_arr,&A[check],nelems[0],pe);
	}

	shmem_barrier_all();//check if the entire barrier is needed
	if(me >= npes/ 2){
		
		pe = me-npes/2;//check if it is synced
		nelems[0]= check;
		printf (" process %d pe : %d nelems : %d\n",me,pe,nelems[0]);//to test the value
		shmem_int_p(nelems_import,nelems[0],pe);
		shmem_quiet();
		shmem_int_put(temp_arr,A,nelems[0],pe);
	}
	
	shmem_barrier_all();//again sync is required...check it with profiling
//this snippet is to check if the processors have got the high and low lists respectively	-------------------
		printf("(%d) nelems_import = %d\n",me,nelems_import[0]);//to test the value
                printf("(%d) new elements = ",me);
                for(i=0;i<nelems_import[0];i++){
                        printf("%d, ",temp_arr[i]);
                }
                printf("\n");
//------------------------------------here this checking snippet ends----

//----------------------------------merging of arrays begin-------------------------
	if(me < npes/2){
		i=0;
		for(j=nelems_import[0];j<(nelems_import[0]+check);j++){
		
			temp_arr[j] = A[i];
			i++;
		}

	}
	

	if(me >= npes/2){
		i=check;
		for(j=nelems_import[0];j<(nelems_import[0]+N-check);j++){
		
			temp_arr[j] = A[i];
			i++;
		}

	}

	shmem_barrier_all(); //to test if the arrays are merged properly
	int size;
	if(me < npes/2){	
		size = (nelems_import[0]+check);
		printf("(%d) merged array:",me);
		for(j=0;j<size;j++){
			printf("%d, ",temp_arr[j]);
		}
		printf("\n");
	}
		
	if(me >= npes/2){
		size = (nelems_import[0]+N-check);
		printf("(%d) merged array:",me);
		for(j=0;j<size;j++){
			printf("%d, ",temp_arr[j]);
		}
	printf("\n");
	}
			//-----------------------check of merging finishes--------
//--------------------------------------------------merging finishes------------------------------

//-----------------------sort again-----------------------------------------------	
	if(me < npes/2){
		quicksort(temp_arr,0,(nelems_import[0]+check-1));	
	}
	
	if(me >= npes/2){
		quicksort(temp_arr,0,(nelems_import[0]+N-check-1));
	}
	//sorting routine checked...once program is done we can remove this part-------------
	shmem_barrier_all();//test purpose only
	if(me < npes/2){
		printf("(%d) sorted list: ",me);
		for(i=0;i<size;i++){
			printf("%d, ",temp_arr[i]);
			A[i] = temp_arr[i];
		}
		printf("\n");
	}
	
	
	if(me >= npes/2){
		printf("(%d) sorted list: ",me);
		for(i=0;i<size;i++){
			printf("%d, ",temp_arr[i]);
			A[i] = temp_arr[i];
		}
		printf("\n");
	}
	//-------------------------------------------------------------
//---------------------------------------------------------------------------------

	//hyperquick(A,size,npes/2);
	
}
예제 #15
0
void
shmemi_broadcast32_tree (void *target, const void *source,
                         size_t nlong,
                         int PE_root, int PE_start,
                         int logPE_stride, int PE_size, long *pSync)
{
    int child_l, child_r, parent;
    const int step = 1 << logPE_stride;
    int my_pe = GET_STATE (mype);
    int *target_ptr, *source_ptr;
    int no_children;
    long is_ready, lchild_ready, rchild_ready;


    is_ready = 1;
    lchild_ready = -1;
    rchild_ready = -1;

    shmem_long_wait_until (&pSync[0], SHMEM_CMP_EQ, SHMEM_SYNC_VALUE);
    shmem_long_wait_until (&pSync[1], SHMEM_CMP_EQ, SHMEM_SYNC_VALUE);

    pSync[0] = 0;
    pSync[1] = 0;

    target_ptr = (int *) target;
    source_ptr = (int *) source;

    set_2tree (PE_start, step, PE_size, &parent, &child_l, &child_r, my_pe);
    no_children = 0;
    build_tree (PE_start, step, PE_root, PE_size,
                &parent, &child_l, &child_r, my_pe);
    shmemi_trace (SHMEM_LOG_BROADCAST,
                  "before broadcast, R_child = %d L_child = %d",
                  child_r, child_l);
    /* The actual broadcast */

    if (PE_size > 1) {
        if (my_pe == (PE_start + step * PE_root)) {
            pSync[0] = SHMEM_SYNC_VALUE;

            if (child_l != -1) {
                shmem_long_get (&lchild_ready, (const long *) &pSync[0],
                                1, child_l);
                while (lchild_ready != 0)
                    shmem_long_get (&lchild_ready, &pSync[0], 1, child_l);

                shmem_int_put (target_ptr, source_ptr, nlong, child_l);
                shmem_fence ();
                shmem_long_put (&pSync[0], &is_ready, 1, child_l);
                no_children = 1;
            }
            if (child_r != -1) {
                shmem_long_get (&rchild_ready, &pSync[0], 1, child_r);
                while (rchild_ready != 0)
                    shmem_long_get (&rchild_ready, &pSync[0], 1, child_r);

                shmem_int_put (target_ptr, source_ptr, nlong, child_r);
                shmem_fence ();
                shmem_long_put (&pSync[0], &is_ready, 1, child_r);
                no_children = 2;
            }

            shmem_long_wait_until (&pSync[1], SHMEM_CMP_EQ,
                                   (long) no_children);
            pSync[1] = SHMEM_SYNC_VALUE;

        }
        else {
            shmem_long_wait_until (&pSync[0], SHMEM_CMP_EQ, is_ready);
            pSync[0] = SHMEM_SYNC_VALUE;
            shmemi_trace (SHMEM_LOG_BROADCAST, "inside else");
            memcpy (source_ptr, target_ptr, nlong * sizeof (int));
            if (child_l != -1) {
                shmem_long_get (&lchild_ready, &pSync[0], 1, child_l);
                while (lchild_ready != 0)
                    shmem_long_get (&lchild_ready, &pSync[0], 1, child_l);

                shmem_int_put (target_ptr, source_ptr, nlong, child_l);
                shmem_fence ();
                shmem_long_put (&pSync[0], &is_ready, 1, child_l);
                no_children = 1;
            }
            if (child_r != -1) {
                shmem_long_get (&rchild_ready, &pSync[0], 1, child_r);
                while (rchild_ready != 0)
                    shmem_long_get (&rchild_ready, &pSync[0], 1, child_r);

                shmem_int_put (target_ptr, source_ptr, nlong, child_r);
                shmem_fence ();
                shmem_long_put (&pSync[0], &is_ready, 1, child_r);
                no_children = 2;
            }
            pSync[0] = SHMEM_SYNC_VALUE;

            if (no_children == 0) {
                pSync[1] = SHMEM_SYNC_VALUE;
                /* TO DO: Is check for parents pSync required? */
                shmem_long_inc (&pSync[1], parent);
            }
            else {
                shmem_long_wait_until (&pSync[1], SHMEM_CMP_EQ,
                                       (long) no_children);
                pSync[1] = SHMEM_SYNC_VALUE;
                /* printf("PE %d incrementing child count on PE
                   %d\n",my_pe,parent); */
                shmem_long_inc (&pSync[1], parent);
            }
        }
        shmemi_trace (SHMEM_LOG_BROADCAST, "at the end of bcast32");
        /* shmem_barrier(PE_start, logPE_stride, PE_size, pSync); */
    }
}
int
main (int argc, char **argv)
{
    int i;
    int nextpe;
    int me, npes;
    int success1, success2, success3, success4, success5, success6, success7,
        success8;

    short src1[N];
    int src2[N];
    long src3[N];
    long double src4[N];
    long long src5[N];
    double src6[N];
    float src7[N];
    char *src8;
    short src9;
    int src10;
    long src11;
    double src12;
    float src13;

    short *dest1;
    int *dest2;
    long *dest3;
    long double *dest4;
    long long *dest5;
    double *dest6;
    float *dest7;
    char *dest8;
    short *dest9;
    int *dest10;
    long *dest11;
    double *dest12;
    float *dest13;


    shmem_init ();
    me = shmem_my_pe ();
    npes = shmem_n_pes ();

    if (npes > 1) {

        success1 = 0;
        success2 = 0;
        success3 = 0;
        success4 = 0;
        success5 = 0;
        success6 = 0;
        success7 = 0;
        success8 = 0;
        src8 = (char *) malloc (N * sizeof (char));

        for (i = 0; i < N; i += 1) {
            src1[i] = (short) me;
            src2[i] = me;
            src3[i] = (long) me;
            src4[i] = (long double) me;
            src5[i] = (long long) me;
            src6[i] = (double) me;
            src7[i] = (float) me;
            src8[i] = (char) me;
        }
        src9 = (short) me;
        src10 = me;
        src11 = (long) me;
        src12 = (double) me;
        src13 = (float) me;


        dest1 = (short *) shmem_malloc (N * sizeof (*dest1));
        dest2 = (int *) shmem_malloc (N * sizeof (*dest2));
        dest3 = (long *) shmem_malloc (N * sizeof (*dest3));
        dest4 = (long double *) shmem_malloc (N * sizeof (*dest4));
        dest5 = (long long *) shmem_malloc (N * sizeof (*dest5));
        dest6 = (double *) shmem_malloc (N * sizeof (*dest6));
        dest7 = (float *) shmem_malloc (N * sizeof (*dest7));
        dest8 = (char *) shmem_malloc (4 * sizeof (*dest8));
        dest9 = (short *) shmem_malloc (sizeof (*dest9));
        dest10 = (int *) shmem_malloc (sizeof (*dest10));
        dest11 = (long *) shmem_malloc (sizeof (*dest11));
        dest12 = (double *) shmem_malloc (sizeof (*dest12));
        dest13 = (float *) shmem_malloc (sizeof (*dest13));

        for (i = 0; i < N; i += 1) {
            dest1[i] = -9;
            dest2[i] = -9;
            dest3[i] = -9;
            dest4[i] = -9;
            dest5[i] = -9;
            dest6[i] = -9;
            dest7[i] = -9.0;
            dest8[i] = -9;
        }
        *dest9 = -9;
        *dest10 = -9;
        *dest11 = -9;
        *dest12 = -9;
        *dest13 = -9.0;

        nextpe = (me + 1) % npes;

        /* Testing shmem_short_put, shmem_int_put, shmem_long_put,
           shmem_longdouble_put, shmem_longlong_put, shmem_double_put,
           shmem_float_put, shmem_putmem */
        shmem_barrier_all ();

        shmem_short_put (dest1, src1, N, nextpe);
        shmem_int_put (dest2, src2, N, nextpe);
        shmem_long_put (dest3, src3, N, nextpe);
        shmem_longdouble_put (dest4, src4, N, nextpe);
        shmem_longlong_put (dest5, src5, N, nextpe);
        shmem_double_put (dest6, src6, N, nextpe);
        shmem_float_put (dest7, src7, N, nextpe);
        shmem_putmem (dest8, src8, N * sizeof (char), nextpe);

        shmem_barrier_all ();

        if (me == 0) {
            for (i = 0; i < N; i += 1) {
                if (dest1[i] != (npes - 1)) {
                    success1 = 1;
                }
                if (dest2[i] != (npes - 1)) {
                    success2 = 1;
                }
                if (dest3[i] != (npes - 1)) {
                    success3 = 1;
                }
                if (dest4[i] != (npes - 1)) {
                    success4 = 1;
                }
                if (dest5[i] != (npes - 1)) {
                    success5 = 1;
                }
                if (dest6[i] != (npes - 1)) {
                    success6 = 1;
                }
                if (dest7[i] != (npes - 1)) {
                    success7 = 1;
                }
                if (dest8[i] != (npes - 1)) {
                    success8 = 1;
                }
            }

            if (success1 == 0)
                printf ("Test shmem_short_put: Passed\n");
            else
                printf ("Test shmem_short_put: Failed\n");
            if (success2 == 0)
                printf ("Test shmem_int_put: Passed\n");
            else
                printf ("Test shmem_int_put: Failed\n");
            if (success3 == 0)
                printf ("Test shmem_long_put: Passed\n");
            else
                printf ("Test shmem_long_put: Failed\n");
            if (success4 == 0)
                printf ("Test shmem_longdouble_put: Passed\n");
            else
                printf ("Test shmem_longdouble_put: Failed\n");
            if (success5 == 0)
                printf ("Test shmem_longlong_put: Passed\n");
            else
                printf ("Test shmem_longlong_put: Failed\n");
            if (success6 == 0)
                printf ("Test shmem_double_put: Passed\n");
            else
                printf ("Test shmem_double_put: Failed\n");
            if (success7 == 0)
                printf ("Test shmem_float_put: Passed\n");
            else
                printf ("Test shmem_float_put: Failed\n");
            if (success8 == 0)
                printf ("Test shmem_putmem: Passed\n");
            else
                printf ("Test shmem_putmem: Failed\n");

        }
        shmem_barrier_all ();

        /* Testing shmem_put32, shmem_put64, shmem_put128 */
        if (sizeof (int) == 4) {
            for (i = 0; i < N; i += 1) {
                dest2[i] = -9;
                dest3[i] = -9;
                dest4[i] = -9;
            }
            success2 = 0;
            success3 = 0;
            success4 = 0;

            shmem_barrier_all ();

            shmem_put32 (dest2, src2, N, nextpe);
            shmem_put64 (dest3, src3, N, nextpe);
            shmem_put128 (dest4, src4, N, nextpe);

            shmem_barrier_all ();

            if (me == 0) {
                for (i = 0; i < N; i += 1) {
                    if (dest2[i] != (npes - 1)) {
                        success2 = 1;
                    }
                    if (dest3[i] != (npes - 1)) {
                        success3 = 1;
                    }
                    if (dest4[i] != (npes - 1)) {
                        success4 = 1;
                    }
                }
                if (success2 == 0)
                    printf ("Test shmem_put32: Passed\n");
                else
                    printf ("Test shmem_put32: Failed\n");

                if (success3 == 0)
                    printf ("Test shmem_put64: Passed\n");
                else
                    printf ("Test shmem_put64: Failed\n");

                if (success4 == 0)
                    printf ("Test shmem_put128: Passed\n");
                else
                    printf ("Test shmem_put128: Failed\n");
            }
        }
        else if (sizeof (int) == 8) {
            for (i = 0; i < N; i += 1) {
                dest1[i] = -9;
                dest2[i] = -9;
                dest3[i] = -9;
            }
            success1 = 0;
            success2 = 0;
            success3 = 0;

            shmem_barrier_all ();

            shmem_put32 (dest1, src1, N, nextpe);
            shmem_put64 (dest2, src2, N, nextpe);
            shmem_put128 (dest3, src3, N, nextpe);

            shmem_barrier_all ();

            if (me == 0) {
                for (i = 0; i < N; i += 1) {
                    if (dest1[i] != (npes - 1)) {
                        success1 = 1;
                    }
                    if (dest2[i] != (npes - 1)) {
                        success2 = 1;
                    }
                    if (dest3[i] != (npes - 1)) {
                        success3 = 1;
                    }

                }
                if (success1 == 0)
                    printf ("Test shmem_put32: Passed\n");
                else
                    printf ("Test shmem_put32: Failed\n");
                if (success2 == 0)
                    printf ("Test shmem_put64: Passed\n");
                else
                    printf ("Test shmem_put64: Failed\n");

                if (success3 == 0)
                    printf ("Test shmem_put128: Passed\n");
                else
                    printf ("Test shmem_put128: Failed\n");
            }
        }

        /* Testing shmem_iput32, shmem_iput64, shmem_iput128 */
        shmem_barrier_all ();
        if (sizeof (int) == 4) {
            for (i = 0; i < N; i += 1) {
                dest2[i] = -9;
                dest3[i] = -9;
                dest4[i] = -9;
            }
            success2 = 0;
            success3 = 0;
            success4 = 0;

            shmem_barrier_all ();

            shmem_iput32 (dest2, src2, 1, 2, N, nextpe);
            shmem_iput64 (dest3, src3, 1, 2, N, nextpe);
            shmem_iput128 (dest4, src4, 1, 2, N, nextpe);

            shmem_barrier_all ();

            if (me == 0) {
                for (i = 0; i < N / 2; i += 1) {
                    if (dest2[i] != (npes - 1)) {
                        success2 = 1;
                    }
                    if (dest3[i] != (npes - 1)) {
                        success3 = 1;
                    }
                    if (dest4[i] != (npes - 1)) {
                        success4 = 1;
                    }
                }
                if (success2 == 0)
                    printf ("Test shmem_iput32: Passed\n");
                else
                    printf ("Test shmem_iput32: Failed\n");

                if (success3 == 0)
                    printf ("Test shmem_iput64: Passed\n");
                else
                    printf ("Test shmem_iput64: Failed\n");

                if (success4 == 0)
                    printf ("Test shmem_iput128: Passed\n");
                else
                    printf ("Test shmem_iput128: Failed\n");
            }
        }
        else if (sizeof (int) == 8) {
            for (i = 0; i < N; i += 1) {
                dest1[i] = -9;
                dest2[i] = -9;
                dest3[i] = -9;
            }
            success1 = 0;
            success2 = 0;
            success3 = 0;

            shmem_barrier_all ();

            shmem_iput32 (dest1, src1, 1, 2, N, nextpe);
            shmem_iput64 (dest2, src2, 1, 2, N, nextpe);
            shmem_iput128 (dest3, src3, 1, 2, N, nextpe);

            shmem_barrier_all ();

            if (me == 0) {
                for (i = 0; i < N / 2; i += 1) {
                    if (dest1[i] != (npes - 1)) {
                        success1 = 1;
                    }
                    if (dest2[i] != (npes - 1)) {
                        success2 = 1;
                    }
                    if (dest3[i] != (npes - 1)) {
                        success3 = 1;
                    }

                }
                if (success1 == 0)
                    printf ("Test shmem_iput32: Passed\n");
                else
                    printf ("Test shmem_iput32: Failed\n");
                if (success2 == 0)
                    printf ("Test shmem_iput64: Passed\n");
                else
                    printf ("Test shmem_iput64: Failed\n");

                if (success3 == 0)
                    printf ("Test shmem_iput128: Passed\n");
                else
                    printf ("Test shmem_iput128: Failed\n");
            }
        }

        /* Testing shmem_short_iput, shmem_int_iput, shmem_long_iput,
           shmem_double_iput, shmem_float_iput */
        for (i = 0; i < N; i += 1) {
            dest1[i] = -9;
            dest2[i] = -9;
            dest3[i] = -9;
            dest6[i] = -9;
            dest7[i] = -9;
        }
        success1 = 0;
        success2 = 0;
        success3 = 0;
        success6 = 0;
        success7 = 0;

        shmem_barrier_all ();

        shmem_short_iput (dest1, src1, 1, 2, N, nextpe);
        shmem_int_iput (dest2, src2, 1, 2, N, nextpe);
        shmem_long_iput (dest3, src3, 1, 2, N, nextpe);
        shmem_double_iput (dest6, src6, 1, 2, N, nextpe);
        shmem_float_iput (dest7, src7, 1, 2, N, nextpe);

        shmem_barrier_all ();

        if (me == 0) {
            for (i = 0; i < N / 2; i += 1) {
                if (dest1[i] != (npes - 1)) {
                    success1 = 1;
                }
                if (dest2[i] != (npes - 1)) {
                    success2 = 1;
                }
                if (dest3[i] != (npes - 1)) {
                    success3 = 1;
                }
                if (dest6[i] != (npes - 1)) {
                    success6 = 1;
                }
                if (dest7[i] != (npes - 1)) {
                    success7 = 1;
                }
            }

            if (success1 == 0)
                printf ("Test shmem_short_iput: Passed\n");
            else
                printf ("Test shmem_short_iput: Failed\n");
            if (success2 == 0)
                printf ("Test shmem_int_iput: Passed\n");
            else
                printf ("Test shmem_int_iput: Failed\n");
            if (success3 == 0)
                printf ("Test shmem_long_iput: Passed\n");
            else
                printf ("Test shmem_long_iput: Failed\n");
            if (success6 == 0)
                printf ("Test shmem_double_iput: Passed\n");
            else
                printf ("Test shmem_double_iput: Failed\n");
            if (success7 == 0)
                printf ("Test shmem_float_iput: Passed\n");
            else
                printf ("Test shmem_float_iput: Failed\n");

        }


        /* Testing shmem_double_p, shmem_float_p, shmem_int_p, shmem_long_p,
           shmem_short_p */
        shmem_barrier_all ();

        shmem_short_p (dest9, src9, nextpe);
        shmem_int_p (dest10, src10, nextpe);
        shmem_long_p (dest11, src11, nextpe);
        shmem_double_p (dest12, src12, nextpe);
        shmem_float_p (dest13, src13, nextpe);

        shmem_barrier_all ();

        if (me == 0) {
            if (*dest9 == (npes - 1))
                printf ("Test shmem_short_p: Passed\n");
            else
                printf ("Test shmem_short_p: Failed\n");
            if (*dest10 == (npes - 1))
                printf ("Test shmem_int_p: Passed\n");
            else
                printf ("Test shmem_int_p: Failed\n");
            if (*dest11 == (npes - 1))
                printf ("Test shmem_long_p: Passed\n");
            else
                printf ("Test shmem_long_p: Failed\n");
            if (*dest12 == (npes - 1))
                printf ("Test shmem_double_p: Passed\n");
            else
                printf ("Test shmem_double_p: Failed\n");
            if (*dest13 == (npes - 1))
                printf ("Test shmem_float_p: Passed\n");
            else
                printf ("Test shmem_float_p: Failed\n");


        }

        shmem_barrier_all ();

        shmem_free (dest1);
        shmem_free (dest2);
        shmem_free (dest3);
        shmem_free (dest4);
        shmem_free (dest5);
        shmem_free (dest6);
        shmem_free (dest7);
        shmem_free (dest8);
        shmem_free (dest9);
        shmem_free (dest10);
        shmem_free (dest11);
        shmem_free (dest12);
        shmem_free (dest13);

    }
    else {
        printf ("Number of PEs must be > 1 to test shmem put, test skipped\n");
    }

    shmem_finalize ();

    return 0;
}
예제 #17
0
int
main (int argc, char *argv[])
{
    int *sray, *rray;
    int *sdisp, *scounts, *rdisp, *rcounts;
    int ssize, rsize, i, k, j;
    float z;

    init_it (&argc, &argv);
    scounts = (int *) shmem_malloc (sizeof (int) * numnodes);
    rcounts = (int *) shmem_malloc (sizeof (int) * numnodes);
    sdisp = (int *) shmem_malloc (sizeof (int) * numnodes);
    rdisp = (int *) shmem_malloc (sizeof (int) * numnodes);
    /* 
       ! seed the random number generator with a ! different number on each
       processor */
    seed_random (myid);
    /* find out how much data to send */
    for (i = 0; i < numnodes; i++) {
        random_number (&z);
        scounts[i] = (int) (5.0 * z) + 1;
    }
    printf ("myid= %d scounts=%d %d %d %d\n", myid, scounts[0], scounts[1],
            scounts[2], scounts[3]);
    /* tell the other processors how much data is coming */
    // mpi_err = MPI_Alltoall(scounts,1,MPI_INT, rcounts,1,MPI_INT,
    // MPI_COMM_WORLD);
    shmem_barrier_all ();
    int other, j1;
    for (j1 = 0; j1 < numnodes; j1++) {
        shmem_int_put (&rcounts[myid], &scounts[j1], 1, j1);
    }
    shmem_barrier_all ();
    printf ("-----myid= %d rcounts=", myid);
    for (i = 0; i < numnodes; i++) {
        printf ("%d ", rcounts[i]);
    }
    printf ("\n");


    /* write(*,*)"myid= ",myid," rcounts= ",rcounts */
    /* calculate displacements and the size of the arrays */
    sdisp[0] = 0;
    for (i = 1; i < numnodes; i++) {
        sdisp[i] = scounts[i - 1] + sdisp[i - 1];
    }
    rdisp[0] = 0;
    for (i = 1; i < numnodes; i++) {
        rdisp[i] = rcounts[i - 1] + rdisp[i - 1];
    }
    ssize = 0;
    rsize = 0;
    for (i = 0; i < numnodes; i++) {
        ssize = ssize + scounts[i];
        rsize = rsize + rcounts[i];
    }

    /* allocate send and rec arrays */
    sray = (int *) shmem_malloc (sizeof (int) * 20);
    rray = (int *) shmem_malloc (sizeof (int) * 20);
    for (i = 0; i < ssize; i++) {
        sray[i] = myid;
    }
    /* send/rec different amounts of data to/from each processor */
    // mpi_err = MPI_Alltoallv(sray,scounts,sdisp,MPI_INT,
    // rray,rcounts,rdisp,MPI_INT, MPI_COMM_WORLD);
    shmem_barrier_all ();
    for (j1 = 0; j1 < numnodes; j1++) {
        int k1 = sdisp[j1];
        static int k2;
        shmem_int_get (&k2, &rdisp[myid], 1, j1);
        shmem_int_put (rray + k2, sray + k1, scounts[j1], j1);
    }
    shmem_barrier_all ();

    printf ("myid= %d rray=", myid);
    for (i = 0; i < rsize; i++) {
        printf ("%d ", rray[i]);
    }
    printf ("\n");
    // mpi_err = MPI_Finalize();
    shmem_finalize ();
    return 0;
}
예제 #18
0
/* main function */
int main(int argc, char **argv)
{
    int localheight=0;
    int realheight=0;
    int height, width,i, j, k, heightoffset;
    int **labels=NULL;
    FILE *fpa;

    /* initializing shmem values */
    start_pes (0);
    size = _num_pes ();
    rank = _my_pe ();

    /* determining inputs */
    if (argc != 5 ) {
        printf("usage: %s <inputfile> <outputfile> <height> <width>\n", argv[0]);
        exit(0);
    }
    height = atoi ( argv[3]);
    width  = atoi ( argv[4]);

    /* creating symmetric memory */
    labels = (int **) shmalloc (height*sizeof(int *));
    for ( i = 0; i < height; i++ )
    {
        labels[i] = (int*) shmalloc (width*sizeof(int*));
    }

    allocate_2D_int_matrix ( &labels, height, width );

    /* reading from the input file */
    if ( rank == 0 )
    {
        fpa = fopen ( argv[1], "rb");
        fread ( &(labels[0][0]), sizeof(int), height*width, fpa);
        fclose ( fpa );
    }

    /* datapartitioning between the processes */
    localheight = height/size;
    realheight  = localheight + 4 ;
    heightoffset = 2;
    if ( rank == 0 ) {
        realheight -= 2 ;
        heightoffset = 0;
    }
    if ( rank == (size -1) )  {
        realheight -= 2;
    }

    shmem_barrier_all ();
    /* distributing input datum to all the symmetric variables */
    for ( i=1; i<size; i++ ) {
        if ( rank == i )
            if ( shmem_addr_accessible(&labels[0][0], 0)) {
                if ( rank == ( size -1 ) ) {
                    k = 0;
                }
                else {
                    k = 2;
                }
                for ( j = 0; j < localheight+k; j++ ) {
                    shmem_int_get(&labels[j+heightoffset][0], &labels[j+(localheight*rank)][0], width, 0);
                }
            }
            else {
                printf("Not_accessible_Error_00");
            }
    }

    /* gray erode calculation */
    shmem_barrier_all();
    gettimeofday(&start, NULL);
    gray_erode(labels, height, width, FILTERHEIGHT,FILTERWIDTH, ITERATIONS, SEGMENTS, localheight, heightoffset );
    gettimeofday(&end, NULL);

    /* calculating and displaying time taken */
    if( rank == 0 )
    {
        t = (double) ((end.tv_sec * 1000000 + end.tv_usec) - (start.tv_sec * 1000000 + start.tv_usec)) / 1000000.0;
        printf("Total Time%f \n",t);
    }

    /* transferring datum back to root */
    shmem_barrier_all ();
    for ( k = 0; k < size; k++) {
        if ( rank == k ) {
            for ( i = 0; i < localheight; i++ ) {
                shmem_int_put(&labels[i+(localheight*rank)][0], &labels[i+heightoffset][0], width, 0);
            }
        }
    }

    shmem_barrier_all ();
    /* creating output file */
    if(rank == 0)	{
        fpa = fopen ( argv[2], "wb");
        fwrite( &(labels[0][0]), sizeof(int), height*width, fpa );
        fclose( fpa );
    }

    free_2D_int_matrix ( &labels);
    return 0;
}
예제 #19
0
파일: bigput.c 프로젝트: richardnorth3/SOS
int
main(int argc, char **argv)
{
    int loops=DFLT_LOOPS;
    char *pgm;
    int *Target;
    int *Source;
    int i, me, npes;
    int target_PE;
    long bytes;
    double start_time, *total_time;

    shmem_init();
    me = shmem_my_pe();
    npes = shmem_n_pes();

    if ((pgm=strrchr(argv[0],'/')))
        pgm++;
    else
        pgm = argv[0];

    while ((i = getopt (argc, argv, "hve:l:st")) != EOF) {
        switch (i)
        {
            case 'v':
                Verbose++;
                break;
            case 'e':
                if ((elements = atoi_scaled(optarg)) <= 0) {
                    fprintf(stderr,"ERR: Bad elements count %d\n",elements);
                    shmem_finalize();
                    return 1;
                }
                break;
            case 'l':
                if ((loops = atoi_scaled(optarg)) <= 0) {
                    fprintf(stderr,"ERR: Bad loop count %d\n",loops);
                    shmem_finalize();
                    return 1;
                }
                break;
            case 's':
                Sync++;
                break;
            case 't':
                Track++;
                break;
            case 'h':
                if (me == 0)
                    usage(pgm);
                return 0;
            default:
                if (me == 0) {
                    fprintf(stderr,"%s: unknown switch '-%c'?\n",pgm,i);
                    usage(pgm);
                }
                shmem_finalize();
                return 1;
        }
    }

    for(i=0; i < SHMEM_REDUCE_SYNC_SIZE; i++)
        pSync[i] = SHMEM_SYNC_VALUE;

    target_PE = (me+1) % npes;

    total_time = (double *) shmem_malloc( npes * sizeof(double) );
    if (!total_time) {
        fprintf(stderr,"ERR: bad total_time shmem_malloc(%ld)\n",
                (elements * sizeof(double)));
        shmem_global_exit(1);
    }
    for(i=0; i < npes; i++)
        total_time[i] = -1.0;

    Source = (int *) shmem_malloc( elements * sizeof(*Source) );
    if (!Source) {
        fprintf(stderr,"ERR: bad Source shmem_malloc(%ld)\n",
                (elements * sizeof(*Target)));
        shmem_free(total_time);
        shmem_global_exit(1);
    }

    Target = (int *) shmem_malloc( elements * sizeof(*Target) );
    if (!Target) {
        fprintf(stderr,"ERR: bad Target shmem_malloc(%ld)\n",
                (elements * sizeof(*Target)));
        shmem_free(Source);
        shmem_free(total_time);
        shmem_global_exit(1);
    }

    for (i = 0; i < elements; i++) {
        Target[i] = -90;
        Source[i] = i + 1;
    }

    bytes = loops * sizeof(int) * elements;

    if (Verbose && me==0) {
        fprintf(stderr,
                "%s: INFO - %d loops, put %d (int) elements to PE+1 Max put ??\n",
                pgm, loops, elements);
    }
    shmem_barrier_all();

    for(i=0; i < loops; i++) {

        start_time = shmemx_wtime();

        shmem_int_put(Target, Source, elements, target_PE);

        time_taken += (shmemx_wtime() - start_time);

        if (me==0) {
            if ( Track && i > 0 && ((i % 200) == 0))
                fprintf(stderr,".%d",i);
        }
        if (Sync)
            shmem_barrier_all();
    }

    // collect time per node.
    shmem_double_put( &total_time[me], &time_taken, 1, 0 );
    shmem_double_sum_to_all(&sum_time, &time_taken, 1, 0, 0, npes, pWrk, pSync);

    shmem_barrier_all();

    for (i = 0; i < elements; i++) {
        if (Target[i] != i + 1) {
            printf("%d: Error Target[%d] = %d, expected %d\n",
                   me, i, Target[i], i + 1);
            shmem_global_exit(1);
        }
    }

    if ( Track && me == 0 ) fprintf(stderr,"\n");

    if(Verbose && me == 0) {
        double rate, comp_time;

        if (Verbose > 1)
            fprintf(stdout,"Individule PE times: (seconds)\n");
        for(i=0,comp_time=0.0; i < npes; i++) {
            comp_time += total_time[i];
            if (Verbose > 1)
                fprintf(stdout,"  PE[%d] %8.6f\n",i,total_time[i]);
        }

        sum_time /= (double)npes;
        comp_time /= (double)npes;
        if (sum_time != comp_time)
            printf("%s: computed_time %7.5f != sum_to_all_time %7.5f)\n",
                   pgm, comp_time, sum_time );

        rate = ((double)bytes/(1024.0*1024.0)) / comp_time;
        printf("%s: shmem_int_put() %7.4f MB/sec (bytes %ld secs %7.4f)\n",
               pgm, rate, bytes, sum_time);
    }

    shmem_free(total_time);
    shmem_free(Target);
    shmem_free(Source);

    shmem_finalize();

    return 0;
}
예제 #20
0
/* Performance test for shmem_XX_put (latency and bandwidth) */

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <shmem.h>
#include <sys/time.h>

long double time_taken;

long pSync[_SHMEM_REDUCE_SYNC_SIZE];
long double pWrk[_SHMEM_REDUCE_MIN_WRKDATA_SIZE];

//#define N_ELEMENTS 25600/*Data size chosen to be able to capture time required*/
  int
main(void)
{
  int i,j,k;
  int *target;
  int *source;
  int me, npes;
  int nxtpe;
  struct timeval start, end;
  long double start_time,end_time;

  int N_ELEMENTS = (4194304*2)/sizeof(int);

  start_pes(0);
  me = _my_pe();
  npes = _num_pes();

  for (i = 0; i < _SHMEM_BCAST_SYNC_SIZE; i += 1)
  {
    pSync[i] = _SHMEM_SYNC_VALUE;
  }
  nxtpe = (me+1)%npes;
  source = (int *) shmalloc( N_ELEMENTS * sizeof(*source) );
  target = (int *) shmalloc( N_ELEMENTS * sizeof(*target) );

  if(me == 0)
    printf("Put performance test results:\nSize (Bytes)\t\tTime (Microseconds)\t\tBandwidth (Bytes/Second)\n");

  for (i = 0; i < N_ELEMENTS; i += 1) {
    source[i] = i + 1;
    target[i] = -90;
  }
  shmem_barrier_all();

  /*For int put we take average of all the times realized by a pair of PEs, thus
   * reducing effects of physical location of PEs*/
  for (i=1;i<=N_ELEMENTS;i=i*2)
  {
    time_taken = 0;

    for(j=0;j<10000;j++){
      gettimeofday(&start, NULL);

      start_time = (start.tv_sec * 1000000.0) + start.tv_usec;

      shmem_int_put(target, source, i,nxtpe);

      gettimeofday(&end, NULL);

      end_time = (end.tv_sec * 1000000.0) + end.tv_usec;

      time_taken = time_taken + (end_time - start_time);

    }
    shmem_longdouble_sum_to_all(&time_taken, &time_taken,1, 0, 0, npes, pWrk, pSync);


    if(me == 0){
      time_taken = time_taken/(npes*10000); /*Average time across all PEs for one put*/
      if (i*sizeof(i) < 1048576)
        printf("%ld \t\t\t\t %lf\t\t\t\t %lf\n",i*sizeof(i),
               (double)time_taken,(double)((i*sizeof(i))/(time_taken)));
      else
        printf("%ld \t\t\t %lf\t\t\t\t %lf\n",i*sizeof(i),
               (double)time_taken,(double)((i*sizeof(i))/(time_taken)));

    }

  }
  shmem_barrier_all();

  shfree(target);
  shfree(source);
  return 0;
}
예제 #21
0
void _PERM_IR(_permmap* const pm) {
  const int one = 1;
  int * rindbase;
  int * lindbase;
  int * const restrict lsize = (int *)shmalloc(_PROCESSORS * sizeof(int));
  int * const restrict rsize = (int *)shmalloc(_PROCESSORS * sizeof(int));
  int * restrict * const restrict lind = pm->lind;
  int * restrict * const restrict rind = pm->rind;
  char * restrict * const restrict rptr = pm->rptr;
  int * const restrict rflag = pm->rflag;
  int* addr;
  int i, j;

  for (i = 0; i < _PROCESSORS; i++) {
    lsize[i] = lind[i] ? lind[i][0] : 0;
    rsize[i] = 0;
  }
  shmem_barrier_all();
  for (i = (_INDEX == _PROCESSORS - 1) ? 0 : _INDEX+1;
       i != _INDEX;
       i = (i == _PROCESSORS - 1) ? 0 : i++) {
    if (lsize[i] > 0) {
#ifdef _SHMEM_PERMUTE_DEBUG
      printf("%d sending count to %d\n", _INDEX, i);
  fflush(stdout);
#endif
      shmem_int_put(&(rsize[_INDEX]), &(lsize[i]), 1, i);
    }
  }
  rsize[_INDEX] = lsize[_INDEX];
  shmem_barrier_all();
#ifdef _SHMEM_PERMUTE_DEBUG
  sleep(_PROCESSORS);
#endif
  _PERM_CleanIndices(lsize, rsize, lind, rind, &lindbase, &rindbase);

#ifdef _SHMEM_PERMUTE_DEBUG
  sleep(_INDEX);
  printf("FROM PROCESSOR %d\n", _INDEX);
  printf("  LSIZE = ");
  for (i = 0; i < _PROCESSORS; i++) {
    printf("%d ", lsize[i]);
  }
  printf("\n");
  printf("  RSIZE = ");
  for (i = 0; i < _PROCESSORS; i++) {
    printf("%d ", rsize[i]);
  }
  printf("\n");

  printf("  PROCMAP: size = %d, # elts = %d, encoded = %d :: ", pm->procmap[0], pm->procmap[1], pm->procmap[2]);
   for (j = 3; j < pm->procmap[0]; j++) {
    printf("%d ", pm->procmap[j]);
  }
  printf("\n");
  for (i = 0; i < _PROCESSORS; i++) {
    if (lind[i] != 0) {
      printf("  TO PROCESSOR %d: ", i);
      printf("size = %d, # elts = %d, encoded = %d :: ", lind[i][0], lind[i][1], lind[i][2]);
      for (j = 3; j < lind[i][0]; j++) {
	printf("%d ", lind[i][j]);
      }
      printf("\n");
    }
  }
  printf("\n");
  fflush(stdout);
  sleep(_PROCESSORS-_INDEX);
#endif

  for (i = (_INDEX == _PROCESSORS - 1) ? 0 : _INDEX+1;
       i != _INDEX;
       i = (i == _PROCESSORS - 1) ? 0 : i++) {
    if (rsize[i] > 0) {
#ifdef _SHMEM_PERMUTE_DEBUG
      printf("%d sending rind address to %d\n", _INDEX, i);
  fflush(stdout);
#endif
      rflag[_INDEX] = 0;
      shmem_put((void*)&(rptr[_INDEX]), (void*)&(rind[i]), 1, i);
    }
  }
#ifdef _SHMEM_PERMUTE_DEBUG
  sleep(_PROCESSORS);
#endif
  for (i = (_INDEX == 0) ? _PROCESSORS-1 : _INDEX-1;
       i != _INDEX;
       i = (i == 0) ? _PROCESSORS-1 : i--) {
    if (lsize[i] > 0) {
#ifdef _SHMEM_PERMUTE_DEBUG
      printf("%d waiting for rind address from %d, sending lind\n", _INDEX, i);
  fflush(stdout);
#endif
      shmem_wait((long*)&(rptr[i]), 0);
      addr = (int*)rptr[i];
      rptr[i] = 0;
      shmem_int_put(addr, lind[i], lsize[i], i);
    }
  }
#ifdef _SHMEM_PERMUTE_DEBUG
  sleep(_PROCESSORS);
#endif
  shmem_fence();
  for (i = (_INDEX == 0) ? _PROCESSORS-1 : _INDEX-1;
       i != _INDEX;
       i = (i == 0) ? _PROCESSORS-1 : i--) {
    if (lsize[i] > 0) {
#ifdef _SHMEM_PERMUTE_DEBUG
      printf("IR %d sending one to %d\n", _INDEX, i);
  fflush(stdout);
#endif
      shmem_int_put(&(rflag[_INDEX]), &one, 1, i);
    }
  }
  if (lsize[_INDEX] > 0) {
    memcpy(rind[_INDEX], lind[_INDEX], lsize[_INDEX]*sizeof(int));
  }
  pm->lindbase = lindbase;
  pm->rindbase = rindbase;
  shfree(lsize);
  shfree(rsize);
}
int main(int argc, char **argv)
{
  int i,j,iter;
  int my_pe,n_pes;
  int *flag,*one;
  size_t max_elements,max_elements_bytes;
  size_t elements[16] = {1,2,4,8,12,16,24,32,64,128,256,512,1024,2048,4096,8192};
  int num_elements = 16;

  short *srce_short,*targ_short;
  int *srce_int,*targ_int;
  long *srce_long,*targ_long;
  float *srce_float,*targ_float;
  double *srce_double,*targ_double;

  shmem_init();
  my_pe = shmem_my_pe();
  n_pes = shmem_n_pes();
  flag = shmem_malloc((size_t) sizeof(int));
  one  = shmem_malloc((size_t) sizeof(int));
  *one  = 1;

/*  fail if trying to use odd number of processors  */
  if ( (n_pes % 2) != 0 ){
        fprintf(stderr, "FAIL - test requires even number of PEs\n");
        exit(1);
  }

  if(my_pe == 0)
    fprintf(stderr, "shmem_both_put_nb_size(%s)\n", argv[0]);

/*  alloc arrays   */

  max_elements = (size_t) (MAX_SIZE / sizeof(int));
  max_elements_bytes = (size_t) (sizeof(int)*max_elements);
  if(my_pe == 0)
    fprintf(stderr,"shmem_int_put_nb        max_elements = %d\n",max_elements);
  srce_int = shmem_malloc(max_elements_bytes);
  targ_int = shmem_malloc(max_elements_bytes);
  if((srce_int == NULL) || (targ_int == NULL))
    shmalloc_error();

  max_elements = (size_t) (MAX_SIZE / sizeof(short));
  max_elements_bytes = (size_t) (sizeof(short)*max_elements);
  if(my_pe == 0)
    fprintf(stderr,"shmem_short_put         max_elements = %d\n",max_elements);
  srce_short = shmem_malloc(max_elements_bytes);
  targ_short = shmem_malloc(max_elements_bytes);
  if((srce_short == NULL) || (targ_short == NULL))
    shmalloc_error();

   max_elements = (size_t) (MAX_SIZE / sizeof(long));
  max_elements_bytes = (size_t) (sizeof(long)*max_elements);
  if(my_pe == 0)
    fprintf(stderr,"shmem_long_put_nb       max_elements = %d\n",max_elements);
  srce_long = shmem_malloc(max_elements_bytes);
  targ_long = shmem_malloc(max_elements_bytes);
  if((srce_long == NULL) || (targ_long == NULL))
    shmalloc_error();

  max_elements = (size_t) (MAX_SIZE / sizeof(float));
  max_elements_bytes = (size_t) (sizeof(float)*max_elements);
  if(my_pe == 0)
    fprintf(stderr,"shmem_float_put_nb      max_elements = %d\n",max_elements);
  srce_float = shmem_malloc(max_elements_bytes);
  targ_float = shmem_malloc(max_elements_bytes);
  if((srce_float == NULL) || (targ_float == NULL))
    shmalloc_error();

  max_elements = (size_t) (MAX_SIZE / sizeof(double));
  max_elements_bytes = (size_t) (sizeof(double)*max_elements);
  if(my_pe == 0)
    fprintf(stderr,"shmem_double_put_nb     max_elements = %d\n",max_elements);
  srce_double = shmem_malloc(max_elements_bytes);
  targ_double = shmem_malloc(max_elements_bytes);
  if((srce_double == NULL) || (targ_double == NULL))
    shmalloc_error();

  if(my_pe == 0)
    fprintf(stderr,"Actual value used for   max_elements = %d\n",max_elements);
  /* try the different sizes MAX_ITER times */
  for (iter = 0; iter < MAX_ITER; iter++) {
   for (i = 0; i < num_elements; i++) {
    *flag = 0;
    if (elements[i] <= max_elements) {
     if ( (my_pe % 2) == 0 )
       for(j = 0; j < elements[i]; j++) {
         srce_short[j] = (short)(my_pe+j);
         srce_int[j] = (int)(iter*10000+elements[i]*100+my_pe+j);
         srce_long[j] = (long)(iter*10000+elements[i]*100+my_pe+j);
         srce_float[j] = (float)(iter*10000+elements[i]*100+my_pe+j);
         srce_double[j] = (double)(iter*10000+elements[i]*100+my_pe+j);
       }
     else
       for(j = 0; j < elements[i]; j++) {
         targ_short[j] = (short)(my_pe+j);
         targ_int[j] = (int)(iter*10000+elements[i]*100+my_pe+j);
         targ_long[j] = (long)(iter*10000+elements[i]*100+my_pe+j);
         targ_float[j] = (float)(iter*10000+elements[i]*100+my_pe+j);
         targ_double[j] = (double)(iter*10000+elements[i]*100+my_pe+j);
       }
     shmem_barrier_all();
     if ( (my_pe % 2) == 0 ) {
#ifndef OPENSHMEM
       shmemx_int_put_nb(targ_int,srce_int,elements[i],my_pe+1,NULL);
       shmemx_long_put_nb(targ_long,srce_long,elements[i],my_pe+1,NULL);
       shmemx_float_put_nb(targ_float,srce_float,elements[i],my_pe+1,NULL);
       shmemx_double_put_nb(targ_double,srce_double,elements[i],my_pe+1,NULL);
#else
       shmem_int_put_nbi(targ_int,srce_int,elements[i],my_pe+1);
       shmem_long_put_nbi(targ_long,srce_long,elements[i],my_pe+1);
       shmem_float_put_nbi(targ_float,srce_float,elements[i],my_pe+1);
       shmem_double_put_nbi(targ_double,srce_double,elements[i],my_pe+1);
#endif
       /* this one is blocking */
       shmem_short_put(targ_short,srce_short,elements[i],my_pe+1);
       shmem_quiet();
       shmem_int_put(flag,one,(size_t)1,my_pe+1);
     } else {
       shmem_int_wait(flag,0);
       for(j = 0; j < elements[i]; j++) {
         if ( targ_short[j] != (short)(my_pe+j-1) )
           fprintf(stderr,
           "FAIL: PE [%d] iter=%d i=%d targ_short[%d]=%d not equal %d\n",
              my_pe,iter,i,j,targ_short[j],my_pe+j-1);
         if ( targ_int[j] != (int)(iter*10000+elements[i]*100+my_pe+j-1) )
           fprintf(stderr, 
           "FAIL: PE [%d] iter=%d i=%d targ_int[%d]=%d not equal %d\n",
              my_pe,iter,i,j,targ_int[j],iter*10000+elements[i]*100+my_pe+j-1);
         if ( targ_long[j] != (long)(iter*10000+elements[i]*100+my_pe+j-1) )
           fprintf(stderr,
           "FAIL: PE [%d] iter=%d i=%d targ_long[%d]=%d not equal %d\n",
              my_pe,iter,i,j,targ_long[j],iter*10000+elements[i]*100+my_pe+j-1);
         if ( targ_float[j] != (float)(iter*10000+elements[i]*100+my_pe+j-1) )
           fprintf(stderr,
           "FAIL: PE [%d] iter=%d i=%d targ_long[%d]=%f not equal %d\n",
              my_pe,iter,i,j,targ_float[j],iter*10000+elements[i]*100+my_pe+j-1);
         if ( targ_double[j] != (double)(iter*10000+elements[i]*100+my_pe+j-1) )
           fprintf(stderr,
           "FAIL: PE [%d] iter=%d i=%d targ_double[%d]=%f not equal %d\n",
              my_pe,iter,i,j,targ_double[j],iter*10000+elements[i]*100+my_pe+j-1);
         }
     }
    }
   }
  }
  shmem_free(srce_short);  shmem_free(targ_short);
  shmem_free(srce_int);  shmem_free(targ_int);
  shmem_free(srce_long);  shmem_free(targ_long);
  shmem_free(srce_float);  shmem_free(targ_float);
  shmem_free(srce_double);  shmem_free(targ_double);
#ifdef NEEDS_FINALIZE
  shmem_finalize(); 
#endif
  return 0;
}
int main(int argc, char **argv)
{
  const int ITER_CNT = 100;
  const long int MAX_MSG_SIZE = 1048576;
  int* source_addr;
  int peer;
  long int i=0, buff_size; 
  int j=0;
  long long int start_time, stop_time, res;
  double time;

  shmem_init();

  int pe_id = shmem_my_pe();
  source_addr = (int*) shmem_malloc(MAX_MSG_SIZE);

  if(pe_id == 1) {
      if(shmem_n_pes()!=4)
      	fprintf(stderr,"Num PEs should be ==4");
      printf("#Message Cnt;Time(s);MR(msgs/sec)\n");
  }

  if (pe_id==1)
	  peer = 3;
  else if(pe_id==3)
	  peer = 1;
  get_rtc_res_(&res);

  for (i = 0; i < SHMEM_BARRIER_SYNC_SIZE; i += 1){
          pSync[i] = SHMEM_SYNC_VALUE;
  }

  /* Collective operation: Implicit barrier on return from attach */
  shmem_barrier_all();
  if(pe_id == 1 || pe_id == 3) {

  	for(buff_size=1; buff_size<=MAX_MSG_SIZE; buff_size*=2) {
	    isdone=0;
  	    shmem_barrier(1,1,2,pSync);
  	    get_rtc_(&start_time);
  	    for(j=1;j<=ITER_CNT;j++) {
		    shmem_putmem(source_addr, source_addr, buff_size, peer);
  	            shmem_quiet();
		    shmem_int_put(&isdone, &j, 1, peer);
  	            shmem_quiet();
		    shmem_int_wait(&isdone,j-1);
		    shmem_putmem(source_addr, source_addr, buff_size, peer);
  	            shmem_quiet();
  	    }
  	    shmem_barrier(1,1,2,pSync);
  	    get_rtc_(&stop_time);
  	    time = (stop_time - start_time)*1.0/(double)res/ITER_CNT;
  	    if(pe_id == 1) {
  	   	 printf("%20ld;%20.12f;%20.12f\n", 
  	                buff_size, time, (double)buff_size/time);
  	    }
  	    fflush(stdout);
  	}
  }

  shmem_barrier_all();
  shmem_finalize();

}
예제 #24
0
int
main ()
{
    int me, npes;

    int *dest1;
    float *dest2;
    long *dest3;
    double *dest4;
    long long *dest5;

    int swapped_val1, new_val1;
    float swapped_val2, new_val2;
    long swapped_val3, new_val3;
    double swapped_val4, new_val4;
    long long swapped_val5, new_val5;

    int success = 1;
    int success1_p1;
    int success2_p1;
    int success3_p1;
    int success4_p1;
    int success5_p1;

    shmem_init ();
    me = shmem_my_pe ();
    npes = shmem_n_pes ();

    shmem_barrier_all ();

    /* Checks if there are atleast 2 executing PEs */

    if (npes > 1) {

        dest1 = (int *) shmem_malloc (sizeof (*dest1));
        dest2 = (float *) shmem_malloc (sizeof (*dest2));
        dest3 = (long *) shmem_malloc (sizeof (*dest3));
        dest4 = (double *) shmem_malloc (sizeof (*dest4));
        dest5 = (long long *) shmem_malloc (sizeof (*dest5));

        *dest1 = *dest2 = *dest3 = *dest4 = *dest5 = me;
        new_val1 = new_val2 = new_val3 = new_val4 = new_val5 = me;
        success1_p1 = success1_p2 = success2_p1 = success2_p2 = success3_p1 =
            success3_p2 = success4_p1 = success4_p2 = success5_p1 =
            success5_p2 = -1;

        shmem_barrier_all ();

        swapped_val1 = shmem_int_swap (dest1, new_val1, (me + 1) % npes);
        swapped_val2 = shmem_float_swap (dest2, new_val2, (me + 1) % npes);
        swapped_val3 = shmem_long_swap (dest3, new_val3, (me + 1) % npes);
        swapped_val4 = shmem_double_swap (dest4, new_val4, (me + 1) % npes);
        swapped_val5 = shmem_longlong_swap (dest5, new_val5, (me + 1) % npes);


        /* To validate the working of swap we need to check the value received
           at the PE that initiated the swap as well as the dest PE */

        if (me == 0) {
            if (swapped_val1 == 1) {
                success1_p1 = 1;
            }
            if (swapped_val2 == 1) {
                success2_p1 = 1;
            }
            if (swapped_val3 == 1) {
                success3_p1 = 1;
            }
            if (swapped_val4 == 1) {
                success4_p1 = 1;
            }
            if (swapped_val5 == 1) {
                success5_p1 = 1;
            }
        }

        if (me == 1) {
            if (*dest1 == 0) {
                shmem_int_put (&success1_p2, &success, 1, 0);
            }
            if (*dest2 == 0) {
                shmem_int_put (&success2_p2, &success, 1, 0);
            }
            if (*dest3 == 0) {
                shmem_int_put (&success3_p2, &success, 1, 0);
            }
            if (*dest4 == 0) {
                shmem_int_put (&success4_p2, &success, 1, 0);
            }
            if (*dest5 == 0) {
                shmem_int_put (&success5_p2, &success, 1, 0);
            }
        }

        shmem_barrier_all ();

        if (me == 0) {
            if (success1_p1 && success1_p2) {
                printf ("Test shmem_int_swap: Passed\n");
            }
            else {
                printf ("Test shmem_int_swap: Failed\n");
            }

            if (success2_p1 && success2_p2) {
                printf ("Test shmem_float_swap: Passed\n");
            }
            else {
                printf ("Test shmem_float_swap: Failed\n");
            }

            if (success3_p1 && success3_p2) {
                printf ("Test shmem_long_swap: Passed\n");
            }
            else {
                printf ("Test shmem_long_swap: Failed\n");
            }

            if (success4_p1 && success4_p2) {
                printf ("Test shmem_double_swap: Passed\n");
            }
            else {
                printf ("Test shmem_double_swap: Failed\n");
            }

            if (success5_p1 && success5_p2) {
                printf ("Test shmem_longlong_swap: Passed\n");
            }
            else {
                printf ("Test shmem_longlong_swap: Failed\n");
            }

        }
        shmem_barrier_all ();



        /* Test conditional swaps shmem_longlong_cswap, shmem_long_cswap,
           shmem_int_cswap, */

        *dest1 = *dest3 = *dest5 = me;
        new_val1 = new_val3 = new_val5 = me;
        success1_p1 = success1_p2 = success3_p1 = success3_p2 = success5_p1 =
            success5_p2 = -1;

        shmem_barrier_all ();

        swapped_val1 = shmem_int_cswap (dest1, me + 1, (long) me, 1);
        swapped_val3 = shmem_long_cswap (dest3, me + 1, (long) me, 1);
        swapped_val5 = shmem_longlong_cswap (dest5, me + 1, (long) me, 1);


        /* To validate the working of conditionalswap we need to check the
           value received at the PE that initiated the conditional swap as
           well as the dest PE */

        if (me == 0) {
            if (swapped_val1 == 1) {
                success1_p1 = 1;
            }

            if (swapped_val3 == 1) {
                success3_p1 = 1;
            }

            if (swapped_val5 == 1) {
                success5_p1 = 1;
            }
        }

        if (me == 1) {
            if (*dest1 == 0) {
                shmem_int_put (&success1_p2, &success, 1, 0);
            }

            if (*dest3 == 0) {
                shmem_int_put (&success3_p2, &success, 1, 0);
            }

            if (*dest5 == 0) {
                shmem_int_put (&success5_p2, &success, 1, 0);
            }
        }

        shmem_barrier_all ();

        if (me == 0) {
            if (success1_p1 && success1_p2) {
                printf ("Test shmem_int_cswap: Passed\n");
            }
            else {
                printf ("Test shmem_int_cswap: Failed\n");
            }

            if (success3_p1 && success3_p2) {
                printf ("Test shmem_long_cswap: Passed\n");
            }
            else {
                printf ("Test shmem_long_cswap: Failed\n");
            }

            if (success5_p1 && success5_p2) {
                printf ("Test shmem_longlong_cswap: Passed\n");
            }
            else {
                printf ("Test shmem_longlong_cswap: Failed\n");
            }

        }
        shmem_barrier_all ();

        /* Test shmem_long_fadd, shmem_int_fadd, shmem_longlong_fadd */

        *dest1 = *dest3 = *dest5 = me;
        new_val1 = new_val3 = new_val5 = me;
        success1_p1 = success1_p2 = success3_p1 = success3_p2 = success5_p1 =
            success5_p2 = -1;

        shmem_barrier_all ();

        swapped_val1 = shmem_int_fadd (dest1, 1, 0);
        swapped_val3 = shmem_long_fadd (dest3, 1, 0);
        swapped_val5 = shmem_longlong_fadd (dest5, 1, 0);


        /* To validate the working of fetch and add we need to check the old
           value received at the PE that initiated the fetch and increment as
           well as the new value on the dest PE */

        if (me != 0) {
            if (swapped_val1 == 0) {
                success1_p1 = 1;
            }

            if (swapped_val3 == 0) {
                success3_p1 = 1;
            }

            if (swapped_val5 == 0) {
                success5_p1 = 1;
            }
        }

        if (me == 0) {
            if (*dest1 == npes - 1) {
                shmem_int_put (&success1_p2, &success, 1, npes - 1);
            }

            if (*dest3 == npes - 1) {
                shmem_int_put (&success3_p2, &success, 1, npes - 1);
            }

            if (*dest5 == npes - 1) {
                shmem_int_put (&success5_p2, &success, 1, npes - 1);
            }
        }

        shmem_barrier_all ();

        if (me == npes - 1) {
            if (success1_p1 && success1_p2) {
                printf ("Test shmem_int_fadd: Passed\n");
            }
            else {
                printf ("Test shmem_int_fadd: Failed\n");
            }

            if (success3_p1 && success3_p2) {
                printf ("Test shmem_long_fadd: Passed\n");
            }
            else {
                printf ("Test shmem_long_fadd: Failed\n");
            }

            if (success5_p1 && success5_p2) {
                printf ("Test shmem_longlong_fadd: Passed\n");
            }
            else {
                printf ("Test shmem_longlong_fadd: Failed\n");
            }

        }
        shmem_barrier_all ();

        /* Test shmem_long_finc, shmem_int_finc, shmem_longlong_finc */

        *dest1 = *dest3 = *dest5 = me;
        new_val1 = new_val3 = new_val5 = me;
        success1_p1 = success1_p2 = success3_p1 = success3_p2 = success5_p1 =
            success5_p2 = -1;

        shmem_barrier_all ();

        swapped_val1 = shmem_int_finc (dest1, 0);
        swapped_val3 = shmem_long_finc (dest3, 0);
        swapped_val5 = shmem_longlong_finc (dest5, 0);


        /* To validate the working of fetch and increment we need to check the
           old value received at the PE that initiated the fetch and increment
           as well as the new value on the dest PE */

        if (me != 0) {
            if (swapped_val1 == 0) {
                success1_p1 = 1;
            }

            if (swapped_val3 == 0) {
                success3_p1 = 1;
            }

            if (swapped_val5 == 0) {
                success5_p1 = 1;
            }
        }

        if (me == 0) {
            if (*dest1 == npes - 1) {
                shmem_int_put (&success1_p2, &success, 1, npes - 1);
            }

            if (*dest3 == npes - 1) {
                shmem_int_put (&success3_p2, &success, 1, npes - 1);
            }

            if (*dest5 == npes - 1) {
                shmem_int_put (&success5_p2, &success, 1, npes - 1);
            }
        }

        shmem_barrier_all ();

        if (me == npes - 1) {
            if (success1_p1 && success1_p2) {
                printf ("Test shmem_int_finc: Passed\n");
            }
            else {
                printf ("Test shmem_int_finc: Failed\n");
            }

            if (success3_p1 && success3_p2) {
                printf ("Test shmem_long_finc: Passed\n");
            }
            else {
                printf ("Test shmem_long_finc: Failed\n");
            }

            if (success5_p1 && success5_p2) {
                printf ("Test shmem_longlong_finc: Passed\n");
            }
            else {
                printf ("Test shmem_longlong_finc: Failed\n");
            }

        }
        shmem_barrier_all ();

        shmem_free (dest1);
        shmem_free (dest2);
        shmem_free (dest3);
        shmem_free (dest4);
        shmem_free (dest5);

    }
    else {
        printf
            ("Number of PEs must be > 1 to test shmem atomics, test skipped\n");
    }

    shmem_finalize ();

    return 0;
}
예제 #25
0
파일: quick.c 프로젝트: rutayanp/sorting
int main(int argc, char *argv[]){
	
		
	int i,n,next_pivot, pivot;	
	long pSync[_SHMEM_BCAST_SYNC_SIZE];
	
	for (i=0; i < SHMEM_BCAST_SYNC_SIZE; i++) {
 		pSync[i] = _SHMEM_SYNC_VALUE;
		}	
	
	start_pes(0);
	me = shmem_my_pe();
	npes = shmem_n_pes();
	shmem_barrier_all();
	srand (me+time(NULL));

	N = atoi(argv[1]);
	
	//int *nelems = (int*) shmalloc(sizeof(int));

	//int *nelems_import= (int*) shmalloc(sizeof(int));;
	printf("%d: Size = %d with np=%d\n",me,N,npes);
	A = (int *)shmalloc((N/npes)*sizeof(int));
	temp_arr = (int *)shmalloc((N/npes)*sizeof(int));
	if(A==NULL){
		printf("\nOut of memory");
		return 1;
	}
	n= N/npes;
	i=0;
	while(i<N/npes){
		A[i] = rand()%(10000-0);
		i++;
	}
	printf("\nprocess %d elements:",me);
	for(i=0;i<(N/npes);i++){
                printf("%d, ", A[i]);
       		}
	
	next_pivot = A[0];
	
	//the step two of algo.....broadcast the new pivot
	shmem_broadcast32(&next_pivot,A,1,0,0,0,npes,pSync);	
	shmem_barrier_all();	
	pivot = quicksort(A, 0, n-1);
	printf("Process %d the pivot:%d",me, pivot);
	
	shmem_barrier_all(); //just for the sake of clear display...can be removed in the end
	printf("\nThe sorted list is of process %d: ",me);
	for(i=0;i<n;i++){
		printf("%d,  ",A[i]);
		}
	printf("\n");
	printf("the new pivot of process %d: %d\n",me,next_pivot); // to check the broadcast of new pivots
	
	int check,j; //to check the division of the sorted arrays according to the new pivot.
	shmem_barrier_all();
	check = uplowPartition(next_pivot);
	shmem_barrier_all();	
	printf("(%d)",me);	
	for(int j=0;j<N/npes;j++){
		printf("%d, ",A[j]);
		}
	printf("new partition: %d",check);
	shmem_barrier_all();
	if(me < npes/2)
		{
		i=0;
	//	printf("Hello from %d", me);
		printf("\n");
		for(j=check;j<N/npes;j++){
			temp_arr[i] = A[j];
			i++;
			}
		i=0;
		printf("(%d)",me);
		for(j=check;j<N/npes;j++){
                        printf("%d, ",temp_arr[i]) ;
			i++;
                	}
//	printf("\n");
	}
	
	shmem_barrier_all();
	if(me >= npes/2)
		{
		
	//	printf("Hello from %d", me);
		printf("\n");
		for(j=0;j<check;j++){
			temp_arr[j] = A[j];
			}
		
		printf("(%d)",me);
		for(j=0;j<check;j++){
                        printf("%d, ",temp_arr[j]) ;
			
                	}
//	printf("\n");
	}
	shmem_barrier_all();	
	printf("\n");
	
	if(me < npes/2){
		printf("\n");
		pe = me +npes/2;
		nelems[0] = N/npes - check;
		printf (" process %d pe : %d nelems : %d\n",me,pe,nelems[0]);//to test the value

		printf("(%d) addr = %d , value = %d , pe = %d\n ",me, &nelems_import[0],nelems[0],pe);//to test the value

		shmem_int_p(nelems_import,nelems[0],pe);
		shmem_quiet();
		shmem_int_put(temp_arr,&A[check],nelems[0],pe);
	}

	shmem_barrier_all();//check if the entire barrier is needed
	if(me >= npes/ 2){
		
		pe = me-npes/2;//check if it is synced
		nelems[0]= check;
		printf (" process %d pe : %d nelems : %d\n",me,pe,nelems[0]);//to test the value
		shmem_int_p(nelems_import,nelems[0],pe);
		shmem_quiet();
		shmem_int_put(temp_arr,A,nelems[0],pe);
	}
	
	shmem_barrier_all();//again sync is required...check it with profiling
//this snippet is to check if the processors have got the high and low lists respectively	-------------------
		printf("(%d) nelems_import = %d\n",me,nelems_import[0]);//to test the value
                printf("(%d) new elements = ",me);
                for(i=0;i<nelems_import[0];i++){
                        printf("%d, ",temp_arr[i]);
                }
                printf("\n");
//------------------------------------here this checking snippet ends----

//----------------------------------merging of arrays begin-------------------------
	if(me < npes/2){
		i=0;
		for(j=nelems_import[0];j<(nelems_import[0]+check);j++){
		
			temp_arr[j] = A[i];
			i++;
		}

	}
	

	if(me >= npes/2){
		i=check;
		for(j=nelems_import[0];j<(nelems_import[0]+N/npes-check);j++){
		
			temp_arr[j] = A[i];
			i++;
		}

	}

	shmem_barrier_all(); //to test if the arrays are merged properly
	int size;
	if(me < npes/2){	
		size = (nelems_import[0]+check);
		printf("(%d) merged array:",me);
		for(j=0;j<size;j++){
			printf("%d, ",temp_arr[j]);
		}
		printf("\n");
	}
		
	if(me >= npes/2){
		size = (nelems_import[0]+N/npes-check);
		printf("(%d) merged array:",me);
		for(j=0;j<size;j++){
			printf("%d, ",temp_arr[j]);
		}
	printf("\n");
	}
			//-----------------------check of merging finishes--------
//--------------------------------------------------merging finishes------------------------------

//-----------------------sort again-----------------------------------------------	
	if(me < npes/2){
		quicksort(temp_arr,0,(nelems_import[0]+check-1));	
	}
	
	if(me >= npes/2){
		quicksort(temp_arr,0,(nelems_import[0]+N/npes-check-1));
	}
	//sorting routine checked...once program is done we can remove this part-------------
	shmem_barrier_all();//test purpose only
	if(me < npes/2){
		printf("(%d) sorted list: ",me);
		for(i=0;i<size;i++){
			printf("%d, ",temp_arr[i]);
		}
		printf("\n");
	}
	
	
	if(me >= npes/2){
		printf("(%d) sorted list: ",me);
		for(i=0;i<size;i++){
			printf("%d, ",temp_arr[i]);
		}
		printf("\n");
	}
	//-------------------------------------------------------------
//---------------------------------------------------------------------------------
	
shfree(temp_arr);
shfree(A);
shmem_finalize();
}
예제 #26
0
파일: isx.c 프로젝트: habanero-rice/hclib
/*
 * Each PE sends the contents of its local buckets to the PE that owns that bucket.
 */
static KEY_TYPE * exchange_keys(int const * const send_offsets,
                                       int const * const local_bucket_sizes,
                                       KEY_TYPE const * const my_local_bucketed_keys)
{
  timer_start(&timers[TIMER_ATA_KEYS]);

  const int my_rank = shmem_my_pe();
  unsigned int total_keys_sent = 0;

  // Keys destined for local key buffer can be written with memcpy
  const long long int write_offset_into_self = shmem_longlong_fadd(
          &receive_offset, (long long int)local_bucket_sizes[my_rank], my_rank);
  assert((unsigned long long)write_offset_into_self +
          (unsigned long long)local_bucket_sizes[my_rank] <= KEY_BUFFER_SIZE);
  memcpy(&my_bucket_keys[write_offset_into_self], 
         &my_local_bucketed_keys[send_offsets[my_rank]], 
         local_bucket_sizes[my_rank]*sizeof(KEY_TYPE));


  for(uint64_t i = 0; i < NUM_PES; ++i){

#ifdef PERMUTE
    const int target_pe = permute_array[i];
#elif INCAST
    const int target_pe = i;
#else
    const int target_pe = (my_rank + i) % NUM_PES;
#endif

    // Local keys already written with memcpy
    if(target_pe == my_rank){ continue; }

    const int read_offset_from_self = send_offsets[target_pe];
    const int my_send_size = local_bucket_sizes[target_pe];

    const long long int write_offset_into_target = shmem_longlong_fadd(
            &receive_offset, (long long int)my_send_size, target_pe);

#ifdef DEBUG
    printf("Rank: %d Target: %d Offset into target: %lld Offset into myself: %d Send Size: %d\n",
        my_rank, target_pe, write_offset_into_target, read_offset_from_self, my_send_size);
#endif

    // fprintf(stderr, "PUTTING %llu\n", my_send_size);
    assert((unsigned long long)write_offset_into_target +
            (unsigned long long)my_send_size <= KEY_BUFFER_SIZE);
    assert((unsigned long long)read_offset_from_self +
            (unsigned long long)my_send_size <= NUM_KEYS_PER_PE);
    shmem_int_put(&(my_bucket_keys[write_offset_into_target]), 
                  &(my_local_bucketed_keys[read_offset_from_self]), 
                  my_send_size, 
                  target_pe);

    total_keys_sent += my_send_size;
  }

#ifdef BARRIER_ATA
  SHMEM_BARRIER_AT_EXCHANGE;
#endif

  timer_stop(&timers[TIMER_ATA_KEYS]);
  timer_count(&timers[TIMER_ATA_KEYS], total_keys_sent);

#ifdef DEBUG
  wait_my_turn();
  char msg[1024];
  sprintf(msg,"Rank %d: Bucket Size %lld | Total Keys Sent: %u | Keys after exchange:", 
                        my_rank, receive_offset, total_keys_sent);
  for(long long int i = 0; i < receive_offset; ++i){
    if(i < PRINT_MAX)
    sprintf(msg + strlen(msg),"%d ", my_bucket_keys[i]);
  }
  sprintf(msg + strlen(msg),"\n");
  printf("%s",msg);
  fflush(stdout);
  my_turn_complete();
#endif

  return my_bucket_keys;
}
예제 #27
0
int main(int argc, char **argv)
{
  int j;
  int my_pe,n_pes;
  int *flag,*one;
  size_t max_elements,max_elements_bytes;

  char *srce_char,*targ_char;
  short *srce_short,*targ_short;
  int *srce_int,*targ_int;
  long *srce_long,*targ_long;

  shmem_init();
  my_pe = shmem_my_pe();
  n_pes = shmem_n_pes();
  flag = shmem_malloc((size_t) sizeof(int));
  one  = shmem_malloc((size_t) sizeof(int));
  *one  = 1;

/*  fail if trying to use odd number of processors  */
  if ( (n_pes % 2) != 0 ){
        fprintf(stderr, "FAIL - test requires even number of PEs\n");
        exit(1);
  }

  if(my_pe == 0)
    fprintf(stderr, "shmem_num_put_nb(%s)\n", argv[0]);

/*  shmem_putmem_nb test   */
  *flag = 0;
  max_elements = (size_t) (MAX_SIZE / sizeof(char));
  max_elements_bytes = (size_t) (sizeof(char)*max_elements);
  if(my_pe == 0)
    fprintf(stderr,"shmem_putmem_nb         max_elements = %d\n",max_elements);
  srce_char = shmem_malloc(max_elements_bytes);
  targ_char = shmem_malloc(max_elements_bytes);
  if((srce_char == NULL) || (targ_char == NULL))
    shmalloc_error();
  if ( (my_pe % 2) == 0 )
    for(j = 0; j < max_elements; j++) 
      srce_char[j] = (char)(my_pe+j);
  else
    for(j = 0; j < max_elements; j++) 
      targ_char[j] = (char)(my_pe+j);
  shmem_barrier_all();
  if ( (my_pe % 2) == 0 ) {
    shmem_putmem_nb(targ_char,srce_char,max_elements,my_pe+1,NULL);
    shmem_quiet();
    shmem_int_put(flag,one,(size_t)1,my_pe+1);
  } else {
    shmem_int_wait(flag,0);
    for(j = 0; j < max_elements; j++)
      if ( targ_char[j] != (char)(my_pe+j-1) )
        fprintf(stderr, "FAIL: PE [%d] targ_char[%d]=%d my_pe+j-1=%d\n",
                               my_pe,j,targ_char[j],my_pe+j-1);
  }
  shmem_free(srce_char);  shmem_free(targ_char);

/*  shmem_put16_nb test   */
  *flag = 0;
  max_elements = (size_t) (MAX_SIZE / sizeof(short));
  if(max_elements > 20000) max_elements=20000;
  max_elements_bytes = (size_t) (sizeof(short)*max_elements);
  if(my_pe == 0)
    fprintf(stderr,"shmem_put16_nb          max_elements = %d\n",max_elements);
  srce_short = shmem_malloc(max_elements_bytes);
  targ_short = shmem_malloc(max_elements_bytes);
  if((srce_short == NULL) || (targ_short == NULL))
    shmalloc_error();
  if ( (my_pe % 2) == 0 )
    for(j = 0; j < max_elements; j++) 
      srce_short[j] = (short)(my_pe+j);
  else
    for(j = 0; j < max_elements; j++) 
      targ_short[j] = (short)(my_pe+j);
  shmem_barrier_all();
  if ( (my_pe % 2) == 0 ) {
    shmem_put16_nb(targ_short,srce_short,max_elements,my_pe+1,NULL);
    shmem_quiet();
    shmem_int_put(flag,one,(size_t)1,my_pe+1);
  } else {
    shmem_int_wait(flag,0);
    for(j = 0; j < max_elements; j++)
      if ( targ_short[j] != (short)(my_pe+j-1) )
        fprintf(stderr, "FAIL: PE [%d] targ_short[%d]=%d my_pe+j-1=%d\n",
                               my_pe,j,targ_short[j],my_pe+j-1);
  }
  shmem_free(srce_short);  shmem_free(targ_short);

/*  shmem_put32_nb test   */
  *flag = 0;
  max_elements = (size_t) (MAX_SIZE / sizeof(int));
  max_elements_bytes = (size_t) (sizeof(int)*max_elements);
  if(my_pe == 0)
    fprintf(stderr,"shmem_put32_nb          max_elements = %d\n",max_elements);
  srce_int = shmem_malloc(max_elements_bytes);
  targ_int = shmem_malloc(max_elements_bytes);
  if((srce_int == NULL) || (targ_int == NULL))
    shmalloc_error();
  if ( (my_pe % 2) == 0 )
    for(j = 0; j < max_elements; j++)
      srce_int[j] = (int)(my_pe+j);
  else
    for(j = 0; j < max_elements; j++)
      targ_int[j] = (int)(my_pe+j);
  shmem_barrier_all();
  if ( (my_pe % 2) == 0 ) {
    shmem_put32_nb(targ_int,srce_int,max_elements,my_pe+1,NULL);
    shmem_quiet();
    shmem_int_put(flag,one,(size_t)1,my_pe+1);
  } else {
    shmem_int_wait(flag,0);
    for(j = 0; j < max_elements; j++)
      if ( targ_int[j] != (int)(my_pe+j-1) )
	fprintf(stderr, "FAIL: PE [%d] targ_int[%d]=%d my_pe+j-1=%d\n",
                               my_pe,j,targ_int[j],my_pe+j-1);
  }
  shmem_free(srce_int);  shmem_free(targ_int);
  
/*  shmem_put64_nb test   */
  *flag = 0;
  max_elements = (size_t) (MAX_SIZE / sizeof(long));
  max_elements_bytes = (size_t) (sizeof(long)*max_elements);
  if(my_pe == 0)
    fprintf(stderr,"shmem_put64_nb          max_elements = %d\n",max_elements);
  srce_long = shmem_malloc(max_elements_bytes);
  targ_long = shmem_malloc(max_elements_bytes);
  if((srce_long == NULL) || (targ_long == NULL))
    shmalloc_error();
  if ( (my_pe % 2) == 0 )
    for(j = 0; j < max_elements; j++)
      srce_long[j] = (long)(my_pe+j);
  else
    for(j = 0; j < max_elements; j++)
      targ_long[j] = (long)(my_pe+j);
  shmem_barrier_all();
  if ( (my_pe % 2) == 0 ) {
    shmem_put64_nb(targ_long,srce_long,max_elements,my_pe+1,NULL);
    shmem_quiet();
    shmem_int_put(flag,one,(size_t)1,my_pe+1);
  } else {
    shmem_int_wait(flag,0);
    for(j = 0; j < max_elements; j++)
      if ( targ_long[j] != (long)(my_pe+j-1) )
        fprintf(stderr, "FAIL: PE [%d] targ_long[%d]=%d my_pe+j-1=%d\n",
                               my_pe,j,targ_long[j],my_pe+j-1);
  }
  shmem_free(srce_long);  shmem_free(targ_long);

/*  shmem_put128_nb test   */
  *flag = 0;
  max_elements = (size_t) (MAX_SIZE / sizeof(long));
  if ( (max_elements % 2) != 0)
    max_elements = max_elements-1;
  max_elements_bytes = (size_t) (sizeof(long)*max_elements);
  max_elements = max_elements/2;
  if(my_pe == 0)
    fprintf(stderr,"shmem_put128_nb         max_elements = %d\n",max_elements);
  srce_long = shmem_malloc(max_elements_bytes);
  targ_long = shmem_malloc(max_elements_bytes);
  if((srce_long == NULL) || (targ_long == NULL))
    shmalloc_error();
  if ( (my_pe % 2) == 0 )
    for(j = 0; j < 2*max_elements; j++)
      srce_long[j] = (long)(my_pe+j);
  else
    for(j = 0; j < 2*max_elements; j++)
      targ_long[j] = (long)(my_pe+j);
  shmem_barrier_all();
  if ( (my_pe % 2) == 0 ) {
    shmem_put128_nb(targ_long,srce_long,max_elements,my_pe+1,NULL);
    shmem_quiet();
    shmem_int_put(flag,one,(size_t)1,my_pe+1);
  } else {
    shmem_int_wait(flag,0);
    for(j = 0; j < 2*max_elements; j++)
      if ( targ_long[j] != (long)(my_pe+j-1) )
        fprintf(stderr, "FAIL: PE [%d] targ_long[%d]=%d my_pe+j-1=%d\n",
                               my_pe,j,targ_long[j],my_pe+j-1);
  }
  shmem_free(srce_long);  shmem_free(targ_long);

#ifdef SHMEM_C_GENERIC_32

/*  shmem_put_nb (GENERIC 32) test   */
  *flag = 0;
  max_elements = (size_t) (MAX_SIZE / sizeof(int));
  max_elements_bytes = (size_t) (sizeof(int)*max_elements);
  if(my_pe == 0)
    fprintf(stderr,"shmem_put_nb (GENERIC 32)  max_elements = %d\n",max_elements);
  srce_int = shmem_malloc(max_elements_bytes);
  targ_int = shmem_malloc(max_elements_bytes);
  if((srce_int == NULL) || (targ_int == NULL))
    shmalloc_error();
  if ( (my_pe % 2) == 0 )
    for(j = 0; j < max_elements; j++)
      srce_int[j] = (int)(my_pe+j);
  else
    for(j = 0; j < max_elements; j++)
      targ_int[j] = (int)(my_pe+j);
  shmem_barrier_all();
  if ( (my_pe % 2) == 0 ) {
    shmem_put_nb(targ_int,srce_int,max_elements,my_pe+1,NULL);
    shmem_quiet();
    shmem_int_put(flag,one,(size_t)1,my_pe+1);
  } else {
    shmem_int_wait(flag,0);
    for(j = 0; j < max_elements; j++)
      if ( targ_int[j] != (int)(my_pe+j-1) )
        fprintf(stderr, "FAIL: PE [%d] targ_int[%d]=%d my_pe+j-1=%d\n",
                               my_pe,j,targ_int[j],my_pe+j-1);
  }
  shmem_free(srce_int);  shmem_free(targ_int);

#else

/*  shmem_put_nb (GENERIC 64) test   */
  *flag = 0;
  max_elements = (size_t) (MAX_SIZE / sizeof(long));
  max_elements_bytes = (size_t) (sizeof(long)*max_elements);
  if(my_pe == 0)
    fprintf(stderr,"shmem_put_nb (GENERIC 64)  max_elements = %d\n",max_elements);
  srce_long = shmem_malloc(max_elements_bytes);
  targ_long = shmem_malloc(max_elements_bytes);
  if((srce_long == NULL) || (targ_long == NULL))
    shmalloc_error();
  if ( (my_pe % 2) == 0 )
    for(j = 0; j < max_elements; j++)
      srce_long[j] = (long)(my_pe+j);
  else
    for(j = 0; j < max_elements; j++)
      targ_long[j] = (long)(my_pe+j);
  shmem_barrier_all();
  if ( (my_pe % 2) == 0 ) {
    shmem_put_nb(targ_long,srce_long,max_elements,my_pe+1,NULL);
    shmem_quiet();
    shmem_int_put(flag,one,(size_t)1,my_pe+1);
  } else {
    shmem_int_wait(flag,0);
    for(j = 0; j < max_elements; j++)
      if ( targ_long[j] != (long)(my_pe+j-1) )
        fprintf(stderr, "FAIL: PE [%d] targ_long[%d]=%d my_pe+j-1=%d\n",
                               my_pe,j,targ_long[j],my_pe+j-1);
  }
  shmem_free(srce_long);  shmem_free(targ_long);

#endif

#ifdef NEEDS_FINALIZE
  shmem_finalize(); 
#endif
  return 0;
}
예제 #28
0
int
main (int argc, char *argv[])
{
    int *sray, *rray;
    int *sdisp, *scounts, *rdisp, *rcounts, *rcounts_full;
    int ssize, rsize, i, k, j;
    float z;

    init_it (&argc, &argv);
    scounts = (int *) shmem_malloc (sizeof (int) * numnodes);
    rcounts = (int *) shmem_malloc (sizeof (int) * numnodes);
    rcounts_full = (int *) shmem_malloc (sizeof (int) * numnodes * numnodes);
    sdisp = (int *) shmem_malloc (sizeof (int) * numnodes);
    rdisp = (int *) shmem_malloc (sizeof (int) * numnodes);
    /* 
       ! seed the random number generator with a ! different number on each
       processor */
    seed_random (myid);
    /* find out how much data to send */
    for (i = 0; i < numnodes; i++) {
        random_number (&z);
        scounts[i] = (int) (5.0 * z) + 1;
    }
    printf ("myid= %d scounts=%d %d %d %d\n", myid, scounts[0], scounts[1],
            scounts[2], scounts[3]);
    printf ("\n");
    /* tell the other processors how much data is coming */
    // mpi_err = MPI_Alltoall(scounts,1,MPI_INT, rcounts,1,MPI_INT,
    // MPI_COMM_WORLD);
    static long psync[_SHMEM_COLLECT_SYNC_SIZE];
    for (i = 0; i < _SHMEM_COLLECT_SYNC_SIZE; i++)
        psync[i] = _SHMEM_SYNC_VALUE;
    shmem_barrier_all ();
    int other, j1;
    shmem_fcollect32 (rcounts_full, scounts, 4, 0, 0, numnodes, psync);
    for (i = 0; i < numnodes; i++) {
        rcounts[i] = rcounts_full[i * numnodes + myid];
    }
    printf ("-----myid= %d rcounts=", myid);
    for (i = 0; i < numnodes; i++)
        printf ("%d ", rcounts[i]);
    printf ("\n");


    /* write(*,*)"myid= ",myid," rcounts= ",rcounts */
    /* calculate displacements and the size of the arrays */
    sdisp[0] = 0;
    for (i = 1; i < numnodes; i++) {
        sdisp[i] = scounts[i - 1] + sdisp[i - 1];
    }
    rdisp[0] = 0;
    for (i = 1; i < numnodes; i++) {
        rdisp[i] = rcounts[i - 1] + rdisp[i - 1];
    }
    ssize = 0;
    rsize = 0;
    for (i = 0; i < numnodes; i++) {
        ssize = ssize + scounts[i];
        rsize = rsize + rcounts[i];
    }

    /* allocate send and rec arrays */
    sray = (int *) shmem_malloc (sizeof (int) * 20);
    rray = (int *) shmem_malloc (sizeof (int) * 20);
    for (i = 0; i < ssize; i++) {
        sray[i] = myid;
    }
    /* send/rec different amounts of data to/from each processor */
    // mpi_err = MPI_Alltoallv(sray,scounts,sdisp,MPI_INT,
    // rray,rcounts,rdisp,MPI_INT, MPI_COMM_WORLD);
    shmem_barrier_all ();
    for (j1 = 0; j1 < numnodes; j1++) {
        int k1 = sdisp[j1];
        static int k2;
        shmem_int_get (&k2, &rdisp[myid], 1, j1);
        shmem_int_put (rray + k2, sray + k1, scounts[j1], j1);
    }
    shmem_barrier_all ();

    // not possible, coz even though the rcounts[myid] will be different on
    // each PE, the elements collected
    // by PE0 from other PE's will be constant.
    // shmem_collect32(rray_full,sray,rcounts[myid],0,0,numnodes,psync);

    printf ("myid= %d rray=", myid);
    for (i = 0; i < rsize; i++)
        printf ("%d ", rray[i]);
    printf ("\n");
    // mpi_err = MPI_Finalize();
    shmem_finalize ();
    return 0;
}
예제 #29
0
int main(int argc, char *argv[])
{
  int size, rank, world_rank, my_group;
  int num_lsms; // number of parallel LSMS instances
  int size_lsms; // number of atoms in a lsms instance
  int num_steps; // number of energy calculations
  int initial_steps; // number of steps before sampling starts
  int stepCount=0; // count the Monte Carlo steps executed
  double max_time; // maximum walltime for this run in seconds
  bool restrict_time = false;       // was the maximum time specified?
  bool restrict_steps = false; // or the max. numer of steps?
  int align; // alignment of lsms_instances
  
  double magnetization;
  double energy_accumulator; // accumulates the enegy to calculate the mean
  int energies_accumulated;


  int new_peid,new_root;
  static int op,flag;
  double *evec,*r_values;
  evec=(double *)shmalloc(sizeof(double)*3*size_lsms);
  r_values=(double *)shmalloc(sizeof(double)*(R_VALUE_OFFSET+3*(size_lsms+1)));




  energy_accumulator=0.0;
  energies_accumulated=0;

  double walltime_0,walltime;

  double restartWriteFrequency=30.0*60.0;
  double nextWriteTime=restartWriteFrequency;

  MPI_Comm local_comm;
  int *lsms_rank0;
  MPI_Status status;

  char prefix[40];
  char i_lsms_name[64];
  char gWL_in_name[64], gWL_out_name[64];
  char mode_name[64];
  char energy_calculation_name[64];
  char stupid[37];

  char step_out_name[64];
  char wl_step_out_name[128];
  char *wl_stepf=NULL;
  bool step_out_flag=false;
  std::ofstream step_out_file;
  typedef enum {Constant, Random, WangLandau_1d, ExhaustiveIsing, WangLandau_2d} EvecGenerationMode;
  typedef enum {MagneticMoment, MagneticMomentZ, MagneticMomentX, MagneticMomentY} SecondDimension;

  EvecGenerationMode evec_generation_mode = Constant;
  SecondDimension second_dimension = MagneticMoment;
  double ev0[3];

  bool return_moments_flag=true; // true-> return all magnetic moments from lsms run at each step.
  bool generator_needs_moment=false;

  typedef enum {OneStepEnergy, MultiStepEnergy, ScfEnergy} EnergyCalculationMode;
  EnergyCalculationMode energyCalculationMode = OneStepEnergy;
  int energyIndex=1; // index for the return value to use for the MC step (0: total energy, 1: band energy)

  ev0[0]=ev0[1]=0.0; ev0[2]=1.0;
  // size has to be align + size_lsms*num_lsms
  align=1;
  num_lsms=1;
  size_lsms=-1;
  my_group=-1;
  num_steps=1;
  initial_steps=0;

  sprintf(i_lsms_name,"i_lsms");
  gWL_in_name[0]=gWL_out_name[0]=0;
  mode_name[0]=0;
  energy_calculation_name[0]=0;

  // check command line arguments
  for(int i=0; i<argc; i++)
  {
    if(!strcmp("-num_lsms",argv[i])) num_lsms=atoi(argv[++i]);
    if(!strcmp("-size_lsms",argv[i])) size_lsms=atoi(argv[++i]);
    if(!strcmp("-align",argv[i])) align=atoi(argv[++i]);
    if(!strcmp("-num_steps",argv[i])) {num_steps=atoi(argv[++i]); restrict_steps=true;}
    if(!strcmp("-initial_steps",argv[i])) initial_steps=atoi(argv[++i]); 
    if(!strcmp("-walltime",argv[i])) {max_time=60.0*atof(argv[++i]); restrict_time=true;}
    if(!strcmp("-i",argv[i])) strncpy(i_lsms_name,argv[++i],64);
    if(!strcmp("-random_dir",argv[i])) {evec_generation_mode = Random;}
    if(!strcmp("-step_out",argv[i]))
    {strncpy(step_out_name,argv[++i],64); step_out_flag=true;
      return_moments_flag=true;}
    if(!strcmp("-wl_out", argv[i])) strncpy(gWL_out_name,argv[++i],64);
    if(!strcmp("-wl_in", argv[i])) strncpy(gWL_in_name,argv[++i],64);
    if(!strcmp("-mode", argv[i])) strncpy(mode_name,argv[++i],64);
    if(!strcmp("-energy_calculation",argv[i])) strncpy(energy_calculation_name,argv[++i],64);
  }

  if(!(restrict_steps || restrict_time)) restrict_steps=true;

  if(mode_name[0]!=0)
  {
    if(!strcmp("constant",mode_name)) evec_generation_mode = Constant;
    if(!strcmp("random",mode_name)) evec_generation_mode = Random;
    if(!strcmp("1d",mode_name)) evec_generation_mode = WangLandau_1d;
    if(!strcmp("ising",mode_name)) evec_generation_mode = ExhaustiveIsing;
    if(!strcmp("2d",mode_name)) evec_generation_mode = WangLandau_2d;
    if(!strcmp("2d-m",mode_name)) {evec_generation_mode = WangLandau_2d; second_dimension=MagneticMoment;}
    if(!strcmp("2d-x",mode_name)) {evec_generation_mode = WangLandau_2d; second_dimension=MagneticMomentX;}
    if(!strcmp("2d-y",mode_name)) {evec_generation_mode = WangLandau_2d; second_dimension=MagneticMomentY;}
    if(!strcmp("2d-z",mode_name)) {evec_generation_mode = WangLandau_2d; second_dimension=MagneticMomentZ;}
  }

  if(energy_calculation_name[0]!=0)
  {
    if(energy_calculation_name[0]=='o') { energyCalculationMode = OneStepEnergy; energyIndex=1; }
    if(energy_calculation_name[0]=='m') { energyCalculationMode = MultiStepEnergy; energyIndex=1; }
    if(energy_calculation_name[0]=='s') { energyCalculationMode = ScfEnergy; energyIndex=0; }
  }

#ifdef USE_PAPI
#define NUM_PAPI_EVENTS 4
  int hw_counters = PAPI_num_counters();
  if(hw_counters>NUM_PAPI_EVENTS) hw_counters=NUM_PAPI_EVENTS;
  int papi_events[NUM_PAPI_EVENTS]; // = {PAPI_TOT_INS,PAPI_TOT_CYC,PAPI_FP_OPS,PAPI_VEC_INS};
  char *papi_event_name[] = {"PAPI_TOT_INS","PAPI_FP_OPS",
                             "RETIRED_SSE_OPERATIONS:DOUBLE_ADD_SUB_OPS:DOUBLE_MUL_OPS:DOUBLE_DIV_OPS:OP_TYPE",
                             "RETIRED_SSE_OPERATIONS:SINGLE_ADD_SUB_OPS:SINGLE_MUL_OPS:SINGLE_DIV_OPS:OP_TYPE"};
  // "RETIRED_INSTRUCTIONS",
  // "RETIRED_MMX_AND_FP_INSTRUCTIONS:PACKED_SSE_AND_SSE2",
  // "RETIRED_SSE_OPERATIONS:DOUBLE_ADD_SUB_OPS:DOUBLE_MUL_OPS:DOUBLE_DIV_OPS:1",
  // "RETIRED_SSE_OPERATIONS:SINGLE_ADD_SUB_OPS:SINGLE_MUL_OPS:SINGLE_DIV_OPS:1"
  // get events from names:
  for(int i=0; i<NUM_PAPI_EVENTS; i++)
  {
    if(PAPI_event_name_to_code(papi_event_name[i],&papi_events[i]) != PAPI_OK)
    {
      // printline("Error in obtaining PAPI event code for: "+ttos(papi_event_name[i]),
      //           std::cerr,parameters.myrankWorld);
      // printline("Skipping all following events",
      //           std::cerr,parameters.myrankWorld);
      if(hw_counters>i) hw_counters=i;
    }
  }
  long long papi_values[NUM_PAPI_EVENTS+4];
  // printline("PAPI: "+ttos(hw_counters)+" counters available",std::cout,parameters.myrankWorld);
  if(hw_counters>NUM_PAPI_EVENTS) hw_counters=NUM_PAPI_EVENTS;
  long long papi_real_cyc_0 = PAPI_get_real_cyc();
  long long papi_real_usec_0 = PAPI_get_real_usec();
  long long papi_virt_cyc_0 = PAPI_get_virt_cyc();
  long long papi_virt_usec_0 = PAPI_get_virt_usec();
  PAPI_start_counters(papi_events,hw_counters);
#endif


  lsms_rank0=(int *)malloc(sizeof(int)*(num_lsms+1));

  // initialize MPI:
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  world_rank=rank;
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  walltime_0 = get_rtc();

#ifndef SVN_REV
#define SVN_REV "unknown"
#endif

// make sure 'return_moments_flag' is set correctly
  switch(evec_generation_mode)
  {
  case Constant : break;
  case Random : break;
  case WangLandau_1d :
    return_moments_flag = true;
    generator_needs_moment = true;
    break;
  case ExhaustiveIsing : break;
  case WangLandau_2d :
    return_moments_flag = true;
    generator_needs_moment = true;
    break;
  default: std::cout<<" ERROR: UNKNOWN EVEC GENERATION MODE\n"; exit(1);
  }

  if(rank==0)
  {
    std::cout<<"LSMS_3"<<std::endl;
    std::cout<<" SVN revision "<<SVN_REV<<std::endl<<std::endl;
#ifdef USE_PAPI
    std::cout<<" Using Papi counters"<<std::endl<<std::endl; 
#endif
    std::cout<<" Size of LSMS instances = "<<size_lsms<<" atoms\n";
    std::cout<<" Number of LSMS instances = "<<num_lsms<<std::endl;
    std::cout<<" LSMS Energy calculated using ";
    switch(energyCalculationMode)
    {
    case OneStepEnergy: std::cout<<"oneStepEnergy [frozen potential band energy]"<<std::endl; break;
    case MultiStepEnergy: std::cout<<"multiStepEnergy [frozen potential band energy with converged Fermi energy]"<<std::endl; break;
    case ScfEnergy: std::cout<<"scfEnergy [self-consistent total energy]"<<std::endl; break;
    default: std::cout<<"UNKNOWN ENERGY CALCULATION METHOD"<<std::endl; exit(1);
    }
    if(restrict_steps) std::cout<<" Number of gWL steps = "<<num_steps<<std::endl;
    if(restrict_time) std::cout<<" Maximum walltime = "<<max_time<<"s\n";
    std::cout<<" Processor alignment (process allocation quantization) = "<<align<<std::endl;
    switch(evec_generation_mode)
    {
    case Constant : std::cout<<" Constant moments direction along "
                             <<ev0[0]<<" "<<ev0[1]<<" "<<ev0[2]<<std::endl;
      break;
    case Random : std::cout<<" Random distribution of moments (no Wang-Landau)"<<std::endl;
      break;
    case WangLandau_1d : std::cout<<" Wang-Landau for one continuous variable (energy)"<<std::endl;
//      return_moments_flag = true;
//      generator_needs_moment = true;
      break;
    case ExhaustiveIsing : std::cout<<" Exhaustive Ising sampling"<<std::endl; break;
    case WangLandau_2d : std::cout<<" Wang-Landau for two continuous variable (energy, ";
      switch(second_dimension)
      {
      case MagneticMoment  : std::cout<<"magnitude of magnetization)"; break;
      case MagneticMomentX : std::cout<<"x component of magnetization)"; break;
      case MagneticMomentY : std::cout<<"y component of magnetization)"; break;
      case MagneticMomentZ : std::cout<<"z component of magnetization)"; break;
      }
      std::cout<<std::endl;
//      return_moments_flag = true;
//      generator_needs_moment = true;
      break;
    default: std::cout<<" ERROR: UNKNOWN EVEC GENERATION MODE\n"; exit(1);
    }
    if(step_out_flag) std::cout<<" Step output written to: "<<step_out_name<<std::endl;
    std::cout<<std::endl;

    if(step_out_flag && (evec_generation_mode==WangLandau_1d))
    {
      // step_out_flag=false;
      snprintf(wl_step_out_name,127,"wl1d_%s",step_out_name);
      wl_stepf=wl_step_out_name;
    }

    if(step_out_flag)
    {
      step_out_file.open(step_out_name);
      step_out_file<<"#";
      for(int i=0; i<argc; i++) step_out_file<<" "<<argv[i];
      step_out_file<<std::endl<<size_lsms<<std::endl;
    }
  }

  if(generator_needs_moment) return_moments_flag=true;

  if(num_lsms==1)
  {
    SHMEM_activeset local_comm;
    local_comm.rank=shmem_my_pe();
    local_comm.size=shmem_n_pes();
    local_comm.start_pe=0;
    local_comm.logPE_stride=0;
    LSMS lsms_calc(local_comm,i_lsms_name,"1_");
      
    if(rank==0)
    {
      std::cout<<"executing LSMS(C++) for "<<lsms_calc.numSpins()<<" atoms\n";
      std::cout<<"  LSMS version = "<<lsms_calc.version()<<std::endl;
    }

    if(energyCalculationMode==OneStepEnergy)
      std::cout<<"one step Energy = "<<lsms_calc.oneStepEnergy()<<std::endl;
    else if(energyCalculationMode==MultiStepEnergy)
      std::cout<<"multi-step Energy = "<<lsms_calc.multiStepEnergy()<<std::endl;
    else if(energyCalculationMode==ScfEnergy)
      std::cout<<"self-consistent Energy = "<<lsms_calc.scfEnergy()<<std::endl;
    else
    {
      printf("ERROR: Unknown energy calculation mode for lsms_calc in wl-lsms main!\n");
     // MPI_Abort(MPI_COMM_WORLD,5);
      exit(5);
    }
  }
  else
  {
    // build the communicators
    //int color=MPI_UNDEFINED;
    //Assuming user passes a power of two while using "-align"
    int s = align;
    int comm_size=(size-align)/num_lsms;
    int world_rank;
    for(int i=0; i<num_lsms; i++)
    {
      if((world_rank>=s) && (world_rank<s+comm_size)) 
      { 
        my_group=i; 
        //color=i; 
        new_peid=world_rank-s;
        new_root=s;
      }
      lsms_rank0[i]=s;
      s+=comm_size;
    }
    if(world_rank==0){ 
      //color=num_lsms;
      new_peid=0;
      comm_size=1;
      new_root=0;
    }

    //MPI_Comm_split(MPI_COMM_WORLD, color, 0, &local_comm);
    SHMEM_activeset local_comm;
    local_comm.rank=new_peid;
    local_comm.size=comm_size;
    local_comm.start_pe=new_root;
    local_comm.logPE_stride=0;

    std::cout<<"world_rank="<<world_rank<<" -> group="<<my_group<<std::endl;

      
    snprintf(prefix,38,"Group %4d: ",my_group);

    // now we get ready to do some calculations...

    if(my_group>=0)
    {
      double energy;
      double band_energy;
      int static i_values[10];
      double static r_values[10];
      static int op;


      //MPI_Comm_rank(local_comm, &rank);
      rank = local_comm.rank;
      snprintf(prefix,38,"%d_",my_group);
      // to use the ramdisk on jaguarpf:
      // snprintf(prefix,38,"/tmp/ompi/%d_",my_group);
      LSMS lsms_calc(local_comm,i_lsms_name,prefix);
      snprintf(prefix,38,"Group %4d: ",my_group);

      if(rank==0 && my_group==0)
      {
        std::cout<<prefix<<"executing LSMS(C++) for "<<lsms_calc.numSpins()<<" atoms\n";
        std::cout<<prefix<<"  LSMS version = "<<lsms_calc.version()<<std::endl;
      }

      // wait for commands from master
      bool finished=false;
      while(!finished)
      {
        if(rank==0)
        {
          //MPI_Recv(evec,3*size_lsms,MPI_DOUBLE,0,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
          //op =status.MPI_TAG;
          if (lsms_rank0[0]==world_rank)
                shmem_barrier(0, lsms_rank0[0], 2, pSync1);

        }
        //MPI_Bcast(&op,1,MPI_INT,0,local_comm);
        shmem_broadcast32(&op, &op, 1, local_comm.start_pe, local_comm.start_pe, local_comm.logPE_stride, local_comm.size, pSync2); 

/* recognized opcodes:
   5: calculate energy

   recognized energy calculation modes:
   OneStepEnergy : calclulate frozen potential band energy in one step (don't converge Ef)
   use only if the Fermi energy will not change due to MC steps!
   The only method available in LSMS_1.9
   MultiStepEnergy : calculate frozen potential band energy after converging Fermi energy
   This should be the new default method. If the Fermi energy doesn't change
   multiStepEnergy only performs one step and should be equivalent to oneStepEnergy
   The tolerance for Ef convergence can be set with LSMS::setEfTol(Real).
   The default tolerance is set in the LSMS::LSMS constructor (currently 1.0e-6).
   The maximum number of steps is read from the LSMS input file 'nscf' parameter.
   ScfEnergy : this will calculate the selfconsistent total energy.
   The maximum number of steps is read from the LSMS input file 'nscf' parameter.
   NOT IMPLEMENTED YET!!!

   10: get number of sites
*/

        if(op==5)
        {
          lsms_calc.setEvec(evec);
          if(energyCalculationMode==OneStepEnergy)
            energy=lsms_calc.oneStepEnergy(&band_energy);
          else if(energyCalculationMode==MultiStepEnergy)
            band_energy=energy=lsms_calc.multiStepEnergy();
          else if(energyCalculationMode==ScfEnergy)
            energy=lsms_calc.scfEnergy(&band_energy);
          else
          {
            printf("ERROR: Unknown energy calculation mode for lsms_calc in wl-lsms main!\n");
            //MPI_Abort(MPI_COMM_WORLD,5);
            exit(5);
          }
          r_values[0]=energy;
          r_values[1]=band_energy;
          if(return_moments_flag)
          {
            lsms_calc.getMag(&r_values[R_VALUE_OFFSET]);
          }
          if(rank==0)
          {
            if(return_moments_flag)
            {
              //MPI_Send(r_values,R_VALUE_OFFSET+3*size_lsms,MPI_DOUBLE,0,1005,MPI_COMM_WORLD);
              shmem_double_put(r_values, r_values, R_VALUE_OFFSET+3*size_lsms, 0);

            } else {
              //MPI_Send(r_values,R_VALUE_OFFSET,MPI_DOUBLE,0,1005,MPI_COMM_WORLD);
              shmem_double_put(r_values, r_values, R_VALUE_OFFSET, 0);
            }
            shmem_fence();
            shmem_int_swap(&flag, world_rank, 0);

          }
              
        } else if(op==10) {
          i_values[0]=lsms_calc.numSpins();
          //MPI_Send(i_values,10,MPI_INT,0,1010,MPI_COMM_WORLD);
          shmem_int_put(i_values, i_values, 10, 0);
        } else {
          // printf("world rank %d: recieved exit\n",world_rank); 
          finished=true;
        }
      }

      shfree(evec);
      //shfree(r_values);
    }
    else if(world_rank==0)
    {
      int running;
      double **evecs;
      //double *r_values;
      //int i_values[10];
      int *init_steps;
      int total_init_steps;
      bool accepted;
        
      char *wl_inf=NULL;
      char *wl_outf=NULL;
      if(gWL_in_name) wl_inf=gWL_in_name;
      if(gWL_out_name) wl_outf=gWL_out_name;
        
      EvecGenerator *generator;

/*
      // get number of spins from first LSMS instance
      // temp r_values:
      r_values=(double *)malloc(sizeof(double)*10);
      MPI_Send(r_values,1,MPI_DOUBLE, lsms_rank0[0], 10, MPI_COMM_WORLD);
      free(r_values);
      MPI_Recv(i_values,10,MPI_INT,lsms_rank0[0],1010,MPI_COMM_WORLD,&status);
      if(i_values[0]!=size_lsms)
      {
        printf("Size specified for Wang-Landau and in LSMS input file don't match!\n");
        size_lsms=i_values[0];
      }
*/

      evecs=(double **)shmalloc(sizeof(double *)*num_lsms);
      init_steps=(int *)shmalloc(sizeof(int)*num_lsms);
      for(int i=0; i<num_lsms; i++)
      {
        evecs[i]=(double *)shmalloc(sizeof(double)*3*size_lsms);
        init_steps[i]=initial_steps;
      }
      total_init_steps=num_lsms*initial_steps;
        

      // Initialize the correct evec generator
      switch(evec_generation_mode)
      {
      case Random :  generator = new RandomEvecGenerator(size_lsms);
        break;
      case Constant: generator = new ConstantEvecGenerator(size_lsms, ev0, num_lsms);
        break;
     //case WangLandau_1d : generator = new WL1dEvecGenerator<std::mt19937>(size_lsms, num_lsms,
     //                                                                      evecs, wl_inf, wl_outf, wl_stepf);
     case WangLandau_1d : generator = new WL1dEvecGenerator<boost::mt19937>(size_lsms, num_lsms,
                                                                           evecs, wl_inf, wl_outf, wl_stepf);
        break;
      case ExhaustiveIsing : generator = new ExhaustiveIsing1dEvecGenerator(size_lsms, num_lsms,
                                                                            evecs, wl_inf, wl_outf);
        break;
      //case WangLandau_2d : generator = new WL2dEvecGenerator<std::mt19937>(size_lsms, num_lsms,
      //                                                                     evecs, wl_inf, wl_outf, wl_stepf);
      case WangLandau_2d : generator = new WL2dEvecGenerator<boost::mt19937>(size_lsms, num_lsms,
                                                                           evecs, wl_inf, wl_outf, wl_stepf);
        break;
      default: std::cerr<<"The code should never arrive here: UNKNOWN EVEC GENERATION MODE\n";
        exit(1);
      }

      for(int i=0; i<num_lsms; i++)
      {
        generator->initializeEvec(i,evecs[i]);
      }
      std::cout<<"This is the master node\n";
      // issue initial commands to all LSMS instances
      running=0;
      bool more_work=true;
      if(total_init_steps>0)
      {
        for(int i=0; i<num_lsms; i++)
        {
          std::cout<<"starting initial calculation in group "<<i<<std::endl;
          //MPI_Send(evecs[i], 3*size_lsms, MPI_DOUBLE, lsms_rank0[i], 5, MPI_COMM_WORLD);
          shmem_double_put(evec, evecs[i], 3*size_lsms, lsms_rank0[i]);
          shmem_int_p(&op, 5, lsms_rank0[i]);
          shmem_fence();


          num_steps--; running++; stepCount++;
          if(restrict_steps) std::cout<<"      "<<num_steps<<" steps remaining\n";
        }
        shmem_barrier(0, lsms_rank0[0], 2, pSync1);
        // first deal with the initial steps:
        while(running>0)
        {
          //if(return_moments_flag)
          //  MPI_Recv(r_values,R_VALUE_OFFSET+3*size_lsms,MPI_DOUBLE,MPI_ANY_SOURCE,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
          //else
          //  MPI_Recv(r_values,R_VALUE_OFFSET,MPI_DOUBLE,MPI_ANY_SOURCE,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
          
          shmem_int_wait(&flag,-1);

          running--;
          // std::cout<<"received energy E_tot ="<<r_values[0]<<std::endl;
          // std::cout<<"    band energy E_band="<<r_values[1]<<std::endl;
          if(total_init_steps>0)
          {
            //int r_group=(status.MPI_SOURCE-align)/comm_size;
            int r_group=(flag-align)/comm_size;
            std::cout<<"starting additional calculation in group "<<r_group<<std::endl;

            if(init_steps[r_group]>0)
            {
              more_work = !(generator->generateUnsampledEvec(r_group,evecs[r_group],r_values[energyIndex]));
              init_steps[r_group]--; total_init_steps--;
            }
                
            //MPI_Send(evecs[r_group], 3*size_lsms, MPI_DOUBLE, lsms_rank0[r_group], 5, MPI_COMM_WORLD);
            shmem_double_put(r_values, evecs[r_group],  3*size_lsms, lsms_rank0[r_group]); //TODO check this
            shmem_fence();
                
            num_steps--; running++; stepCount++;
            if(restrict_steps && num_steps<=0) more_work=false;
            if(restrict_steps) std::cout<<"      "<<num_steps<<" steps remaining\n";
            walltime = get_rtc() - walltime_0;
            if(restrict_time && walltime>=max_time) more_work=false;
            if(restrict_time) std::cout<<"      "<<max_time-walltime<<" seconds remaining\n";
          }
              
        }
      }
      more_work=true;
      running=0;
      for(int i=0; i<num_lsms; i++)
      {
        std::cout<<"starting main calculation in group "<<i<<std::endl;
        //MPI_Send(evecs[i], 3*size_lsms, MPI_DOUBLE, lsms_rank0[i], 5, MPI_COMM_WORLD);
        shmem_double_put(evec, evecs[i], 3*size_lsms, lsms_rank0[i]);
        shmem_int_p(&op, 5, lsms_rank0[i]);
        shmem_fence();
        num_steps--; running++; stepCount++;
        if(restrict_steps) std::cout<<"      "<<num_steps<<" steps remaining\n";
      }
      shmem_barrier(0, lsms_rank0[0], 2, pSync1);
        
      generator->startSampling();
      // wait for results and issue new commands or wind down
      while(running>0)
      {
        //MPI_Recv(r_values,R_VALUE_OFFSET+3*size_lsms,MPI_DOUBLE,MPI_ANY_SOURCE,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
        shmem_int_wait(&flag,-1);

        running--;
        std::cout<<"received energy E_tot ="<<r_values[0]<<std::endl;
        std::cout<<"    band energy E_band="<<r_values[1]<<std::endl;
        // printf("from status.MPI_SOURCE=%d\n",status.MPI_SOURCE);
        energy_accumulator+=r_values[0]; energies_accumulated++;
        if(more_work)
        {
          int r_group=(status.MPI_SOURCE-align)/comm_size;
          std::cout<<"starting additional calculation in group "<<r_group<<std::endl;
              
          if(generator_needs_moment)
          {
            double m0,m1,m2;
            m0=0.0; m1=0.0; m2=0.0;
            for(int i=0; i<3*size_lsms; i+=3)
            {
              m0+=r_values[R_VALUE_OFFSET+i];
              m1+=r_values[R_VALUE_OFFSET+i+1];
              m2+=r_values[R_VALUE_OFFSET+i+2];
            }
            switch(second_dimension)
            {
            case  MagneticMoment : magnetization=std::sqrt(m0*m0+m1*m1+m2*m2); break;
            case  MagneticMomentX : magnetization=m0; break;
            case  MagneticMomentY : magnetization=m1; break;
            case  MagneticMomentZ : magnetization=m2; break;
            }
            if(generator->generateEvec(r_group,evecs[r_group],r_values[energyIndex],magnetization, &accepted))
              more_work=false;
          } else {
            if(generator->generateEvec(r_group,evecs[r_group],r_values[energyIndex], &accepted)) more_work=false;
          }

          //MPI_Send(evecs[r_group], 3*size_lsms, MPI_DOUBLE, lsms_rank0[r_group], 5, MPI_COMM_WORLD);
          shmem_double_put(r_values, evecs[r_group],  3*size_lsms, lsms_rank0[r_group]); //TODO check this
          shmem_fence();

          num_steps--; running++; stepCount++;
          if(restrict_steps && num_steps<=0) more_work=false;
          if(restrict_steps) std::cout<<"      "<<num_steps<<" steps remaining\n";
          walltime = get_rtc() - walltime_0;
          if(restrict_time && walltime>=max_time) more_work=false;
          if(restrict_time) std::cout<<"      "<<max_time-walltime<<" seconds remaining\n";
        }
        else
        {
          // send an exit message to this instance of LSMS
          int r_group=(status.MPI_SOURCE-align)/comm_size;

          MPI_Send(evecs[r_group], 3*size_lsms, MPI_DOUBLE, lsms_rank0[r_group], 2, MPI_COMM_WORLD);
        }

        if(step_out_flag && accepted)
        {
          step_out_file<<"# iteration "<<energies_accumulated<<std::endl;
          step_out_file.precision(15);
          step_out_file<<energies_accumulated<<std::endl;
          step_out_file<<r_values[0]<<"  "<<r_values[1]<<std::endl;
          for(int j=0; j<3*size_lsms; j+=3)
          {
            step_out_file<<r_values[j+R_VALUE_OFFSET]<<"  "<<r_values[j+R_VALUE_OFFSET+1]
                         <<"  "<<r_values[j+R_VALUE_OFFSET+2]<<std::endl;
          }
        }
        // write restart file every restartWriteFrequency seconds
        if(walltime>nextWriteTime)
        {
          generator->writeState("WLrestart.jsn");
          nextWriteTime+=restartWriteFrequency;
        }

      }
      generator->writeState("WLrestart.jsn");
/*
  if(evec_generation_mode==WangLandau_1d)
  (static_cast<WL1dEvecGenerator<std::mt19937> *>(generator))->writeState("WLrestart.state");
  if(evec_generation_mode==ExhaustiveIsing)
  (static_cast<ExhaustiveIsing1dEvecGenerator *>(generator))->writeState("WLrestart.state");
*/
      for(int i=0; i<num_lsms; i++) free(evecs[i]);
      shfree(evecs);
      //shfree(r_values);
    }
  }

  if(world_rank==0)
  {
    if(step_out_flag)
    {
      step_out_file<<"# end\n-1\n"
                   <<energy_accumulator/double(energies_accumulated)<<std::endl;
      step_out_file.close();
    }
    std::cout<<"Finished all scheduled calculations. Freeing resources.\n";
    std::cout<<"Energy mean = "<<energy_accumulator/double(energies_accumulated)<<"Ry\n";
  }


  if(num_lsms>1)
  {
    // make sure averyone arrives here:
    MPI_Bcast(stupid,37,MPI_CHAR,0,MPI_COMM_WORLD);

    if(world_rank==0)
    {
      MPI_Comm_free(&local_comm);
    }
    else if(my_group>=0)
    {
      MPI_Comm_free(&local_comm);
    }
  }



  if(world_rank==0)
  {
    double walltime = get_rtc() - walltime_0;
    std::cout<<" WL-LSMS finished in "<<walltime<<" seconds.\n";
    std::cout<<" Monte-Carlo steps / walltime = "
             <<double(stepCount)/walltime<<"/sec\n";
  }

#ifdef USE_PAPI
  PAPI_stop_counters(papi_values,hw_counters);
  papi_values[hw_counters  ] = PAPI_get_real_cyc()-papi_real_cyc_0;
  papi_values[hw_counters+1] = PAPI_get_real_usec()-papi_real_usec_0;
  papi_values[hw_counters+2] = PAPI_get_virt_cyc()-papi_virt_cyc_0;
  papi_values[hw_counters+3] = PAPI_get_virt_usec()-papi_virt_usec_0;
  long long accumulated_counters[NUM_PAPI_EVENTS+4];
/*
  for(int i=0; i<hw_counters; i++)
  {
  printline(ttos(papi_event_name[i])+" = "+ttos(papi_values[i]),
  std::cout,parameters.myrankWorld);
  }
  printline("PAPI real cycles : "+ttos(papi_values[hw_counters]),
  std::cout,parameters.myrankWorld);
  printline("PAPI real usecs : "+ttos(papi_values[hw_counters+1]),
  std::cout,parameters.myrankWorld);
  printline("PAPI user cycles : "+ttos(papi_values[hw_counters+2]),
  std::cout,parameters.myrankWorld);
  printline("PAPI user usecs : "+ttos(papi_values[hw_counters+3]),
  std::cout,parameters.myrankWorld);
*/
  
  //MPI_Reduce(papi_values,accumulated_counters,hw_counters+4,
  //           MPI_LONG,MPI_SUM,0,MPI_COMM_WORLD);

  shmem_long_sum_to_all(accumulated_counters, papi_values, hw_counters+4,
      comm.pestart, comm.logPE_stride, comm.size, pWrk_i, pSync2);



  if(world_rank==0)
  {
    for(int i=0; i<hw_counters; i++)
    {
      std::cout<<"Accumulated: "<<(papi_event_name[i])<<" = "<<(accumulated_counters[i])<<"\n";
    }
    std::cout<<"PAPI accumulated real cycles : "<<(accumulated_counters[hw_counters])<<"\n";
    std::cout<<"PAPI accumulated user cycles : "<<(accumulated_counters[hw_counters+2])<<"\n";
    double gflops_papi = ((double)accumulated_counters[1])/
      (1000.0*(double)papi_values[hw_counters+1]);
    double gflops_hw_double = ((double)accumulated_counters[2])/
      (1000.0*(double)papi_values[hw_counters+1]);
    double gflops_hw_single = ((double)accumulated_counters[3])/
      (1000.0*(double)papi_values[hw_counters+1]);
    double gips = ((double)accumulated_counters[0])/(1000.0*(double)papi_values[hw_counters+1]);
    std::cout<<"PAPI_FP_OPS real GFLOP/s : "<<(gflops_papi)<<"\n";
    std::cout<<"PAPI hw double real GFLOP/s : "<<(gflops_hw_double)<<"\n";
    std::cout<<"PAPI hw single real GFLOP/s : "<<(gflops_hw_single)<<"\n";
    std::cout<<"PAPI real GINST/s : "<<(gips)<<"\n";
  }
#endif


  //MPI_Finalize();
  return 0;
}
예제 #30
0
void _PERM_DR(const _permmap* const pm, _permdata* const pd, const int scatter, _array_fnc dst, _array_fnc src) {
  const int one = 1;
  const int eltsize = pd->eltsize;
  int i;
  int * const restrict ldecnt = (int*)_zmalloc(_PROCESSORS*sizeof(int), "perm dr lcnt");
  int * const restrict rdecnt = (int*)_zmalloc(_PROCESSORS*sizeof(int), "perm dr rcnt");
  char * const restrict * const restrict ldata = pd->ldata;
  char * const restrict * const restrict rdata = pd->rdata;
  char * restrict * const restrict rptr = pm->rptr;
  int * restrict rflag = pm->rflag;
  char* addr;

#ifdef _SHMEM_PERMUTE_DEBUG
  printf("DR start %d\n", _INDEX);
  fflush(stdout);
  sleep(5);
#endif
  for (i=0; i<_PROCESSORS; i++) {
    ldecnt[i] = (scatter ? _PERM_LCNT(pm, i) : _PERM_RCNT(pm, i)) * eltsize;
    rdecnt[i] = (scatter ? _PERM_RCNT(pm, i) : _PERM_LCNT(pm, i)) * eltsize;
  }
  for (i = (_INDEX == _PROCESSORS - 1) ? 0 : _INDEX+1;
       i != _INDEX;
       i = (i == _PROCESSORS - 1) ? 0 : i++) {
    if (rdecnt[i] > 0) {
#ifdef _SHMEM_PERMUTE_DEBUG
      printf("DR %d sending addr, %d, to %d\n", _INDEX, (int)&(rdata[i]), i);
  fflush(stdout);
  sleep(5);
#endif
      rflag[i] = 0;
      shmem_put((void*)&(rptr[_INDEX]), (void*)&(rdata[i]), 1, i);
    }
  }
  for (i = (_INDEX == 0) ? _PROCESSORS-1 : _INDEX-1;
       i != _INDEX;
       i = (i == 0) ? _PROCESSORS-1 : i--) {
    if (ldecnt[i] > 0) {
      shmem_wait((long*)&(rptr[i]), 0);
      addr = rptr[i];
      rptr[i] = 0;
#ifdef _SHMEM_PERMUTE_DEBUG
      printf("DR %d waiting addr, %d, from %d\n", _INDEX, (int)addr, i);
  fflush(stdout);
  sleep(5);
#endif
      shmem_putmem(addr, ldata[i], ldecnt[i], i);
    }
  }
  shmem_fence();
  for (i = (_INDEX == 0) ? _PROCESSORS-1 : _INDEX-1;
       i != _INDEX;
       i = (i == 0) ? _PROCESSORS-1 : i--) {
    if (ldecnt[i] > 0) {
#ifdef _SHMEM_PERMUTE_DEBUG
      printf("DR %d sending flag to %d\n", _INDEX, i);
  fflush(stdout);
  sleep(5);
#endif
      shmem_int_put(&(rflag[_INDEX]), &one, 1, i);
    }
  }
  if (ldecnt[_INDEX] > 0) {
    memcpy(rdata[_INDEX], ldata[_INDEX], ldecnt[_INDEX]);
  }
  _zfree(ldecnt, "perm dr lcnt");
  _zfree(rdecnt, "perm dr rcnt");
  pd->count = -1;
}