示例#1
0
/*
 * Flat-tree max reduction rooted at rank 0.
 *
 * Rank 0 fetches `source` from every PE with shmem_getmem, folds the
 * fetched vectors element-wise with REDUCE_MAX, and then writes the result
 * into `target` on every PE with shmem_putmem.
 *
 *   target  - destination of the reduced vector on every PE
 *   source  - this PE's contribution, read remotely by rank 0
 *   nreduce - number of STREAM_TYPE elements to reduce
 *
 * Every PE must call this function: it ends with shmem_barrier_all().
 * NOTE(review): there is no barrier before the gather, so callers
 * presumably synchronise before calling -- confirm.
 */
void
flat_tree (STREAM_TYPE * target, STREAM_TYPE * source, int nreduce)
{
  STREAM_TYPE *acc = NULL;	/* running element-wise maximum */
  STREAM_TYPE *fetched = NULL;	/* one remote PE's contribution */

  /* only one PE needs to access this section */
  if (_world_rank == 0 && nreduce > 0)
    {
      const size_t nbytes = nreduce * sizeof (STREAM_TYPE);

      /* The accumulator and the fetch buffer must be distinct: fetching
         into the accumulator would wipe the running result each round. */
      acc = (STREAM_TYPE *) malloc (nbytes);
      fetched = (STREAM_TYPE *) malloc (nbytes);
      if (acc == NULL || fetched == NULL)
	{
	  /* nothing sensible can be reduced without scratch space */
	  abort ();
	}

      /* First, gather: seed with PE 0's data, then fold in the others */
      shmem_getmem (acc, source, nbytes, 0);
      for (int n = 1; n < _world_size; n++)
	{
	  shmem_getmem (fetched, source, nbytes, n);
	  /* Compute max */
	  for (int k = 0; k < nreduce; k++)
	    {
	      acc[k] = REDUCE_MAX (acc[k], fetched[k]);
	    }
	}
      /* Then, broadcast results */
      for (int n = 0; n < _world_size; n++)
	{
	  shmem_putmem (target, acc, nbytes, n);
	}
    }
  shmem_barrier_all ();

  free (fetched);
  free (acc);

  return;
}
示例#2
0
文件: msgrate.c 项目: jpdoyle/SOS
/*
 * Timed "pre-post" exchange: in each of niters - 1 rounds this PE puts
 * nmsgs messages of nbytes to each of its npeers send peers, waits for its
 * own last receive slot to become non-zero, and adds the elapsed time to
 * `total`.  The per-PE totals are then summed across all PEs and reported.
 */
static void
test_prepost(void)
{
    int round, peer_idx, msg_idx;

    total = 0;
    tmp = 0;

    shmem_barrier_all();

    round = 0;
    while (round < niters - 1) {
        cache_invalidate();

        shmem_barrier_all();

        tmp = timer();
        for (peer_idx = 0 ; peer_idx < npeers ; ++peer_idx) {
            /* peers are walked in reverse order of the send_peers table */
            for (msg_idx = 0 ; msg_idx < nmsgs ; ++msg_idx) {
                shmem_putmem(recv_buf + (nbytes * (msg_idx + peer_idx * nmsgs)),
                             send_buf + (nbytes * (msg_idx + peer_idx * nmsgs)),
                             nbytes, send_peers[npeers - peer_idx - 1]);
            }
        }
        shmem_quiet();
        /* block until the very last receive slot has been written */
        shmem_short_wait((short*) (recv_buf + (nbytes * ((nmsgs - 1) + (npeers - 1) * nmsgs))), 0);
        total += (timer() - tmp);

        /* clear the receive area so the next round's wait starts from 0 */
        memset(recv_buf, 0, npeers * nmsgs * nbytes);

        ++round;
    }

    shmem_double_sum_to_all(&tmp, &total, 1, 0, 0, world_size, reduce_pWrk, reduce_pSync);
    display_result("pre-post", (niters * npeers * nmsgs * 2) / (tmp / world_size));
}
示例#3
0
文件: shmem.c 项目: carriercomm/ix
/*
 * Put p->bufflen bytes from p->s_ptr to the same address on the
 * neighbour PE p->prot.nbor.  Lengths that are a multiple of 8 go through
 * shmem_put64 (element count = bufflen / 8); anything else falls back to
 * the byte-granular shmem_putmem.
 */
void SendData(ArgStruct *p)
{
   if (p->bufflen % 8 != 0) {
      shmem_putmem(p->s_ptr, p->s_ptr, p->bufflen, p->prot.nbor);
      return;
   }

   shmem_put64(p->s_ptr, p->s_ptr, p->bufflen / 8, p->prot.nbor);
}
示例#4
0
/*
 * Run the put message-rate benchmark for every power-of-two message size
 * from 1 to MAX_MSG_SZ.
 *
 *   v          - per-PE settings (me, pairs, nxtpe, npes are read here)
 *   msg_buffer - buffer used as both source and destination of the puts
 *
 * For each size the local rate from message_rate() is summed over all PEs
 * with shmem_double_sum_to_all and handed to print_message_rate().
 */
void
benchmark (struct pe_vars v, char * msg_buffer)
{
    /* NOTE(review): pwrk is sized with the sync-array constant; the
     * reduction work array normally has its own minimum-size constant --
     * confirm against the SHMEM headers in use. */
    static double pwrk[_SHMEM_REDUCE_SYNC_SIZE];
    /* psync is used by a reduction, so it is sized for reductions */
    static long psync[_SHMEM_REDUCE_SYNC_SIZE];
    static double mr, mr_sum;
    unsigned long size, i;

    /* Assign per element: memset would replicate only the low byte of
     * _SHMEM_SYNC_VALUE into every byte of each long. */
    for (i = 0; i < sizeof(psync) / sizeof(psync[0]); i++) {
        psync[i] = _SHMEM_SYNC_VALUE;
    }

    /*
     * Warmup
     */
    if (v.me < v.pairs) {
        for (i = 0; i < (ITERS_LARGE * MAX_MSG_SZ); i += MAX_MSG_SZ) {
            shmem_putmem(&msg_buffer[i], &msg_buffer[i], MAX_MSG_SZ, v.nxtpe);
        }
    }

    /* also guarantees every PE's psync is initialised before first use */
    shmem_barrier_all();

    /*
     * Benchmark
     */
    for (size = 1; size <= MAX_MSG_SZ; size <<= 1) {
        /* fewer iterations once messages reach LARGE_THRESHOLD */
        i = size < LARGE_THRESHOLD ? ITERS_SMALL : ITERS_LARGE;

        mr = message_rate(v, msg_buffer, size, i);
        shmem_double_sum_to_all(&mr_sum, &mr, 1, 0, 0, v.npes, pwrk, psync);
        print_message_rate(v.me, size, mr_sum);
    }
}
示例#5
0
/*
 * Time `iterations` back-to-back puts of `size` bytes to v.nxtpe and
 * return the rate as iterations * 1e6 / (end - begin), using TIME() for
 * both timestamps.  PEs with v.me >= v.pairs only take part in the
 * barrier and return 0.
 */
double
message_rate (struct pe_vars v, char * buffer, int size, int iterations)
{
    int64_t t_start, t_stop;
    int sent, pos;

    /* Touch memory */
    memset(buffer, size, MAX_MSG_SZ * ITERS_LARGE);

    shmem_barrier_all();

    if (!(v.me < v.pairs)) {
        return 0;
    }

    t_start = TIME();

    pos = 0;
    for (sent = 0; sent < iterations; sent++) {
        /* each message uses its own `size`-byte slice of the buffer */
        shmem_putmem(&buffer[pos], &buffer[pos], size, v.nxtpe);
        pos += size;
    }

    t_stop = TIME();

    return ((double)iterations * 1e6) / ((double)t_stop - (double)t_start);
}
/*
 * Drive the atomic-operation benchmarks.
 *
 * Seeds rand() with the PE number, lets the first v.pairs PEs warm up with
 * ITERATIONS int-sized puts to v.nxtpe, then runs each atomic benchmark in
 * turn: first the default-width variants, then the long long variants.
 */
void
benchmark (struct pe_vars v, union data_types *msg_buffer)
{
    srand(v.me);

    /* Warmup with puts */
    if (v.me < v.pairs) {
        unsigned long n = 0;

        while (n < ITERATIONS) {
            shmem_putmem(&msg_buffer[n].int_type, &msg_buffer[n].int_type,
                    sizeof(int), v.nxtpe);
            n++;
        }
    }

    /* Performance with atomics: default-width operations */
    benchmark_fadd(v, msg_buffer, ITERATIONS);
    benchmark_finc(v, msg_buffer, ITERATIONS);
    benchmark_add(v, msg_buffer, ITERATIONS);
    benchmark_inc(v, msg_buffer, ITERATIONS);
    benchmark_cswap(v, msg_buffer, ITERATIONS);
    benchmark_swap(v, msg_buffer, ITERATIONS);

    /* Performance with atomics: long long operations */
    benchmark_fadd_longlong(v, msg_buffer, ITERATIONS);
    benchmark_finc_longlong(v, msg_buffer, ITERATIONS);
    benchmark_add_longlong(v, msg_buffer, ITERATIONS);
    benchmark_inc_longlong(v, msg_buffer, ITERATIONS);
    benchmark_cswap_longlong(v, msg_buffer, ITERATIONS);
    benchmark_swap_longlong(v, msg_buffer, ITERATIONS);
}
示例#7
0
// Send every t-matrix listed in comm.tmatTo[] to its remote node using
// one-sided SHMEM puts (this replaces the MPI_Isend kept in the comment
// below).  Each transfer is 2*local.lDimTmatStore*double_size bytes.
//
// NOTE(review): temp_buff is obtained from shmalloc on every call and is
// not released anywhere in this function -- confirm who frees it.
// NOTE(review): no shmem_quiet/fence/barrier is issued here, so remote
// completion is presumably handled by the caller -- confirm.
void sendTmats(LSMSCommunication &comm, LocalTypeInfo &local)
{

  // Symmetric buffer sized for one t-matrix; it is the *remote* destination
  // address of every put below.
  void * temp_buff=(void*)shmalloc(2*local.lDimTmatStore*double_size);
  for(int i=0; i<comm.numTmatTo; i++)
  {
    // PE that receives the t-matrices of entry i
    int to=comm.tmatTo[i].remoteNode;
    for(int j=0; j<comm.tmatTo[i].numTmats; j++)
    {
      // printf("Node %d: send tmat %d to %d\n",comm.rank,comm.tmatTo[i].globalIdx[j],to);
      /*
      MPI_Isend(&local.tmatStore(0,comm.tmatTo[i].tmatStoreIdx[j]),2*local.lDimTmatStore,
                MPI_DOUBLE,to,comm.tmatTo[i].globalIdx[j],comm.comm,
                &comm.tmatTo[i].communicationRequest[j]);
      */
      // Assuming comm.tmatTo[i].numTmats == comm.tmatFrom[i].numTmats
      // If not ... check the iteration space mapping between
      //    the receivers (above) and senders (here)
      // Copies the local column indexed through tmatFrom[i] into the local
      // copy of temp_buff.
      // NOTE(review): this indexes tmatFrom with the tmatTo loop bounds and
      // the copied data is not the source of the put below -- verify intent.
      memcpy(temp_buff,&local.tmatStore(0,comm.tmatFrom[i].tmatStoreIdx[j]),2*local.lDimTmatStore*double_size);
      // Note: there is a trade-off here: either reuse the same
      // buffer and enjoy reduced cache misses (and potential
      // power savings due to less data xfers between memories)
      // or enjoy non-blocking communication with different buffers
      // but at the cost of multiple cache misses (and potentially
      // high power consumption)
      // Put the column indexed through tmatTo[i] into temp_buff on PE `to`;
      // every j targets the same remote address.
      shmem_putmem(temp_buff, &local.tmatStore(0,comm.tmatTo[i].tmatStoreIdx[j]), 2*local.lDimTmatStore*double_size, to);
    }
  }
}
/*
 * Fence test: for doubling sizes, fill this PE's slice of a symmetric
 * buffer with a PE-dependent byte, put `size` bytes of it to the next PE,
 * call shmem_fence(), then read the last byte of the put back from that PE
 * and compare it with the fill value.
 *
 * Returns TC_PASS, TC_FAIL if any size reads back a wrong byte, or
 * TC_SETUP_FAIL if the symmetric buffer cannot be allocated.
 */
static int test_item1(void)
{
    int rc = TC_PASS;
    int num_proc = 0;
    int my_proc = 0;
    int peer;
    int size;
    char *buf;
    /* exactly one byte is fetched, so the landing object is one byte wide;
     * reading it into an int would make the result depend on endianness */
    unsigned char test_byte;
    unsigned char expected;
    int max_heap_size_per_proc;

    num_proc = _num_pes();
    my_proc = _my_pe();
    peer = (my_proc + 1) % num_proc;

    max_heap_size_per_proc = 1L << (sys_log2((memheap_size() * HEAP_USAGE_PERCENT)/ num_proc) - 1);
    max_heap_size_per_proc = (max_heap_size_per_proc > MAX_SIZE) ? MAX_SIZE : max_heap_size_per_proc;
    buf = (char *)shmalloc(max_heap_size_per_proc * num_proc);
    if (!buf)
    {
        log_error(OSH_TC, "shmalloc(%d)\n", max_heap_size_per_proc * num_proc);
        return TC_SETUP_FAIL;
    }

    size = 1L << sys_log2(num_proc);
    /* start at 4 or more so that (size - 2) below is never zero */
    size = ((size - 2) > 0) ? size : 4;
    log_debug(OSH_TC, "%d: buf = %p size=%d\n", my_proc, buf, size);
    for (; size <= max_heap_size_per_proc; size *=2)
    {
        /* memset stores its value converted to unsigned char, so the value
         * to expect back is the truncated one */
        expected = (unsigned char)(1 + my_proc % (size - 2));

        memset(buf + max_heap_size_per_proc * my_proc, 1 + my_proc % (size - 2), max_heap_size_per_proc);
        log_debug(OSH_TC, "\n%d: b4 barrier size = %d\n", my_proc, size);
        shmem_barrier_all();
        log_debug(OSH_TC, "%d: b4 putmem size = %d  %p -> %p\n", my_proc, size,
                buf+max_heap_size_per_proc*my_proc, buf + max_heap_size_per_proc * my_proc);
        shmem_putmem(buf+max_heap_size_per_proc*my_proc, buf+max_heap_size_per_proc*my_proc, size, peer);
        shmem_fence();
        test_byte = 0;
        /* log the address that is actually read below (this PE's slice) */
        log_debug(OSH_TC, "%d: b4 getmem size = %d\n %p <- %p ", my_proc, size,
                (void *)&test_byte,
                buf+max_heap_size_per_proc*my_proc + size - 1
                );
        shmem_getmem(&test_byte, buf+max_heap_size_per_proc*my_proc + size - 1, 1, peer);

        log_debug(OSH_TC, "%d: after getmem size = %d result=%x\n", my_proc, size, test_byte);
        if (test_byte != expected)
        {
            log_error(OSH_TC, "fence failed at size %d got = %x expected = %x\n", size, test_byte, expected);
            rc = TC_FAIL;
        }

    }

    shfree(buf);
    log_debug(OSH_TC, rc == TC_PASS? "passed" : "failed");
    return rc;
}
示例#9
0
文件: msgrate.c 项目: jpdoyle/SOS
static void
test_one_way(void)
{
    int i, k;
    int pe_size  = world_size;

    tmp = 0;
    total = 0;

    shmem_barrier_all();

    if (world_size % 2 == 1) {
        pe_size = world_size - 1;
    }

    if (!(world_size % 2 == 1 && rank == (world_size - 1))) {
        if (rank < world_size / 2) {
            for (i = 0 ; i < niters ; ++i) {
                cache_invalidate();

                shmem_barrier(0, 0, pe_size, barrier_pSync);

                tmp = timer();
                for (k = 0 ; k < nmsgs ; ++k) {
                    shmem_putmem(recv_buf + (nbytes * k), 
                                 send_buf + (nbytes * k), 
                                 nbytes, rank + (world_size / 2));
                }
                shmem_quiet();
                total += (timer() - tmp);
            }
        } else {
            for (i = 0 ; i < niters ; ++i) {
                cache_invalidate();

                shmem_barrier(0, 0, pe_size, barrier_pSync);

                tmp = timer();
                shmem_short_wait((short*) (recv_buf + (nbytes * (nmsgs - 1))), 0);
                total += (timer() - tmp);
                memset(recv_buf, 0, npeers * nmsgs * nbytes);
            }
        }

        shmem_double_sum_to_all(&tmp, &total, 1, 0, 0, pe_size, reduce_pWrk, reduce_pSync);
        display_result("single direction", (niters * nmsgs) / (tmp / world_size));
    }

    shmem_barrier_all();
}
示例#10
0
/*
 * Flat-tree max reduction rooted at rank 0.
 *
 * After an initial barrier, rank 0 fetches `source` from every PE with
 * shmem_getmem, folds the fetched vectors element-wise with REDUCE_MAX,
 * and then writes the result into `target` on every PE with shmem_putmem.
 *
 *   target  - destination of the reduced vector on every PE
 *   source  - this PE's contribution, read remotely by rank 0
 *   nreduce - number of STREAM_TYPE elements to reduce
 *
 * Every PE must call this function: it contains two shmem_barrier_all()
 * calls.
 */
void
flat_tree (STREAM_TYPE * target, STREAM_TYPE * source, int nreduce)
{
  /* use temp buffers in case source/target overlap/same */
  STREAM_TYPE *acc = NULL;	/* running element-wise maximum */
  STREAM_TYPE *fetched = NULL;	/* one remote PE's contribution */

  /* make sure every PE has reached the reduction before source is read */
  shmem_barrier_all ();

  // only one PE needs to access this section
  if (_world_rank == 0 && nreduce > 0)
    {
      const size_t nbytes = nreduce * sizeof (STREAM_TYPE);

      /* The accumulator and the fetch buffer must be distinct: fetching
         into the accumulator would wipe the running result each round. */
      acc = (STREAM_TYPE *) malloc (nbytes);
      fetched = (STREAM_TYPE *) malloc (nbytes);
      if (acc == NULL || fetched == NULL)
	{
	  /* nothing sensible can be reduced without scratch space */
	  abort ();
	}

      /* First, gather: seed with PE 0's data, then fold in the others */
      shmem_getmem (acc, source, nbytes, 0);
      for (int n = 1; n < _world_size; n++)
	{
	  shmem_getmem (fetched, source, nbytes, n);
	  /* Compute max */
	  for (int k = 0; k < nreduce; k++)
	    {
	      acc[k] = REDUCE_MAX (acc[k], fetched[k]);
	    }
	}
      /* Then, broadcast results */
      for (int n = 0; n < _world_size; n++)
	{
	  shmem_putmem (target, acc, nbytes, n);
	}
    }
  shmem_barrier_all ();

  free (fetched);
  free (acc);

  return;
}
示例#11
0
/**
 * \brief SHMEM data-integrity check: write every byte pattern to the stage
 *        partner and read it back.
 *
 * For each cycle and stage the partner is my_rank XOR the stage index.  When
 * that partner exists and is not this rank, each of the 256 byte values is
 * written across a buffer, put to the partner, fenced, fetched back and
 * compared byte by byte; every mismatch is printed.  Compiled to a no-op
 * unless SHMEM is defined.
 *
 * \param tst Supplies num_cycles and num_stages.
 * \param m   Supplies buflen, the number of bytes exchanged.
 */
void bit_SHMEM_test(test_p tst, measurement_p m) {
#ifdef SHMEM
	buffer_t *send_buf, *remote_buf, *check_buf;
	int pos, value, cycle, stage, partner;
	unsigned char pattern;

	/* set up exchange buffers */
	send_buf = comm_newbuffer(m->buflen);
	remote_buf = comm_newbuffer(m->buflen);
	check_buf = comm_newbuffer(m->buflen);

	for (cycle = 0; cycle < tst->num_cycles; cycle++) {
		for (stage = 0; stage < tst->num_stages; stage++) {
			partner = my_rank ^ stage;
			shmem_barrier_all();

			/* skip stages whose partner is out of range or is this rank */
			if (!((partner < num_ranks) && (partner != my_rank))) {
				continue;
			}

			for (value = 0x00; value < 0x100; value++) {
				pattern = value;

				/* fill the outgoing buffer with the pattern */
				pos = 0;
				while (pos < m->buflen) {
					((unsigned char *)(send_buf->data))[pos] = pattern;
					pos++;
				}

				/* round trip: put to the partner, fence, fetch it back */
				shmem_putmem(remote_buf->data, send_buf->data, m->buflen, partner);
				shmem_fence();
				shmem_getmem(check_buf->data, remote_buf->data, m->buflen, partner);

				for (pos = 0; pos < m->buflen; pos++) {
					if (((unsigned char *)(check_buf->data))[pos] == pattern) {
						continue;
					}
					printf("DATA ERROR DETECTED:   node:%20s   rank:%10d"
						"   pattern:0x%2x   buflen:%10d   position:%10d\n",
						nodename, my_rank, (int)pattern, m->buflen, pos);
				}
			}
		}
	}

	shmem_barrier_all();
	/* release in reverse order of allocation */
	comm_freebuffer(check_buf);
	comm_freebuffer(remote_buf);
	comm_freebuffer(send_buf);
#endif
}
/*
 * Put ping test between PE 1 and PE 3.
 *
 * For every power-of-two buffer size up to MAX_MSG_SIZE the two PEs run
 * ITER_CNT iterations of: put the buffer, put the iteration counter into
 * the peer's `isdone`, wait for the own `isdone` to change, put the buffer
 * again.  PE 1 prints one result line per size.  All other PEs only take
 * part in the global barriers.
 *
 * Returns 0 on success, 1 when fewer than 4 PEs are available (PE 3 would
 * not exist, so the pairwise barrier and the puts would target an invalid
 * PE).
 */
int main(int argc, char **argv)
{
  const int ITER_CNT = 100;
  const long int MAX_MSG_SIZE = 1048576;
  int* source_addr;
  int peer = -1;			/* only meaningful on PE 1 and PE 3 */
  long int i=0, buff_size;
  int j=0;
  long long int start_time, stop_time, res;
  double elapsed;			/* seconds per iteration */

  shmem_init();

  int pe_id = shmem_my_pe();
  int npes = shmem_n_pes();

  /* Every PE takes the same branch here, so bailing out is collective. */
  if(npes < 4) {
      if(pe_id == 0)
      	fprintf(stderr,"Num PEs should be ==4");
      shmem_finalize();
      return 1;
  }

  source_addr = (int*) shmem_malloc(MAX_MSG_SIZE);

  if(pe_id == 1) {
      if(npes!=4)
      	fprintf(stderr,"Num PEs should be ==4");
      printf("#Message Cnt;Time(s);MR(msgs/sec)\n");
  }

  if (pe_id==1)
	  peer = 3;
  else if(pe_id==3)
	  peer = 1;
  get_rtc_res_(&res);

  for (i = 0; i < SHMEM_BARRIER_SYNC_SIZE; i += 1){
          pSync[i] = SHMEM_SYNC_VALUE;
  }

  /* Make sure pSync is initialised everywhere before the pairwise barrier */
  shmem_barrier_all();
  if(pe_id == 1 || pe_id == 3) {

  	for(buff_size=1; buff_size<=MAX_MSG_SIZE; buff_size*=2) {
	    isdone=0;
	    /* active set {1, 3}: start PE 1, log2 stride 1, 2 PEs */
  	    shmem_barrier(1,1,2,pSync);
  	    get_rtc_(&start_time);
  	    for(j=1;j<=ITER_CNT;j++) {
		    shmem_putmem(source_addr, source_addr, buff_size, peer);
  	            shmem_quiet();
		    /* tell the peer this iteration's payload is complete */
		    shmem_int_put(&isdone, &j, 1, peer);
  	            shmem_quiet();
		    /* wait until the peer has signalled iteration j as well */
		    shmem_int_wait(&isdone,j-1);
		    shmem_putmem(source_addr, source_addr, buff_size, peer);
  	            shmem_quiet();
  	    }
  	    shmem_barrier(1,1,2,pSync);
  	    get_rtc_(&stop_time);
  	    elapsed = (stop_time - start_time)*1.0/(double)res/ITER_CNT;
  	    if(pe_id == 1) {
  	   	 printf("%20ld;%20.12f;%20.12f\n",
  	                buff_size, elapsed, (double)buff_size/elapsed);
  	    }
  	    fflush(stdout);
  	}
  }

  shmem_barrier_all();
  shmem_finalize();

  return 0;
}
/* Print the verdict line for one routine; `failed` is non-zero on mismatch. */
static void
report_result (const char *name, int failed)
{
    printf ("Test %s: %s\n", name, failed ? "Failed" : "Passed");
}

/*
 * Exercise the SHMEM put family.  Every PE puts its own PE number into
 * symmetric buffers on the next PE (ring order); PE 0 then checks that it
 * received the number of the last PE and prints one Passed/Failed line per
 * routine.  Needs at least two PEs, otherwise the test is skipped.
 *
 * The successN flags are 0 while a test is passing and set to 1 on the
 * first mismatch.
 */
int
main (int argc, char **argv)
{
    int i;
    int nextpe;
    int me, npes;
    int success1, success2, success3, success4, success5, success6, success7,
        success8;

    short src1[N];
    int src2[N];
    long src3[N];
    long double src4[N];
    long long src5[N];
    double src6[N];
    float src7[N];
    char *src8;
    short src9;
    int src10;
    long src11;
    double src12;
    float src13;

    short *dest1;
    int *dest2;
    long *dest3;
    long double *dest4;
    long long *dest5;
    double *dest6;
    float *dest7;
    char *dest8;
    short *dest9;
    int *dest10;
    long *dest11;
    double *dest12;
    float *dest13;


    shmem_init ();
    me = shmem_my_pe ();
    npes = shmem_n_pes ();

    if (npes > 1) {

        success1 = 0;
        success2 = 0;
        success3 = 0;
        success4 = 0;
        success5 = 0;
        success6 = 0;
        success7 = 0;
        success8 = 0;
        src8 = (char *) malloc (N * sizeof (char));
        if (src8 == NULL) {
            fprintf (stderr, "malloc of putmem source buffer failed\n");
            abort ();
        }

        /* every source element carries this PE's number */
        for (i = 0; i < N; i += 1) {
            src1[i] = (short) me;
            src2[i] = me;
            src3[i] = (long) me;
            src4[i] = (long double) me;
            src5[i] = (long long) me;
            src6[i] = (double) me;
            src7[i] = (float) me;
            src8[i] = (char) me;
        }
        src9 = (short) me;
        src10 = me;
        src11 = (long) me;
        src12 = (double) me;
        src13 = (float) me;


        dest1 = (short *) shmem_malloc (N * sizeof (*dest1));
        dest2 = (int *) shmem_malloc (N * sizeof (*dest2));
        dest3 = (long *) shmem_malloc (N * sizeof (*dest3));
        dest4 = (long double *) shmem_malloc (N * sizeof (*dest4));
        dest5 = (long long *) shmem_malloc (N * sizeof (*dest5));
        dest6 = (double *) shmem_malloc (N * sizeof (*dest6));
        dest7 = (float *) shmem_malloc (N * sizeof (*dest7));
        /* N elements are initialised, received and checked below, so the
           buffer must hold N of them (it used to be a fixed 4 bytes) */
        dest8 = (char *) shmem_malloc (N * sizeof (*dest8));
        dest9 = (short *) shmem_malloc (sizeof (*dest9));
        dest10 = (int *) shmem_malloc (sizeof (*dest10));
        dest11 = (long *) shmem_malloc (sizeof (*dest11));
        dest12 = (double *) shmem_malloc (sizeof (*dest12));
        dest13 = (float *) shmem_malloc (sizeof (*dest13));

        /* -9 is the "nothing received yet" marker */
        for (i = 0; i < N; i += 1) {
            dest1[i] = -9;
            dest2[i] = -9;
            dest3[i] = -9;
            dest4[i] = -9;
            dest5[i] = -9;
            dest6[i] = -9;
            dest7[i] = -9.0;
            dest8[i] = -9;
        }
        *dest9 = -9;
        *dest10 = -9;
        *dest11 = -9;
        *dest12 = -9;
        *dest13 = -9.0;

        nextpe = (me + 1) % npes;

        /* Testing shmem_short_put, shmem_int_put, shmem_long_put,
           shmem_longdouble_put, shmem_longlong_put, shmem_double_put,
           shmem_float_put, shmem_putmem */
        shmem_barrier_all ();

        shmem_short_put (dest1, src1, N, nextpe);
        shmem_int_put (dest2, src2, N, nextpe);
        shmem_long_put (dest3, src3, N, nextpe);
        shmem_longdouble_put (dest4, src4, N, nextpe);
        shmem_longlong_put (dest5, src5, N, nextpe);
        shmem_double_put (dest6, src6, N, nextpe);
        shmem_float_put (dest7, src7, N, nextpe);
        shmem_putmem (dest8, src8, N * sizeof (char), nextpe);

        shmem_barrier_all ();

        if (me == 0) {
            /* PE 0 is written by the last PE in the ring */
            for (i = 0; i < N; i += 1) {
                if (dest1[i] != (npes - 1)) {
                    success1 = 1;
                }
                if (dest2[i] != (npes - 1)) {
                    success2 = 1;
                }
                if (dest3[i] != (npes - 1)) {
                    success3 = 1;
                }
                if (dest4[i] != (npes - 1)) {
                    success4 = 1;
                }
                if (dest5[i] != (npes - 1)) {
                    success5 = 1;
                }
                if (dest6[i] != (npes - 1)) {
                    success6 = 1;
                }
                if (dest7[i] != (npes - 1)) {
                    success7 = 1;
                }
                if (dest8[i] != (npes - 1)) {
                    success8 = 1;
                }
            }

            report_result ("shmem_short_put", success1);
            report_result ("shmem_int_put", success2);
            report_result ("shmem_long_put", success3);
            report_result ("shmem_longdouble_put", success4);
            report_result ("shmem_longlong_put", success5);
            report_result ("shmem_double_put", success6);
            report_result ("shmem_float_put", success7);
            report_result ("shmem_putmem", success8);
        }
        shmem_barrier_all ();

        /* Testing shmem_put32, shmem_put64, shmem_put128.
           NOTE(review): the buffers are picked by sizeof (int) only; this
           presumably relies on long being 64-bit and long double 128-bit
           (or short/int/long being 32/64/128-bit in the second branch) --
           confirm for the target ABI. */
        if (sizeof (int) == 4) {
            for (i = 0; i < N; i += 1) {
                dest2[i] = -9;
                dest3[i] = -9;
                dest4[i] = -9;
            }
            success2 = 0;
            success3 = 0;
            success4 = 0;

            shmem_barrier_all ();

            shmem_put32 (dest2, src2, N, nextpe);
            shmem_put64 (dest3, src3, N, nextpe);
            shmem_put128 (dest4, src4, N, nextpe);

            shmem_barrier_all ();

            if (me == 0) {
                for (i = 0; i < N; i += 1) {
                    if (dest2[i] != (npes - 1)) {
                        success2 = 1;
                    }
                    if (dest3[i] != (npes - 1)) {
                        success3 = 1;
                    }
                    if (dest4[i] != (npes - 1)) {
                        success4 = 1;
                    }
                }
                report_result ("shmem_put32", success2);
                report_result ("shmem_put64", success3);
                report_result ("shmem_put128", success4);
            }
        }
        else if (sizeof (int) == 8) {
            for (i = 0; i < N; i += 1) {
                dest1[i] = -9;
                dest2[i] = -9;
                dest3[i] = -9;
            }
            success1 = 0;
            success2 = 0;
            success3 = 0;

            shmem_barrier_all ();

            shmem_put32 (dest1, src1, N, nextpe);
            shmem_put64 (dest2, src2, N, nextpe);
            shmem_put128 (dest3, src3, N, nextpe);

            shmem_barrier_all ();

            if (me == 0) {
                for (i = 0; i < N; i += 1) {
                    if (dest1[i] != (npes - 1)) {
                        success1 = 1;
                    }
                    if (dest2[i] != (npes - 1)) {
                        success2 = 1;
                    }
                    if (dest3[i] != (npes - 1)) {
                        success3 = 1;
                    }
                }
                report_result ("shmem_put32", success1);
                report_result ("shmem_put64", success2);
                report_result ("shmem_put128", success3);
            }
        }

        /* Testing shmem_iput32, shmem_iput64, shmem_iput128.
           The source is read with stride 2, so only N / 2 elements can be
           taken from an N-element source array; asking for N would read
           past its end.  N / 2 is also the number of elements checked. */
        shmem_barrier_all ();
        if (sizeof (int) == 4) {
            for (i = 0; i < N; i += 1) {
                dest2[i] = -9;
                dest3[i] = -9;
                dest4[i] = -9;
            }
            success2 = 0;
            success3 = 0;
            success4 = 0;

            shmem_barrier_all ();

            shmem_iput32 (dest2, src2, 1, 2, N / 2, nextpe);
            shmem_iput64 (dest3, src3, 1, 2, N / 2, nextpe);
            shmem_iput128 (dest4, src4, 1, 2, N / 2, nextpe);

            shmem_barrier_all ();

            if (me == 0) {
                for (i = 0; i < N / 2; i += 1) {
                    if (dest2[i] != (npes - 1)) {
                        success2 = 1;
                    }
                    if (dest3[i] != (npes - 1)) {
                        success3 = 1;
                    }
                    if (dest4[i] != (npes - 1)) {
                        success4 = 1;
                    }
                }
                report_result ("shmem_iput32", success2);
                report_result ("shmem_iput64", success3);
                report_result ("shmem_iput128", success4);
            }
        }
        else if (sizeof (int) == 8) {
            for (i = 0; i < N; i += 1) {
                dest1[i] = -9;
                dest2[i] = -9;
                dest3[i] = -9;
            }
            success1 = 0;
            success2 = 0;
            success3 = 0;

            shmem_barrier_all ();

            shmem_iput32 (dest1, src1, 1, 2, N / 2, nextpe);
            shmem_iput64 (dest2, src2, 1, 2, N / 2, nextpe);
            shmem_iput128 (dest3, src3, 1, 2, N / 2, nextpe);

            shmem_barrier_all ();

            if (me == 0) {
                for (i = 0; i < N / 2; i += 1) {
                    if (dest1[i] != (npes - 1)) {
                        success1 = 1;
                    }
                    if (dest2[i] != (npes - 1)) {
                        success2 = 1;
                    }
                    if (dest3[i] != (npes - 1)) {
                        success3 = 1;
                    }
                }
                report_result ("shmem_iput32", success1);
                report_result ("shmem_iput64", success2);
                report_result ("shmem_iput128", success3);
            }
        }

        /* Testing shmem_short_iput, shmem_int_iput, shmem_long_iput,
           shmem_double_iput, shmem_float_iput */
        for (i = 0; i < N; i += 1) {
            dest1[i] = -9;
            dest2[i] = -9;
            dest3[i] = -9;
            dest6[i] = -9;
            dest7[i] = -9;
        }
        success1 = 0;
        success2 = 0;
        success3 = 0;
        success6 = 0;
        success7 = 0;

        shmem_barrier_all ();

        /* source stride 2: N / 2 elements fit in the N-element sources */
        shmem_short_iput (dest1, src1, 1, 2, N / 2, nextpe);
        shmem_int_iput (dest2, src2, 1, 2, N / 2, nextpe);
        shmem_long_iput (dest3, src3, 1, 2, N / 2, nextpe);
        shmem_double_iput (dest6, src6, 1, 2, N / 2, nextpe);
        shmem_float_iput (dest7, src7, 1, 2, N / 2, nextpe);

        shmem_barrier_all ();

        if (me == 0) {
            for (i = 0; i < N / 2; i += 1) {
                if (dest1[i] != (npes - 1)) {
                    success1 = 1;
                }
                if (dest2[i] != (npes - 1)) {
                    success2 = 1;
                }
                if (dest3[i] != (npes - 1)) {
                    success3 = 1;
                }
                if (dest6[i] != (npes - 1)) {
                    success6 = 1;
                }
                if (dest7[i] != (npes - 1)) {
                    success7 = 1;
                }
            }

            report_result ("shmem_short_iput", success1);
            report_result ("shmem_int_iput", success2);
            report_result ("shmem_long_iput", success3);
            report_result ("shmem_double_iput", success6);
            report_result ("shmem_float_iput", success7);
        }


        /* Testing shmem_double_p, shmem_float_p, shmem_int_p, shmem_long_p,
           shmem_short_p */
        shmem_barrier_all ();

        shmem_short_p (dest9, src9, nextpe);
        shmem_int_p (dest10, src10, nextpe);
        shmem_long_p (dest11, src11, nextpe);
        shmem_double_p (dest12, src12, nextpe);
        shmem_float_p (dest13, src13, nextpe);

        shmem_barrier_all ();

        if (me == 0) {
            report_result ("shmem_short_p", *dest9 != (npes - 1));
            report_result ("shmem_int_p", *dest10 != (npes - 1));
            report_result ("shmem_long_p", *dest11 != (npes - 1));
            report_result ("shmem_double_p", *dest12 != (npes - 1));
            report_result ("shmem_float_p", *dest13 != (npes - 1));
        }

        shmem_barrier_all ();

        shmem_free (dest1);
        shmem_free (dest2);
        shmem_free (dest3);
        shmem_free (dest4);
        shmem_free (dest5);
        shmem_free (dest6);
        shmem_free (dest7);
        shmem_free (dest8);
        shmem_free (dest9);
        shmem_free (dest10);
        shmem_free (dest11);
        shmem_free (dest12);
        shmem_free (dest13);

        free (src8);

    }
    else {
        printf ("Number of PEs must be > 1 to test shmem put, test skipped\n");
    }

    shmem_finalize ();

    return 0;
}
示例#14
0
/*
 * "DR" phase of a permutation exchange.
 *
 * 1. To every other processor that this one has a non-zero rdecnt for,
 *    clear rflag[i] and put the address of the rdata[i] slot into that
 *    processor's rptr[_INDEX].
 * 2. For every other processor with a non-zero ldecnt, wait until it has
 *    published an address in rptr[i], then put ldata[i] (ldecnt[i] bytes)
 *    to that address.
 * 3. After a fence, put 1 into rflag[_INDEX] on each of those processors.
 * 4. Copy the self-to-self portion locally with memcpy.
 *
 *   pm      - permutation map (rptr, rflag, per-processor counts)
 *   pd      - permutation data (eltsize, ldata, rdata); count is set to -1
 *   scatter - non-zero swaps which of the LCNT/RCNT counts is "local"
 *   dst/src - not used by this routine
 */
void _PERM_DR(const _permmap* const pm, _permdata* const pd, const int scatter, _array_fnc dst, _array_fnc src) {
  const int one = 1;
  const int eltsize = pd->eltsize;
  int i;
  int * const restrict ldecnt = (int*)_zmalloc(_PROCESSORS*sizeof(int), "perm dr lcnt");
  int * const restrict rdecnt = (int*)_zmalloc(_PROCESSORS*sizeof(int), "perm dr rcnt");
  char * const restrict * const restrict ldata = pd->ldata;
  char * const restrict * const restrict rdata = pd->rdata;
  char * restrict * const restrict rptr = pm->rptr;
  int * restrict rflag = pm->rflag;
  char* addr;

#ifdef _SHMEM_PERMUTE_DEBUG
  printf("DR start %d\n", _INDEX);
  fflush(stdout);
  sleep(5);
#endif
  /* byte counts to send (ldecnt) and to receive (rdecnt) per processor */
  for (i=0; i<_PROCESSORS; i++) {
    ldecnt[i] = (scatter ? _PERM_LCNT(pm, i) : _PERM_RCNT(pm, i)) * eltsize;
    rdecnt[i] = (scatter ? _PERM_RCNT(pm, i) : _PERM_LCNT(pm, i)) * eltsize;
  }
  /* Walk the ring upwards starting after _INDEX.  The step must be written
     as i + 1: "i = ... : i++" modifies i twice without sequencing, which is
     undefined behaviour and may never advance the loop. */
  for (i = (_INDEX == _PROCESSORS - 1) ? 0 : _INDEX+1;
       i != _INDEX;
       i = (i == _PROCESSORS - 1) ? 0 : i + 1) {
    if (rdecnt[i] > 0) {
#ifdef _SHMEM_PERMUTE_DEBUG
      printf("DR %d sending addr, %d, to %d\n", _INDEX, (int)&(rdata[i]), i);
      fflush(stdout);
      sleep(5);
#endif
      rflag[i] = 0;
      shmem_put((void*)&(rptr[_INDEX]), (void*)&(rdata[i]), 1, i);
    }
  }
  /* Walk the ring downwards starting before _INDEX (i - 1, same reason). */
  for (i = (_INDEX == 0) ? _PROCESSORS-1 : _INDEX-1;
       i != _INDEX;
       i = (i == 0) ? _PROCESSORS-1 : i - 1) {
    if (ldecnt[i] > 0) {
      /* block until processor i has published its receive address */
      shmem_wait((long*)&(rptr[i]), 0);
      addr = rptr[i];
      rptr[i] = 0;
#ifdef _SHMEM_PERMUTE_DEBUG
      printf("DR %d waiting addr, %d, from %d\n", _INDEX, (int)addr, i);
      fflush(stdout);
      sleep(5);
#endif
      shmem_putmem(addr, ldata[i], ldecnt[i], i);
    }
  }
  /* order the data puts before the flag puts that announce them */
  shmem_fence();
  for (i = (_INDEX == 0) ? _PROCESSORS-1 : _INDEX-1;
       i != _INDEX;
       i = (i == 0) ? _PROCESSORS-1 : i - 1) {
    if (ldecnt[i] > 0) {
#ifdef _SHMEM_PERMUTE_DEBUG
      printf("DR %d sending flag to %d\n", _INDEX, i);
      fflush(stdout);
      sleep(5);
#endif
      shmem_int_put(&(rflag[_INDEX]), &one, 1, i);
    }
  }
  /* the self-to-self part needs no communication */
  if (ldecnt[_INDEX] > 0) {
    memcpy(rdata[_INDEX], ldata[_INDEX], ldecnt[_INDEX]);
  }
  _zfree(ldecnt, "perm dr lcnt");
  _zfree(rdecnt, "perm dr rcnt");
  pd->count = -1;
}
示例#15
0
/*
 * Move one atom's data from rank `from` to rank `to` through the staging
 * buffer p2p_buf, using one-sided SHMEM puts and two flag arrays.
 *
 * Handshake, as implemented below:
 *   receiver: clears sync_recv_flag[from], atomically adds 1 to
 *             sync_send_flag[<receiver rank>] on `from` ("ready to receive"),
 *             then waits for sync_recv_flag[from] == 1.
 *   sender:   packs p2p_buf, waits for sync_send_flag[to] == 1, puts the
 *             buffer to `to`, resets its own sync_send_flag[to], and adds 1 to
 *             sync_recv_flag[<sender rank>] on `to` ("data available").
 *   receiver: unpacks p2p_buf and resets its own sync_recv_flag[from].
 *
 * Parameters:
 *   comm     - communicator; only comm.comm.rank is read here.
 *   from/to  - source and destination ranks.
 *   local_id - packed on the sender, overwritten on the receiver.
 *   atom     - packed on the sender; on the receiver its fields are
 *              overwritten and its arrays resized when the row counts differ.
 *   tag      - unused in this implementation.
 *
 * Ranks other than `from` and `to` do nothing.
 */
void communicateSingleAtomData(LSMSCommunication &comm, int from, int to, int &local_id, AtomData &atom, int tag)
{
  // The buffers used in this function are pre-allocated within initializeCommunication() with size
  //   sizeof(AtomData)+sizeof(Real)*(2*3*MAXPTS+2*MAXCORE)+sizeof(int)*3*2*MAXCORE+sizeof(int)
  const int ITER_MAX=1;
  int t,i;

  if(comm.comm.rank==from)
  {
    for(i=0;i<ITER_MAX;i++)
    {
      int pos=0;

      // Fixed-size header fields, in the exact order the receiver unpacks them.
      memcpy(&p2p_buf[pos],&local_id,int_size); pos+=int_size;
      memcpy(&p2p_buf[pos],&atom.jmt,int_size); pos+=int_size;
      memcpy(&p2p_buf[pos],&atom.jws,int_size); pos+=int_size;
      memcpy(&p2p_buf[pos],&atom.xstart,double_size); pos+=double_size;
      memcpy(&p2p_buf[pos],&atom.rmt,double_size); pos+=double_size;
      memcpy(&p2p_buf[pos],atom.header,80*char_size); pos+=80*char_size;
      memcpy(&p2p_buf[pos],&atom.alat,double_size); pos+=double_size;
      memcpy(&p2p_buf[pos],&atom.efermi,double_size); pos+=double_size;
      memcpy(&p2p_buf[pos],&atom.vdif,double_size); pos+=double_size;
      memcpy(&p2p_buf[pos],&atom.ztotss,double_size); pos+=double_size;
      memcpy(&p2p_buf[pos],&atom.zcorss,double_size); pos+=double_size;
      memcpy(&p2p_buf[pos],atom.evec,3*double_size); pos+=3*double_size;
      memcpy(&p2p_buf[pos],&atom.nspin,int_size); pos+=int_size;
      memcpy(&p2p_buf[pos],&atom.numc,int_size); pos+=int_size;

      // Potential-sized arrays: row count first, then 2*t values per array.
      t=atom.vr.n_row();
      memcpy(&p2p_buf[pos],&t,int_size); pos+=int_size;
      memcpy(&p2p_buf[pos],&atom.vr(0,0),2*t*double_size); pos+=2*t*double_size;
      memcpy(&p2p_buf[pos],&atom.rhotot(0,0),2*t*double_size); pos+=2*t*double_size;
      memcpy(&p2p_buf[pos],&atom.corden(0,0),2*t*double_size); pos+=2*t*double_size;

      // Core-state arrays: row count first, then 2*t values per array.
      t=atom.ec.n_row();
      memcpy(&p2p_buf[pos],&t,int_size); pos+=int_size;
      memcpy(&p2p_buf[pos],&atom.ec(0,0),2*t*double_size); pos+=2*t*double_size;
      memcpy(&p2p_buf[pos],&atom.nc(0,0),2*t*int_size); pos+=2*t*int_size;
      memcpy(&p2p_buf[pos],&atom.lc(0,0),2*t*int_size); pos+=2*t*int_size;
      memcpy(&p2p_buf[pos],&atom.kc(0,0),2*t*int_size); pos+=2*t*int_size;

      // Wait until the receiver has announced that it is ready.
      shmem_int_wait_until((sync_send_flag+to),_SHMEM_CMP_EQ,1);

      // NOTE(review): a fixed 1048576 bytes is transferred regardless of `pos`;
      // confirm p2p_buf is at least that large on both sides.
      shmem_putmem(p2p_buf, p2p_buf, 1048576, to);

      // The payload must be complete at `to` before the "data available" flag
      // is raised there; without this the receiver could unpack a stale buffer,
      // because a put and a following atomic are not ordered by themselves.
      shmem_quiet();

      shmem_int_add((sync_send_flag+to),-1,comm.comm.rank);
      shmem_int_add((sync_recv_flag+comm.comm.rank),1,to);
      shmem_quiet();
    }
  }

  if(comm.comm.rank==to)
  {
    for(i=0;i<ITER_MAX;i++)
    {
      int pos=0;

      // Announce readiness to the sender, then wait for its "data available" flag.
      sync_recv_flag[from]=0;
      shmem_int_add((sync_send_flag+comm.comm.rank),1,from);
      shmem_quiet();
      shmem_int_wait_until((sync_recv_flag+from),_SHMEM_CMP_EQ,1);

      // Unpack in the same order the sender packed.
      memcpy(&local_id,&p2p_buf[pos],int_size); pos+=int_size;
      memcpy(&atom.jmt,&p2p_buf[pos],int_size); pos+=int_size;
      memcpy(&atom.jws,&p2p_buf[pos],int_size); pos+=int_size;
      memcpy(&atom.xstart,&p2p_buf[pos],double_size); pos+=double_size;
      memcpy(&atom.rmt,&p2p_buf[pos],double_size); pos+=double_size;
      memcpy(atom.header,&p2p_buf[pos],80*char_size); pos+=80*char_size;
      memcpy(&atom.alat,&p2p_buf[pos],double_size); pos+=double_size;
      memcpy(&atom.efermi,&p2p_buf[pos],double_size); pos+=double_size;
      memcpy(&atom.vdif,&p2p_buf[pos],double_size); pos+=double_size;
      memcpy(&atom.ztotss,&p2p_buf[pos],double_size); pos+=double_size;
      memcpy(&atom.zcorss,&p2p_buf[pos],double_size); pos+=double_size;
      memcpy(atom.evec,&p2p_buf[pos],3*double_size); pos+=3*double_size;
      memcpy(&atom.nspin,&p2p_buf[pos],int_size); pos+=int_size;
      memcpy(&atom.numc,&p2p_buf[pos],int_size); pos+=int_size;

      memcpy(&t,&p2p_buf[pos],int_size); pos+=int_size;

      // Grow/shrink the local potential arrays to the sender's row count.
      if(t!=atom.vr.n_row()) atom.resizePotential(t);

      memcpy(&atom.vr(0,0),&p2p_buf[pos],2*t*double_size); pos+=2*t*double_size;
      memcpy(&atom.rhotot(0,0),&p2p_buf[pos],2*t*double_size); pos+=2*t*double_size;
      memcpy(&atom.corden(0,0),&p2p_buf[pos],2*t*double_size); pos+=2*t*double_size;
      memcpy(&t,&p2p_buf[pos],int_size); pos+=int_size;

      // Same for the core-state arrays (the sender took this count from atom.ec).
      if(t!=atom.nc.n_row()) atom.resizeCore(t);

      memcpy(&atom.ec(0,0),&p2p_buf[pos],2*t*double_size); pos+=2*t*double_size;
      memcpy(&atom.nc(0,0),&p2p_buf[pos],2*t*int_size); pos+=2*t*int_size;
      memcpy(&atom.lc(0,0),&p2p_buf[pos],2*t*int_size); pos+=2*t*int_size;
      memcpy(&atom.kc(0,0),&p2p_buf[pos],2*t*int_size); pos+=2*t*int_size;

      // Reset the local flag so the next transfer from `from` starts clean.
      shmem_int_add((sync_recv_flag+from),-1,comm.comm.rank);
      shmem_quiet();
    }
  }
}
示例#16
0
/*
 * Pairwise test of the SHMEM put family (putmem, put16, put32, put64, put128
 * and the generic shmem_put).
 *
 * PEs are paired (even, even+1).  For every put variant the even PE fills a
 * source buffer with (my_pe + j), puts it into the odd partner's target
 * buffer, calls shmem_quiet() and then sets the partner's flag with
 * shmem_int_put().  The odd PE waits for the flag to change from 0 and checks
 * every received element against (my_pe + j - 1), printing a FAIL line for
 * each mismatch.
 *
 * MAX_SIZE and shmalloc_error() are defined elsewhere in the file.
 * Returns 0; exits with status 1 when run on an odd number of PEs.
 */
int main(int argc, char **argv)
{
  int j;
  int my_pe,n_pes;
  int *flag,*one;
  size_t max_elements,max_elements_bytes;

  char *srce_char,*targ_char;
  short *srce_short,*targ_short;
  int *srce_int,*targ_int;
  long *srce_long,*targ_long;

  start_pes(0);
  my_pe = shmem_my_pe();
  n_pes = shmem_n_pes();
  flag = shmalloc((size_t) sizeof(int));
  one  = shmalloc((size_t) sizeof(int));
  /* both are dereferenced below, so check them like every other allocation */
  if((flag == NULL) || (one == NULL))
    shmalloc_error();
  *one  = 1;

/*  fail if trying to use odd number of processors  */
  if ( (n_pes % 2) != 0 ){
        fprintf(stderr, "FAIL - test requires even number of PEs\n");
        exit(1);
  }

  if(my_pe == 0)
    fprintf(stderr, "shmem_num_put(%s)\n", argv[0]);

/*  shmem_putmem test   */
  *flag = 0;
  max_elements = (size_t) (MAX_SIZE / sizeof(char));
  max_elements_bytes = (size_t) (sizeof(char)*max_elements);
  if(my_pe == 0)
    fprintf(stderr,"shmem_putmem            max_elements = %zu\n",max_elements);
  srce_char = shmalloc(max_elements_bytes);
  targ_char = shmalloc(max_elements_bytes);
  if((srce_char == NULL) || (targ_char == NULL))
    shmalloc_error();
  if ( (my_pe % 2) == 0 )
    for(j = 0; j < max_elements; j++)
      srce_char[j] = (char)(my_pe+j);
  else
    for(j = 0; j < max_elements; j++)
      targ_char[j] = (char)(my_pe+j);
  shmem_barrier_all();
  if ( (my_pe % 2) == 0 ) {
    shmem_putmem(targ_char,srce_char,max_elements,my_pe+1);
    /* complete the data put before the flag is raised on the partner */
    shmem_quiet();
    shmem_int_put(flag,one,(size_t)1,my_pe+1);
  } else {
    shmem_int_wait(flag,0);
    for(j = 0; j < max_elements; j++)
      if ( targ_char[j] != (char)(my_pe+j-1) )
        fprintf(stderr, "FAIL: PE [%d] targ_char[%d]=%d my_pe+j-1=%d\n",
                               my_pe,j,targ_char[j],my_pe+j-1);
  }
  shfree(srce_char);  shfree(targ_char);

/*  shmem_put16 test   */
  *flag = 0;
  max_elements = (size_t) (MAX_SIZE / sizeof(short));
  if(max_elements > 20000) max_elements=20000;
  max_elements_bytes = (size_t) (sizeof(short)*max_elements);
  if(my_pe == 0)
    fprintf(stderr,"shmem_put16             max_elements = %zu\n",max_elements);
  srce_short = shmalloc(max_elements_bytes);
  targ_short = shmalloc(max_elements_bytes);
  if((srce_short == NULL) || (targ_short == NULL))
    shmalloc_error();
  if ( (my_pe % 2) == 0 )
    for(j = 0; j < max_elements; j++)
      srce_short[j] = (short)(my_pe+j);
  else
    for(j = 0; j < max_elements; j++)
      targ_short[j] = (short)(my_pe+j);
  shmem_barrier_all();
  if ( (my_pe % 2) == 0 ) {
    shmem_put16(targ_short,srce_short,max_elements,my_pe+1);
    shmem_quiet();
    shmem_int_put(flag,one,(size_t)1,my_pe+1);
  } else {
    shmem_int_wait(flag,0);
    for(j = 0; j < max_elements; j++)
      if ( targ_short[j] != (short)(my_pe+j-1) )
        fprintf(stderr, "FAIL: PE [%d] targ_short[%d]=%d my_pe+j-1=%d\n",
                               my_pe,j,targ_short[j],my_pe+j-1);
  }
  shfree(srce_short);  shfree(targ_short);

/*  shmem_put32 test   */
  *flag = 0;
  max_elements = (size_t) (MAX_SIZE / sizeof(int));
  max_elements_bytes = (size_t) (sizeof(int)*max_elements);
  if(my_pe == 0)
    fprintf(stderr,"shmem_put32             max_elements = %zu\n",max_elements);
  srce_int = shmalloc(max_elements_bytes);
  targ_int = shmalloc(max_elements_bytes);
  if((srce_int == NULL) || (targ_int == NULL))
    shmalloc_error();
  if ( (my_pe % 2) == 0 )
    for(j = 0; j < max_elements; j++)
      srce_int[j] = (int)(my_pe+j);
  else
    for(j = 0; j < max_elements; j++)
      targ_int[j] = (int)(my_pe+j);
  shmem_barrier_all();
  if ( (my_pe % 2) == 0 ) {
    shmem_put32(targ_int,srce_int,max_elements,my_pe+1);
    shmem_quiet();
    shmem_int_put(flag,one,(size_t)1,my_pe+1);
  } else {
    shmem_int_wait(flag,0);
    for(j = 0; j < max_elements; j++)
      if ( targ_int[j] != (int)(my_pe+j-1) )
        fprintf(stderr, "FAIL: PE [%d] targ_int[%d]=%d my_pe+j-1=%d\n",
                               my_pe,j,targ_int[j],my_pe+j-1);
  }
  shfree(srce_int);  shfree(targ_int);

/*  shmem_put64 test   */
  *flag = 0;
  max_elements = (size_t) (MAX_SIZE / sizeof(long));
  max_elements_bytes = (size_t) (sizeof(long)*max_elements);
  if(my_pe == 0)
    fprintf(stderr,"shmem_put64             max_elements = %zu\n",max_elements);
  srce_long = shmalloc(max_elements_bytes);
  targ_long = shmalloc(max_elements_bytes);
  if((srce_long == NULL) || (targ_long == NULL))
    shmalloc_error();
  if ( (my_pe % 2) == 0 )
    for(j = 0; j < max_elements; j++)
      srce_long[j] = (long)(my_pe+j);
  else
    for(j = 0; j < max_elements; j++)
      targ_long[j] = (long)(my_pe+j);
  shmem_barrier_all();
  if ( (my_pe % 2) == 0 ) {
    shmem_put64(targ_long,srce_long,max_elements,my_pe+1);
    shmem_quiet();
    shmem_int_put(flag,one,(size_t)1,my_pe+1);
  } else {
    shmem_int_wait(flag,0);
    for(j = 0; j < max_elements; j++)
      if ( targ_long[j] != (long)(my_pe+j-1) )
        fprintf(stderr, "FAIL: PE [%d] targ_long[%d]=%ld my_pe+j-1=%d\n",
                               my_pe,j,targ_long[j],my_pe+j-1);
  }
  shfree(srce_long);  shfree(targ_long);

/*  shmem_put128 test   */
  *flag = 0;
  max_elements = (size_t) (MAX_SIZE / sizeof(long));
  if ( (max_elements % 2) != 0)
    max_elements = max_elements-1;
  max_elements_bytes = (size_t) (sizeof(long)*max_elements);
  /* from here on max_elements counts 128-bit items, i.e. pairs of longs */
  max_elements = max_elements/2;
  if(my_pe == 0)
    fprintf(stderr,"shmem_put128            max_elements = %zu\n",max_elements);
  srce_long = shmalloc(max_elements_bytes);
  targ_long = shmalloc(max_elements_bytes);
  if((srce_long == NULL) || (targ_long == NULL))
    shmalloc_error();
  if ( (my_pe % 2) == 0 )
    for(j = 0; j < 2*max_elements; j++)
      srce_long[j] = (long)(my_pe+j);
  else
    for(j = 0; j < 2*max_elements; j++)
      targ_long[j] = (long)(my_pe+j);
  shmem_barrier_all();
  if ( (my_pe % 2) == 0 ) {
    shmem_put128(targ_long,srce_long,max_elements,my_pe+1);
    shmem_quiet();
    shmem_int_put(flag,one,(size_t)1,my_pe+1);
  } else {
    shmem_int_wait(flag,0);
    for(j = 0; j < 2*max_elements; j++)
      if ( targ_long[j] != (long)(my_pe+j-1) )
        fprintf(stderr, "FAIL: PE [%d] targ_long[%d]=%ld my_pe+j-1=%d\n",
                               my_pe,j,targ_long[j],my_pe+j-1);
  }
  shfree(srce_long);  shfree(targ_long);

#ifdef SHMEM_C_GENERIC_32

/*  shmem_put (GENERIC 32) test   */
  *flag = 0;
  max_elements = (size_t) (MAX_SIZE / sizeof(int));
  max_elements_bytes = (size_t) (sizeof(int)*max_elements);
  if(my_pe == 0)
    fprintf(stderr,"shmem_put (GENERIC 32)  max_elements = %zu\n",max_elements);
  srce_int = shmalloc(max_elements_bytes);
  targ_int = shmalloc(max_elements_bytes);
  if((srce_int == NULL) || (targ_int == NULL))
    shmalloc_error();
  if ( (my_pe % 2) == 0 )
    for(j = 0; j < max_elements; j++)
      srce_int[j] = (int)(my_pe+j);
  else
    for(j = 0; j < max_elements; j++)
      targ_int[j] = (int)(my_pe+j);
  shmem_barrier_all();
  if ( (my_pe % 2) == 0 ) {
    shmem_put(targ_int,srce_int,max_elements,my_pe+1);
    shmem_quiet();
    shmem_int_put(flag,one,(size_t)1,my_pe+1);
  } else {
    shmem_int_wait(flag,0);
    for(j = 0; j < max_elements; j++)
      if ( targ_int[j] != (int)(my_pe+j-1) )
        fprintf(stderr, "FAIL: PE [%d] targ_int[%d]=%d my_pe+j-1=%d\n",
                               my_pe,j,targ_int[j],my_pe+j-1);
  }
  shfree(srce_int);  shfree(targ_int);

#else

/*  shmem_put (GENERIC 64) test   */
  *flag = 0;
  max_elements = (size_t) (MAX_SIZE / sizeof(long));
  max_elements_bytes = (size_t) (sizeof(long)*max_elements);
  if(my_pe == 0)
    fprintf(stderr,"shmem_put (GENERIC 64)  max_elements = %zu\n",max_elements);
  srce_long = shmalloc(max_elements_bytes);
  targ_long = shmalloc(max_elements_bytes);
  if((srce_long == NULL) || (targ_long == NULL))
    shmalloc_error();
  if ( (my_pe % 2) == 0 )
    for(j = 0; j < max_elements; j++)
      srce_long[j] = (long)(my_pe+j);
  else
    for(j = 0; j < max_elements; j++)
      targ_long[j] = (long)(my_pe+j);
  shmem_barrier_all();
  if ( (my_pe % 2) == 0 ) {
    shmem_put(targ_long,srce_long,max_elements,my_pe+1);
    shmem_quiet();
    shmem_int_put(flag,one,(size_t)1,my_pe+1);
  } else {
    shmem_int_wait(flag,0);
    for(j = 0; j < max_elements; j++)
      if ( targ_long[j] != (long)(my_pe+j-1) )
        fprintf(stderr, "FAIL: PE [%d] targ_long[%d]=%ld my_pe+j-1=%d\n",
                               my_pe,j,targ_long[j],my_pe+j-1);
  }
  shfree(srce_long);  shfree(targ_long);

#endif

#ifdef NEEDS_FINALIZE
  shmem_finalize();
#endif
  return 0;
}
示例#17
0
/*
 * Pointer-argument wrapper around shmem_putmem(): the byte count and the
 * destination PE arrive by reference and are dereferenced before forwarding.
 * The exported symbol name is produced by the FORTRANIFY macro (defined
 * elsewhere) -- presumably the Fortran-callable entry point.
 */
void
FORTRANIFY (shmem_putmem) (void *target, const void *source, int *size, int *pe)
{
    const int nbytes = *size;
    const int dest_pe = *pe;

    shmem_putmem (target, source, nbytes, dest_pe);
}
示例#18
0
文件: stencil.c 项目: kempj/Kernels
int main(int argc, char ** argv) {
 
  int    Num_procs;       /* number of ranks                                     */
  int    Num_procsx, Num_procsy; /* number of ranks in each coord direction      */
  int    my_ID;           /* SHMEM rank                                          */
  int    my_IDx, my_IDy;  /* coordinates of rank in rank grid                    */
  int    right_nbr;       /* global rank of right neighboring tile               */
  int    left_nbr;        /* global rank of left neighboring tile                */
  int    top_nbr;         /* global rank of top neighboring tile                 */
  int    bottom_nbr;      /* global rank of bottom neighboring tile              */
  DTYPE *top_buf_out;     /* communication buffer                                */
  DTYPE *top_buf_in[2];   /*       "         "                                   */
  DTYPE *bottom_buf_out;  /*       "         "                                   */
  DTYPE *bottom_buf_in[2];/*       "         "                                   */
  DTYPE *right_buf_out;   /*       "         "                                   */
  DTYPE *right_buf_in[2]; /*       "         "                                   */
  DTYPE *left_buf_out;    /*       "         "                                   */
  DTYPE *left_buf_in[2];  /*       "         "                                   */
  int    root = 0;
  int    n, width, height;/* linear global and local grid dimension              */
  int    i, j, ii, jj, kk, it, jt, iter, leftover;  /* dummies                   */
  int    istart, iend;    /* bounds of grid tile assigned to calling rank        */
  int    jstart, jend;    /* bounds of grid tile assigned to calling rank        */
  DTYPE  reference_norm;
  DTYPE  f_active_points; /* interior of grid with respect to stencil            */
  int    stencil_size;    /* number of points in the stencil                     */
  DTYPE  flops;           /* floating point ops per iteration                    */
  int    iterations;      /* number of times to run the algorithm                */
  double avgtime,         /* timing parameters                                   */
         *local_stencil_time, *stencil_time; 
  DTYPE  * RESTRICT in;   /* input grid values                                   */
  DTYPE  * RESTRICT out;  /* output grid values                                  */
  long   total_length_in; /* total required length to store input array          */
  long   total_length_out;/* total required length to store output array         */
  int    error=0;         /* error flag                                          */
  DTYPE  weight[2*RADIUS+1][2*RADIUS+1]; /* weights of points in the stencil     */
  int    *arguments;      /* command line parameters                             */
  int    count_case=4;    /* number of neighbors of a rank                       */
  long   *pSync_bcast;    /* work space for collectives                          */
  long   *pSync_reduce;   /* work space for collectives                          */
  double *pWrk_time;      /* work space for collectives                          */
  DTYPE  *pWrk_norm;      /* work space for collectives                          */
  int    *iterflag;       /* synchronization flags                               */
  int    sw;              /* double buffering switch                             */
  DTYPE  *local_norm, *norm; /* local and global error norms                     */

  /*******************************************************************************
  ** Initialize the SHMEM environment
  ********************************************************************************/
  prk_shmem_init();

  my_ID=prk_shmem_my_pe();
  Num_procs=prk_shmem_n_pes();

  pSync_bcast        = (long *)   prk_shmem_malloc(PRK_SHMEM_BCAST_SYNC_SIZE*sizeof(long));
  pSync_reduce       = (long *)   prk_shmem_malloc(PRK_SHMEM_REDUCE_SYNC_SIZE*sizeof(long));
  pWrk_time          = (double *) prk_shmem_malloc(PRK_SHMEM_REDUCE_MIN_WRKDATA_SIZE*sizeof(double));
  pWrk_norm          = (DTYPE *)  prk_shmem_malloc(PRK_SHMEM_REDUCE_MIN_WRKDATA_SIZE*sizeof(DTYPE));
  local_stencil_time = (double *) prk_shmem_malloc(sizeof(double));
  stencil_time       = (double *) prk_shmem_malloc(sizeof(double));
  local_norm         = (DTYPE *)  prk_shmem_malloc(sizeof(DTYPE));
  norm               = (DTYPE *)  prk_shmem_malloc(sizeof(DTYPE));
  iterflag           = (int *)    prk_shmem_malloc(2*sizeof(int));
  if (!(pSync_bcast && pSync_reduce && pWrk_time && pWrk_norm && iterflag &&
	local_stencil_time && stencil_time && local_norm && norm))
  {
    printf("Could not allocate scalar variables on rank %d\n", my_ID);
    error = 1;
  }
  bail_out(error);

  for(i=0;i<PRK_SHMEM_BCAST_SYNC_SIZE;i++)
    pSync_bcast[i]=PRK_SHMEM_SYNC_VALUE;

  for(i=0;i<PRK_SHMEM_REDUCE_SYNC_SIZE;i++)
    pSync_reduce[i]=PRK_SHMEM_SYNC_VALUE;

  arguments=(int*)prk_shmem_malloc(2*sizeof(int));
 
  /*******************************************************************************
  ** process, test, and broadcast input parameters    
  ********************************************************************************/
 
  if (my_ID == root) {
#ifndef STAR
    printf("ERROR: Compact stencil not supported\n");
    error = 1;
    goto ENDOFTESTS;
#endif
      
    if (argc != 3){
      printf("Usage: %s <# iterations> <array dimension> \n", 
             *argv);
      error = 1;
      goto ENDOFTESTS;
    }
 
    iterations  = atoi(*++argv); 
    arguments[0]=iterations;

    if (iterations < 1){
      printf("ERROR: iterations must be >= 1 : %d \n",iterations);
      error = 1;
      goto ENDOFTESTS;  
    }
 
    n  = atoi(*++argv);
    arguments[1]=n;
    long nsquare = (long)n * (long)n;

    if (nsquare < Num_procs){ 
      printf("ERROR: grid size must be at least # ranks: %ld\n", nsquare);
      error = 1;
      goto ENDOFTESTS;
    }
 
    if (RADIUS < 0) {
      printf("ERROR: Stencil radius %d should be non-negative\n", RADIUS);
      error = 1;
      goto ENDOFTESTS;  
    }
 
    if (2*RADIUS +1 > n) {
      printf("ERROR: Stencil radius %d exceeds grid size %d\n", RADIUS, n);
      error = 1;
      goto ENDOFTESTS;  
    }
 
    ENDOFTESTS:;  
  }
  bail_out(error);
 
  /* determine best way to create a 2D grid of ranks (closest to square, for 
     best surface/volume ratio); we do this brute force for now
  */
  for (Num_procsx=(int) (sqrt(Num_procs+1)); Num_procsx>0; Num_procsx--) {
    if (!(Num_procs%Num_procsx)) {
      Num_procsy = Num_procs/Num_procsx;
      break;
    }
  }      
  my_IDx = my_ID%Num_procsx;
  my_IDy = my_ID/Num_procsx;
  /* compute neighbors; don't worry about dropping off the edges of the grid */
  right_nbr  = my_ID+1;
  left_nbr   = my_ID-1;
  top_nbr    = my_ID+Num_procsx;
  bottom_nbr = my_ID-Num_procsx;

  iterflag[0] = iterflag[1] = 0;

  if(my_IDx==0)            count_case--;
  if(my_IDx==Num_procsx-1) count_case--;
  if(my_IDy==0)            count_case--;
  if(my_IDy==Num_procsy-1) count_case--;
 
  if (my_ID == root) {
    printf("Parallel Research Kernels version %s\n", PRKVERSION);
    printf("SHMEM stencil execution on 2D grid\n");
    printf("Number of ranks        = %d\n", Num_procs);
    printf("Grid size              = %d\n", n);
    printf("Radius of stencil      = %d\n", RADIUS);
    printf("Tiles in x/y-direction = %d/%d\n", Num_procsx, Num_procsy);
    printf("Type of stencil        = star\n");
#ifdef DOUBLE
    printf("Data type              = double precision\n");
#else
    printf("Data type              = single precision\n");
#endif
#if LOOPGEN
    printf("Script used to expand stencil loop body\n");
#else
    printf("Compact representation of stencil loop body\n");
#endif
#if SPLITFENCE
    printf("Split fence            = ON\n");
#else
    printf("Split fence            = OFF\n");
#endif
    printf("Number of iterations   = %d\n", iterations);
  }

  shmem_barrier_all();
 
  shmem_broadcast32(&arguments[0], &arguments[0], 2, root, 0, 0, Num_procs, pSync_bcast);

  iterations=arguments[0];
  n=arguments[1];

  shmem_barrier_all();
  prk_shmem_free(arguments);
 
  /* compute amount of space required for input and solution arrays             */
  
  width = n/Num_procsx;
  leftover = n%Num_procsx;
  if (my_IDx<leftover) {
    istart = (width+1) * my_IDx; 
    iend = istart + width + 1;
  }
  else {
    istart = (width+1) * leftover + width * (my_IDx-leftover);
    iend = istart + width;
  }
  
  width = iend - istart + 1;
  if (width == 0) {
    printf("ERROR: rank %d has no work to do\n", my_ID);
    error = 1;
  }
  bail_out(error);
 
  height = n/Num_procsy;
  leftover = n%Num_procsy;
  if (my_IDy<leftover) {
    jstart = (height+1) * my_IDy; 
    jend = jstart + height + 1;
  }
  else {
    jstart = (height+1) * leftover + height * (my_IDy-leftover);
    jend = jstart + height;
  }
  
  height = jend - jstart + 1;
  if (height == 0) {
    printf("ERROR: rank %d has no work to do\n", my_ID);
    error = 1;
  }
  bail_out(error);
 
  if (width < RADIUS || height < RADIUS) {
    printf("ERROR: rank %d has work tile smaller then stencil radius\n",
           my_ID);
    error = 1;
  }
  bail_out(error);
 
  total_length_in = (width+2*RADIUS);
  total_length_in *= (height+2*RADIUS);
  total_length_in *= sizeof(DTYPE);

  total_length_out = width;
  total_length_out *= height;
  total_length_out *= sizeof(DTYPE);
 
  in  = (DTYPE *) malloc(total_length_in);
  out = (DTYPE *) malloc(total_length_out);
  if (!in || !out) {
    printf("ERROR: rank %d could not allocate space for input/output array\n",
            my_ID);
    error = 1;
  }
  bail_out(error);
 
  /* fill the stencil weights to reflect a discrete divergence operator         */
  for (jj=-RADIUS; jj<=RADIUS; jj++) for (ii=-RADIUS; ii<=RADIUS; ii++)
    WEIGHT(ii,jj) = (DTYPE) 0.0;
  stencil_size = 4*RADIUS+1;

  for (ii=1; ii<=RADIUS; ii++) {
    WEIGHT(0, ii) = WEIGHT( ii,0) =  (DTYPE) (1.0/(2.0*ii*RADIUS));
    WEIGHT(0,-ii) = WEIGHT(-ii,0) = -(DTYPE) (1.0/(2.0*ii*RADIUS));
  }
 
  norm[0] = (DTYPE) 0.0;
  f_active_points = (DTYPE) (n-2*RADIUS)*(DTYPE) (n-2*RADIUS);

  /* intialize the input and output arrays                                     */
  for (j=jstart; j<jend; j++) for (i=istart; i<iend; i++) {
    IN(i,j)  = COEFX*i+COEFY*j;
    OUT(i,j) = (DTYPE)0.0;
  }

  /* allocate communication buffers for halo values                            */
  top_buf_out=(DTYPE*)malloc(2*sizeof(DTYPE)*RADIUS*width);
  if (!top_buf_out) {
    printf("ERROR: Rank %d could not allocate output comm buffers for y-direction\n", my_ID);
    error = 1;
  }
  bail_out(error);
  bottom_buf_out = top_buf_out+RADIUS*width;

  top_buf_in[0]=(DTYPE*)prk_shmem_malloc(4*sizeof(DTYPE)*RADIUS*width);
  if(!top_buf_in)
  {
    printf("ERROR: Rank %d could not allocate input comm buffers for y-direction\n", my_ID);
    error=1;
  }
  bail_out(error);
  top_buf_in[1]    = top_buf_in[0]    + RADIUS*width;
  bottom_buf_in[0] = top_buf_in[1]    + RADIUS*width;
  bottom_buf_in[1] = bottom_buf_in[0] + RADIUS*width;
 
  right_buf_out=(DTYPE*)malloc(2*sizeof(DTYPE)*RADIUS*height);
  if (!right_buf_out) {
    printf("ERROR: Rank %d could not allocate output comm buffers for x-direction\n", my_ID);
    error = 1;
  }
  bail_out(error);
  left_buf_out=right_buf_out+RADIUS*height;

  right_buf_in[0]=(DTYPE*)prk_shmem_malloc(4*sizeof(DTYPE)*RADIUS*height);
  if(!right_buf_in)
  {
    printf("ERROR: Rank %d could not allocate input comm buffers for x-dimension\n", my_ID);
    error=1;
  }
  bail_out(error);
  right_buf_in[1] = right_buf_in[0] + RADIUS*height;
  left_buf_in[0]  = right_buf_in[1] + RADIUS*height;
  left_buf_in[1]  = left_buf_in[0]  + RADIUS*height;

  /* make sure all symmetric heaps are allocated before being used  */
  shmem_barrier_all();

  for (iter = 0; iter<=iterations; iter++){

    /* start timer after a warmup iteration */
    if (iter == 1) { 
      shmem_barrier_all();
      local_stencil_time[0] = wtime();
    }
    /* sw determines which incoming buffer to select */
    sw = iter%2;

    /* need to fetch ghost point data from neighbors */

    if (my_IDy < Num_procsy-1) {
      for (kk=0,j=jend-RADIUS; j<=jend-1; j++) for (i=istart; i<=iend; i++) {
          top_buf_out[kk++]= IN(i,j);
      }
      shmem_putmem(bottom_buf_in[sw], top_buf_out, RADIUS*width*sizeof(DTYPE), top_nbr);
#if SPLITFENCE
      shmem_fence();
      shmem_int_inc(&iterflag[sw], top_nbr);
#endif
    }
    if (my_IDy > 0) {
      for (kk=0,j=jstart; j<=jstart+RADIUS-1; j++) for (i=istart; i<=iend; i++) {
          bottom_buf_out[kk++]= IN(i,j);
      }
      shmem_putmem(top_buf_in[sw], bottom_buf_out, RADIUS*width*sizeof(DTYPE), bottom_nbr);
#if SPLITFENCE
      shmem_fence();
      shmem_int_inc(&iterflag[sw], bottom_nbr);
#endif
    }

    if(my_IDx < Num_procsx-1) {
      for(kk=0,j=jstart;j<=jend;j++) for(i=iend-RADIUS;i<=iend-1;i++) {
	right_buf_out[kk++]=IN(i,j);
      }
      shmem_putmem(left_buf_in[sw], right_buf_out, RADIUS*height*sizeof(DTYPE), right_nbr);
#if SPLITFENCE
      shmem_fence();
      shmem_int_inc(&iterflag[sw], right_nbr);
#endif
    }

    if(my_IDx>0) {
      for(kk=0,j=jstart;j<=jend;j++) for(i=istart;i<=istart+RADIUS-1;i++) {
	left_buf_out[kk++]=IN(i,j);
      }
      shmem_putmem(right_buf_in[sw], left_buf_out, RADIUS*height*sizeof(DTYPE), left_nbr);
#if SPLITFENCE
      shmem_fence();
      shmem_int_inc(&iterflag[sw], left_nbr);
#endif
    }

#if SPLITFENCE == 0
    shmem_fence();
    if(my_IDy<Num_procsy-1) shmem_int_inc(&iterflag[sw], top_nbr);
    if(my_IDy>0)            shmem_int_inc(&iterflag[sw], bottom_nbr);
    if(my_IDx<Num_procsx-1) shmem_int_inc(&iterflag[sw], right_nbr);
    if(my_IDx>0)            shmem_int_inc(&iterflag[sw], left_nbr);
#endif

    shmem_int_wait_until(&iterflag[sw], SHMEM_CMP_EQ, count_case*(iter/2+1));

    if (my_IDy < Num_procsy-1) {
      for (kk=0,j=jend; j<=jend+RADIUS-1; j++) for (i=istart; i<=iend; i++) {
          IN(i,j) = top_buf_in[sw][kk++];
      }      
    }
    if (my_IDy > 0) {
      for (kk=0,j=jstart-RADIUS; j<=jstart-1; j++) for (i=istart; i<=iend; i++) {
          IN(i,j) = bottom_buf_in[sw][kk++];
      }      
    }

    if (my_IDx < Num_procsx-1) {
      for (kk=0,j=jstart; j<=jend; j++) for (i=iend; i<=iend+RADIUS-1; i++) {
          IN(i,j) = right_buf_in[sw][kk++];
      }      
    }
    if (my_IDx > 0) {
      for (kk=0,j=jstart; j<=jend; j++) for (i=istart-RADIUS; i<=istart-1; i++) {
          IN(i,j) = left_buf_in[sw][kk++];
      }      
    }
 
    /* Apply the stencil operator */
    for (j=MAX(jstart,RADIUS); j<=MIN(n-RADIUS-1,jend); j++) {
      for (i=MAX(istart,RADIUS); i<=MIN(n-RADIUS-1,iend); i++) {
        #if LOOPGEN
          #include "loop_body_star.incl"
        #else
          for (jj=-RADIUS; jj<=RADIUS; jj++) OUT(i,j) += WEIGHT(0,jj)*IN(i,j+jj);
          for (ii=-RADIUS; ii<0; ii++)       OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j);
          for (ii=1; ii<=RADIUS; ii++)       OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j);
        #endif
      }
    }
 
    /* add constant to solution to force refresh of neighbor data, if any */
    for (j=jstart; j<jend; j++) for (i=istart; i<iend; i++) IN(i,j)+= 1.0;
 
  }
 
  local_stencil_time[0] = wtime() - local_stencil_time[0];

  shmem_barrier_all();

  shmem_double_max_to_all(&stencil_time[0], &local_stencil_time[0], 1, 0, 0,
                          Num_procs, pWrk_time, pSync_reduce);
  
  /* compute L1 norm in parallel                                                */
  local_norm[0] = (DTYPE) 0.0;
  for (j=MAX(jstart,RADIUS); j<MIN(n-RADIUS,jend); j++) {
    for (i=MAX(istart,RADIUS); i<MIN(n-RADIUS,iend); i++) {
      local_norm[0] += (DTYPE)ABS(OUT(i,j));
    }
  }

  shmem_barrier_all();
 
#ifdef DOUBLE
  shmem_double_sum_to_all(&norm[0], &local_norm[0], 1, 0, 0, Num_procs, pWrk_norm, pSync_reduce);
#else
  shmem_float_sum_to_all(&norm[0], &local_norm[0], 1, 0, 0, Num_procs, pWrk_norm, pSync_reduce);
#endif
 
  /*******************************************************************************
  ** Analyze and output results.
  ********************************************************************************/
 
/* verify correctness                                                            */
  if (my_ID == root) {
    norm[0] /= f_active_points;
    if (RADIUS > 0) {
      reference_norm = (DTYPE) (iterations+1) * (COEFX + COEFY);
    }
    else {
      reference_norm = (DTYPE) 0.0;
    }
    if (ABS(norm[0]-reference_norm) > EPSILON) {
      printf("ERROR: L1 norm = "FSTR", Reference L1 norm = "FSTR"\n",
             norm[0], reference_norm);
      error = 1;
    }
    else {
      printf("Solution validates\n");
#ifdef VERBOSE
      printf("Reference L1 norm = "FSTR", L1 norm = "FSTR"\n", 
             reference_norm, norm[0]);
#endif
    }
  }
  bail_out(error);
 
  if (my_ID == root) {
    /* flops/stencil: 2 flops (fma) for each point in the stencil, 
       plus one flop for the update of the input of the array        */
    flops = (DTYPE) (2*stencil_size+1) * f_active_points;
    avgtime = stencil_time[0]/iterations;
    printf("Rate (MFlops/s): "FSTR"  Avg time (s): %lf\n",
           1.0E-06 * flops/avgtime, avgtime);
  }
 

  prk_shmem_free(top_buf_in);
  prk_shmem_free(right_buf_in);
  free(top_buf_out);
  free(right_buf_out);

  prk_shmem_free(pSync_bcast);
  prk_shmem_free(pSync_reduce);
  prk_shmem_free(pWrk_time);
  prk_shmem_free(pWrk_norm);

  prk_shmem_finalize();

  exit(EXIT_SUCCESS);
}
示例#19
0
/*
 * Counting put: copy `len` bytes from `source` to `target` on PE `pe`, then
 * add 1 to the counter `ct` on that same PE.
 *
 * oshmpi_remote_sync_pe(pe) runs between the two steps -- presumably so the
 * payload is remotely complete before the counter changes (helper is defined
 * elsewhere; confirm its exact guarantee).
 */
void
shmemx_putmem_ct(shmemx_ct_t ct, void *target, const void *source, size_t len, int pe)
{
    /* step 1: payload */
    shmem_putmem(target, source, len, pe);

    /* step 2: synchronise with the destination PE */
    oshmpi_remote_sync_pe(pe);

    /* step 3: bump the completion counter on the destination PE */
    shmem_long_add(ct, 1, pe);
}