/*
 * Exercise shmem_int_add() against slot T of the `target` array.
 *
 *   me          this PE's number
 *   iterations  number of fenced atomic adds PE 1 issues; one more
 *               un-fenced add follows, so iterations+1 adds in total
 *   T           index of the `target` slot used for this test
 *
 * PE 0 runs pre_op_check() first.  Every PE then zeroes its own
 * target[T] and joins a barrier.  PE 1 adds 1 to target[T] on PE 0
 * (iterations + 1) times; every other PE calls wait_until() with the
 * expected total (iterations + 1).
 * NOTE(review): wait_until() and pre_op_check() are defined elsewhere;
 * the meaning of their trailing 0 argument should be confirmed there.
 */
static inline void
atomic_add(int me, int iterations, int T)
{
    int done;

    if (me == 0)
        pre_op_check(__func__, target[T], iterations, 0);

    /* Start from a known value on every PE before anyone adds. */
    target[T] = 0;
    shmem_barrier_all();

    if (me != 1) {
        wait_until(&target[T], (iterations+1), 0);
    } else {
        done = 0;
        while (done < iterations) {
            shmem_int_add(&target[T], 1, 0);
            shmem_fence();
            done++;
        }

        /* Final add, issued without a trailing fence. */
        shmem_int_add(&target[T], 1, 0);

        if (debug)
            printf("PE 1 done with operation\n");
    }

    if (verbose && me == 1)
        printf("SHMEM %s finished\n", __func__);
}
double benchmark_add (struct pe_vars v, union data_types *buffer, unsigned long iterations) { int64_t begin, end; int i; static double rate = 0, sum_rate = 0, lat = 0, sum_lat = 0; /* * Touch memory */ memset(buffer, CHAR_MAX * drand48(), sizeof(union data_types [ITERATIONS])); shmem_barrier_all(); if (v.me < v.pairs) { int value = INT_MAX * drand48(); begin = TIME(); for (i = 0; i < iterations; i++) { shmem_int_add(&(buffer[i].int_type), value, v.nxtpe); } end = TIME(); rate = ((double)iterations * 1e6) / (end - begin); lat = (end - begin) / (double)iterations; } shmem_double_sum_to_all(&sum_rate, &rate, 1, 0, 0, v.npes, pwrk1, psync1); shmem_double_sum_to_all(&sum_lat, &lat, 1, 0, 0, v.npes, pwrk2, psync2); print_operation_rate(v.me, "shmem_int_add", sum_rate/1e6, sum_lat/v.pairs); return 0; }
/*
 * Every PE other than 0 atomically adds its own PE number to `counter`
 * on PE 0; after a barrier, PE 0 prints the accumulated value.
 */
int
main (int argc, char *argv[])
{
    int my_rank;

    start_pes (0);
    my_rank = _my_pe ();

    /* PE 0 is the accumulation target and contributes nothing itself. */
    if (my_rank > 0)
        shmem_int_add (&counter, my_rank, 0);

    /* All PEs synchronize here before PE 0 reads counter. */
    shmem_barrier_all ();

    if (my_rank == 0)
        printf ("counter = %d\n", counter);

    return 0;
}
/*
 * Pointer-argument wrapper around shmem_int_add(): `value` and `pe`
 * are received by address and dereferenced before the call, while
 * `target` is forwarded unchanged.
 * NOTE(review): FORTRANIFY is defined elsewhere; presumably it mangles
 * the symbol name for Fortran callers -- confirm.
 */
void
FORTRANIFY (shmem_int4_add) (int *target, int *value, int *pe)
{
    const int addend = *value;
    const int dest_pe = *pe;

    shmem_int_add (target, addend, dest_pe);
}
void communicateSingleAtomData(LSMSCommunication &comm, int from, int to, int &local_id, AtomData &atom, int tag) { //The buffers used in this func are pre-allocated within initializeCommunication() of size 's' below //int s=sizeof(AtomData)+sizeof(Real)*(2*3*MAXPTS+2*MAXCORE)+sizeof(int)*3*2*MAXCORE+sizeof(int); // 304 bytes transferred in each of the ITER_MAX iterations const int maxPts=MAXPTS; const int maxCore=MAXCORE; int t,i; static int count=0; const int ITER_MAX=1; int sec_id; if(comm.comm.rank==from) { for (i=0;i<ITER_MAX;i++){ int pos=0; memcpy(&p2p_buf[pos],&local_id,int_size); pos+=int_size; memcpy(&p2p_buf[pos],&atom.jmt,int_size); pos+=int_size; memcpy(&p2p_buf[pos],&atom.jws,int_size); pos+=int_size; memcpy(&p2p_buf[pos],&atom.xstart,double_size); pos+=double_size; memcpy(&p2p_buf[pos],&atom.rmt,double_size); pos+=double_size; memcpy(&p2p_buf[pos],atom.header,80*char_size); pos+=80*char_size; memcpy(&p2p_buf[pos],&atom.alat,double_size); pos+=double_size; memcpy(&p2p_buf[pos],&atom.efermi,double_size); pos+=double_size; memcpy(&p2p_buf[pos],&atom.vdif,double_size); pos+=double_size; memcpy(&p2p_buf[pos],&atom.ztotss,double_size); pos+=double_size; memcpy(&p2p_buf[pos],&atom.zcorss,double_size); pos+=double_size; memcpy(&p2p_buf[pos],atom.evec,3*double_size); pos+=3*double_size; memcpy(&p2p_buf[pos],&atom.nspin,int_size); pos+=int_size; memcpy(&p2p_buf[pos],&atom.numc,int_size); pos+=int_size; t=atom.vr.n_row(); memcpy(&p2p_buf[pos],&t,int_size); pos+=int_size; memcpy(&p2p_buf[pos],&atom.vr(0,0),2*t*double_size); pos+=2*t*double_size; memcpy(&p2p_buf[pos],&atom.rhotot(0,0),2*t*double_size); pos+=2*t*double_size; memcpy(&p2p_buf[pos],&atom.corden(0,0),2*t*double_size); pos+=2*t*double_size; t=atom.ec.n_row(); memcpy(&p2p_buf[pos],&t,int_size); pos+=int_size; memcpy(&p2p_buf[pos],&atom.ec(0,0),2*t*double_size); pos+=2*t*double_size; memcpy(&p2p_buf[pos],&atom.nc(0,0),2*t*int_size); pos+=2*t*int_size; memcpy(&p2p_buf[pos],&atom.lc(0,0),2*t*int_size); 
pos+=2*t*int_size; memcpy(&p2p_buf[pos],&atom.kc(0,0),2*t*int_size); pos+=2*t*int_size; shmem_int_wait_until((sync_send_flag+to),_SHMEM_CMP_EQ,1); shmem_putmem(p2p_buf, p2p_buf, 1048576, to); shmem_int_add((sync_send_flag+to),-1,comm.comm.rank); shmem_int_add((sync_recv_flag+comm.comm.rank),1,to); shmem_quiet(); }// end of false for loop } if(comm.comm.rank==to) { for(i=0;i<ITER_MAX;i++) { int pos=0; sync_recv_flag[from]=0; shmem_int_add((sync_send_flag+comm.comm.rank),1,from); shmem_quiet(); shmem_int_wait_until((sync_recv_flag+from),_SHMEM_CMP_EQ,1); memcpy(&local_id,&p2p_buf[pos],int_size); pos+=int_size; memcpy(&atom.jmt,&p2p_buf[pos],int_size); pos+=int_size; memcpy(&atom.jws,&p2p_buf[pos],int_size); pos+=int_size; memcpy(&atom.xstart,&p2p_buf[pos],double_size); pos+=double_size; memcpy(&atom.rmt,&p2p_buf[pos],double_size); pos+=double_size; memcpy(atom.header,&p2p_buf[pos],80*char_size); pos+=80*char_size; memcpy(&atom.alat,&p2p_buf[pos],double_size); pos+=double_size; memcpy(&atom.efermi,&p2p_buf[pos],double_size); pos+=double_size; memcpy(&atom.vdif,&p2p_buf[pos],double_size); pos+=double_size; memcpy(&atom.ztotss,&p2p_buf[pos],double_size); pos+=double_size; memcpy(&atom.zcorss,&p2p_buf[pos],double_size); pos+=double_size; memcpy(atom.evec,&p2p_buf[pos],3*double_size); pos+=3*double_size; memcpy(&atom.nspin,&p2p_buf[pos],int_size); pos+=int_size; memcpy(&atom.numc,&p2p_buf[pos],int_size); pos+=int_size; memcpy(&t,&p2p_buf[pos],int_size); pos+=int_size; if(t!=atom.vr.n_row()) atom.resizePotential(t); memcpy(&atom.vr(0,0),&p2p_buf[pos],2*t*double_size); pos+=2*t*double_size; memcpy(&atom.rhotot(0,0),&p2p_buf[pos],2*t*double_size); pos+=2*t*double_size; memcpy(&atom.corden(0,0),&p2p_buf[pos],2*t*double_size); pos+=2*t*double_size; memcpy(&t,&p2p_buf[pos],int_size); pos+=int_size; if(t!=atom.nc.n_row()) atom.resizeCore(t); memcpy(&atom.ec(0,0),&p2p_buf[pos],2*t*double_size); pos+=2*t*double_size; memcpy(&atom.nc(0,0),&p2p_buf[pos],2*t*int_size); 
pos+=2*t*int_size; memcpy(&atom.lc(0,0),&p2p_buf[pos],2*t*int_size); pos+=2*t*int_size; memcpy(&atom.kc(0,0),&p2p_buf[pos],2*t*int_size); pos+=2*t*int_size; shmem_int_add((sync_recv_flag+from),-1,comm.comm.rank); shmem_quiet(); } } }