template<class DeviceType>
void CommKokkos::reverse_comm_device()
{
  int n;
  MPI_Request request;
  AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec;
  double *buf;

  // exchange data with another proc
  // if other proc is self, just copy
  // if comm_f_only set, exchange or copy directly from f, don't pack

  k_sendlist.sync<DeviceType>();
  atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,F_MASK);

  for (int iswap = nswap-1; iswap >= 0; iswap--) {
    if (sendproc[iswap] != me) {
      if (comm_f_only) {
        if (size_reverse_recv[iswap])
          MPI_Irecv(k_buf_recv.view<DeviceType>().data(),
                    size_reverse_recv[iswap],MPI_DOUBLE,
                    sendproc[iswap],0,world,&request);
        if (size_reverse_send[iswap]) {
          buf = atomKK->k_f.view<DeviceType>().data() +
            firstrecv[iswap]*atomKK->k_f.view<DeviceType>().extent(1);
          MPI_Send(buf,size_reverse_send[iswap],MPI_DOUBLE,
                   recvproc[iswap],0,world);
        }
        if (size_reverse_recv[iswap]) {
          MPI_Wait(&request,MPI_STATUS_IGNORE);
          atomKK->modified(ExecutionSpaceFromDevice<DeviceType>::space,F_MASK);
        }
      } else {
        if (size_reverse_recv[iswap])
          MPI_Irecv(k_buf_recv.view<DeviceType>().data(),
                    size_reverse_recv[iswap],MPI_DOUBLE,
                    sendproc[iswap],0,world,&request);
        n = avec->pack_reverse_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_send);
        DeviceType::fence();
        if (n) MPI_Send(k_buf_send.view<DeviceType>().data(),n,
                        MPI_DOUBLE,recvproc[iswap],0,world);
        if (size_reverse_recv[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE);
      }
      avec->unpack_reverse_kokkos(sendnum[iswap],k_sendlist,iswap,k_buf_recv);
      DeviceType::fence();
    } else {
      if (sendnum[iswap])
        n = avec->unpack_reverse_self(sendnum[iswap],k_sendlist,iswap,
                                      firstrecv[iswap]);
    }
  }
}
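/* The routines in this file rely on the Kokkos DualView modify/sync protocol:
   whichever space (host or device) last wrote a view is marked "modified",
   and a sync is issued before the other space reads it, so data is copied
   only when it is actually stale.  A minimal illustrative sketch of the
   idiom; the view name and size below are made up for illustration and are
   not part of CommKokkos:

     Kokkos::DualView<double*> d_buf("buf",100);
     d_buf.h_view(0) = 1.0;             // write on the host ...
     d_buf.modify<LMPHostType>();       // ... and record that host is newest
     d_buf.sync<LMPDeviceType>();       // deep-copies host -> device only if needed
     // device kernels may now read d_buf.d_view; after a device-side write,
     // call d_buf.modify<LMPDeviceType>() and d_buf.sync<LMPHostType>()
     // before touching the data on the host (e.g. for host-side MPI).
*/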
template<class DeviceType>
void CommKokkos::borders_device()
{
  int i,n,itype,iswap,dim,ineed,twoneed,smax,rmax;
  int nsend,nrecv,sendflag,nfirst,nlast,ngroup;
  double lo,hi;
  int *type;
  double **x;
  double *buf,*mlo,*mhi;
  MPI_Request request;
  MPI_Status status;
  AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec;
  ExecutionSpace exec_space = ExecutionSpaceFromDevice<DeviceType>::space;

  k_sendlist.modify<DeviceType>();
  atomKK->sync(exec_space,ALL_MASK);

  // do swaps over all 3 dimensions

  iswap = 0;
  smax = rmax = 0;

  for (dim = 0; dim < 3; dim++) {
    nlast = 0;
    twoneed = 2*maxneed[dim];
    for (ineed = 0; ineed < twoneed; ineed++) {

      // find atoms within slab boundaries lo/hi using <= and >=
      // check atoms between nfirst and nlast
      //   for first swaps in a dim, check owned and ghost
      //   for later swaps in a dim, only check newly arrived ghosts
      // store sent atom indices in list for use in future timesteps

      x = atom->x;
      if (style == SINGLE) {
        lo = slablo[iswap];
        hi = slabhi[iswap];
      } else {
        type = atom->type;
        mlo = multilo[iswap];
        mhi = multihi[iswap];
      }
      if (ineed % 2 == 0) {
        nfirst = nlast;
        nlast = atom->nlocal + atom->nghost;
      }

      nsend = 0;

      // sendflag = 0 if I do not send on this swap
      // sendneed test indicates receiver no longer requires data
      // e.g. due to non-PBC or non-uniform sub-domains

      if (ineed/2 >= sendneed[dim][ineed % 2]) sendflag = 0;
      else sendflag = 1;

      // find send atoms according to SINGLE vs MULTI
      // all atoms eligible versus atoms in bordergroup
      // only need to limit loop to bordergroup for first sends (ineed < 2)
      // on these sends, break loop in two: owned (in group) and ghost

      if (sendflag) {
        if (!bordergroup || ineed >= 2) {
          if (style == SINGLE) {
            typename ArrayTypes<DeviceType>::tdual_int_1d total_send("TS",1);
            total_send.h_view(0) = 0;
            if (exec_space == Device) {
              total_send.template modify<DeviceType>();
              total_send.template sync<LMPDeviceType>();
            }
            BuildBorderListFunctor<DeviceType> f(atomKK->k_x,k_sendlist,
                total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]);
            Kokkos::ParallelWorkRequest config((nlast-nfirst+127)/128,128);
            Kokkos::parallel_for(config,f);
            DeviceType::fence();
            total_send.template modify<DeviceType>();
            total_send.template sync<LMPHostType>();

            // if the functor overflowed the send list, grow it and run again

            if (total_send.h_view(0) >= maxsendlist[iswap]) {
              grow_list(iswap,total_send.h_view(0));
              total_send.h_view(0) = 0;
              if (exec_space == Device) {
                total_send.template modify<LMPHostType>();
                total_send.template sync<LMPDeviceType>();
              }
              BuildBorderListFunctor<DeviceType> f(atomKK->k_x,k_sendlist,
                  total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]);
              Kokkos::ParallelWorkRequest config((nlast-nfirst+127)/128,128);
              Kokkos::parallel_for(config,f);
              DeviceType::fence();
              total_send.template modify<DeviceType>();
              total_send.template sync<LMPHostType>();
            }
            nsend = total_send.h_view(0);
          } else {
            error->all(FLERR,"Required border comm not yet "
                       "implemented with Kokkos\n");
            for (i = nfirst; i < nlast; i++) {
              itype = type[i];
              if (x[i][dim] >= mlo[itype] && x[i][dim] <= mhi[itype]) {
                if (nsend == maxsendlist[iswap]) grow_list(iswap,nsend);
                sendlist[iswap][nsend++] = i;
              }
            }
          }
        } else {
          error->all(FLERR,"Required border comm not yet "
                     "implemented with Kokkos\n");
          if (style == SINGLE) {
            ngroup = atom->nfirst;
            for (i = 0; i < ngroup; i++)
              if (x[i][dim] >= lo && x[i][dim] <= hi) {
                if (nsend == maxsendlist[iswap]) grow_list(iswap,nsend);
                sendlist[iswap][nsend++] = i;
              }
            for (i = atom->nlocal; i < nlast; i++)
              if (x[i][dim] >= lo && x[i][dim] <= hi) {
                if (nsend == maxsendlist[iswap]) grow_list(iswap,nsend);
                sendlist[iswap][nsend++] = i;
              }
          } else {
            ngroup = atom->nfirst;
            for (i = 0; i < ngroup; i++) {
              itype = type[i];
              if (x[i][dim] >= mlo[itype] && x[i][dim] <= mhi[itype]) {
                if (nsend == maxsendlist[iswap]) grow_list(iswap,nsend);
                sendlist[iswap][nsend++] = i;
              }
            }
            for (i = atom->nlocal; i < nlast; i++) {
              itype = type[i];
              if (x[i][dim] >= mlo[itype] && x[i][dim] <= mhi[itype]) {
                if (nsend == maxsendlist[iswap]) grow_list(iswap,nsend);
                sendlist[iswap][nsend++] = i;
              }
            }
          }
        }
      }

      // pack up list of border atoms

      if (nsend*size_border > maxsend)
        grow_send_kokkos(nsend*size_border,0);
      if (ghost_velocity) {
        error->all(FLERR,"Required border comm not yet "
                   "implemented with Kokkos\n");
        n = avec->pack_border_vel(nsend,sendlist[iswap],buf_send,
                                  pbc_flag[iswap],pbc[iswap]);
      } else
        n = avec->pack_border_kokkos(nsend,k_sendlist,k_buf_send,iswap,
                                     pbc_flag[iswap],pbc[iswap],exec_space);

      // swap atoms with other proc
      // no MPI calls except SendRecv if nsend/nrecv = 0
      // put incoming ghosts at end of my atom arrays
      // if swapping with self, simply copy, no messages

      if (sendproc[iswap] != me) {
        MPI_Sendrecv(&nsend,1,MPI_INT,sendproc[iswap],0,
                     &nrecv,1,MPI_INT,recvproc[iswap],0,world,&status);
        if (nrecv*size_border > maxrecv)
          grow_recv_kokkos(nrecv*size_border);
        if (nrecv)
          MPI_Irecv(k_buf_recv.view<DeviceType>().ptr_on_device(),
                    nrecv*size_border,MPI_DOUBLE,
                    recvproc[iswap],0,world,&request);
        if (n)
          MPI_Send(k_buf_send.view<DeviceType>().ptr_on_device(),n,
                   MPI_DOUBLE,sendproc[iswap],0,world);
        if (nrecv) MPI_Wait(&request,&status);
        buf = buf_recv;
      } else {
        nrecv = nsend;
        buf = buf_send;
      }

      // unpack buffer

      if (ghost_velocity) {
        error->all(FLERR,"Required border comm not yet "
                   "implemented with Kokkos\n");
        avec->unpack_border_vel(nrecv,atom->nlocal+atom->nghost,buf);
      } else if (sendproc[iswap] != me)
        avec->unpack_border_kokkos(nrecv,atom->nlocal+atom->nghost,
                                   k_buf_recv,exec_space);
      else
        avec->unpack_border_kokkos(nrecv,atom->nlocal+atom->nghost,
                                   k_buf_send,exec_space);

      // set all pointers & counters

      smax = MAX(smax,nsend);
      rmax = MAX(rmax,nrecv);
      sendnum[iswap] = nsend;
      recvnum[iswap] = nrecv;
      size_forward_recv[iswap] = nrecv*size_forward;
      size_reverse_send[iswap] = nrecv*size_reverse;
      size_reverse_recv[iswap] = nsend*size_reverse;
      firstrecv[iswap] = atom->nlocal + atom->nghost;
      atom->nghost += nrecv;
      iswap++;
    }
  }

  // ensure send/recv buffers are long enough for all forward & reverse comm

  int max = MAX(maxforward*smax,maxreverse*rmax);
  if (max > maxsend) grow_send_kokkos(max,0);
  max = MAX(maxforward*rmax,maxreverse*smax);
  if (max > maxrecv) grow_recv_kokkos(max);

  // reset global->local map

  if (map_style) atom->map_set();
  if (exec_space == Host) k_sendlist.sync<LMPDeviceType>();
  atomKK->modified(exec_space,ALL_MASK);
  DeviceType::fence();
}
template<class DeviceType>
void CommKokkos::exchange_device()
{
  int i,m,nsend,nrecv,nrecv1,nrecv2,nlocal;
  double lo,hi,value;
  double **x;
  double *sublo,*subhi,*buf;
  MPI_Request request;
  MPI_Status status;
  AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec;

  // clear global->local map for owned and ghost atoms
  // b/c atoms migrate to new procs in exchange() and
  //   new ghosts are created in borders()
  // map_set() is done at end of borders()
  // clear ghost count and any ghost bonus data internal to AtomVec

  if (map_style) atom->map_clear();
  atom->nghost = 0;
  atom->avec->clear_bonus();

  // subbox bounds for orthogonal or triclinic

  if (triclinic == 0) {
    sublo = domain->sublo;
    subhi = domain->subhi;
  } else {
    sublo = domain->sublo_lamda;
    subhi = domain->subhi_lamda;
  }

  atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,ALL_MASK);

  // loop over dimensions

  for (int dim = 0; dim < 3; dim++) {

    // fill buffer with atoms leaving my box, using < and >=
    // when atom is deleted, fill it in with last atom

    x = atom->x;
    lo = sublo[dim];
    hi = subhi[dim];
    nlocal = atom->nlocal;
    i = nsend = 0;

    if (true) {
      if (k_sendflag.h_view.dimension_0() < nlocal) k_sendflag.resize(nlocal);
      k_count.h_view(0) = k_exchange_sendlist.h_view.dimension_0();
      while (k_count.h_view(0) >= k_exchange_sendlist.h_view.dimension_0()) {
        k_count.h_view(0) = 0;
        k_count.modify<LMPHostType>();
        k_count.sync<DeviceType>();

        BuildExchangeListFunctor<DeviceType>
          f(atomKK->k_x,k_exchange_sendlist,k_count,k_sendflag,
            nlocal,dim,lo,hi);
        Kokkos::parallel_for(nlocal,f);
        DeviceType::fence();

        k_exchange_sendlist.modify<DeviceType>();
        k_sendflag.modify<DeviceType>();
        k_count.modify<DeviceType>();

        k_count.sync<LMPHostType>();
        if (k_count.h_view(0) >= k_exchange_sendlist.h_view.dimension_0()) {
          k_exchange_sendlist.resize(k_count.h_view(0)*1.1);
          k_exchange_copylist.resize(k_count.h_view(0)*1.1);
          k_count.h_view(0) = k_exchange_sendlist.h_view.dimension_0();
        }
      }
      k_exchange_sendlist.sync<LMPHostType>();
      k_sendflag.sync<LMPHostType>();

      int sendpos = nlocal-1;
      nlocal -= k_count.h_view(0);
      for (int i = 0; i < k_count.h_view(0); i++) {
        if (k_exchange_sendlist.h_view(i) < nlocal) {
          while (k_sendflag.h_view(sendpos)) sendpos--;
          k_exchange_copylist.h_view(i) = sendpos;
          sendpos--;
        } else
          k_exchange_copylist.h_view(i) = -1;
      }

      k_exchange_copylist.modify<LMPHostType>();
      k_exchange_copylist.sync<DeviceType>();
      nsend = avec->pack_exchange_kokkos(k_count.h_view(0),k_buf_send,
                                         k_exchange_sendlist,k_exchange_copylist,
                                         ExecutionSpaceFromDevice<DeviceType>::space,
                                         dim,lo,hi);
      DeviceType::fence();

    } else {
      while (i < nlocal) {
        if (x[i][dim] < lo || x[i][dim] >= hi) {
          if (nsend > maxsend) grow_send_kokkos(nsend,1);
          nsend += avec->pack_exchange(i,&buf_send[nsend]);
          avec->copy(nlocal-1,i,1);
          nlocal--;
        } else i++;
      }
    }
    atom->nlocal = nlocal;

    // send/recv atoms in both directions
    // if 1 proc in dimension, no send/recv, set recv buf to send buf
    // if 2 procs in dimension, single send/recv
    // if more than 2 procs in dimension, send/recv to both neighbors

    if (procgrid[dim] == 1) {
      nrecv = nsend;
      buf = buf_send;
      if (nrecv) {
        atom->nlocal = avec->
          unpack_exchange_kokkos(k_buf_send,nrecv,atom->nlocal,dim,lo,hi,
                                 ExecutionSpaceFromDevice<DeviceType>::space);
        DeviceType::fence();
      }
    } else {
      MPI_Sendrecv(&nsend,1,MPI_INT,procneigh[dim][0],0,
                   &nrecv1,1,MPI_INT,procneigh[dim][1],0,world,&status);
      nrecv = nrecv1;
      if (procgrid[dim] > 2) {
        MPI_Sendrecv(&nsend,1,MPI_INT,procneigh[dim][1],0,
                     &nrecv2,1,MPI_INT,procneigh[dim][0],0,world,&status);
        nrecv += nrecv2;
      }
      if (nrecv > maxrecv) grow_recv_kokkos(nrecv);

      MPI_Irecv(k_buf_recv.view<DeviceType>().ptr_on_device(),nrecv1,
                MPI_DOUBLE,procneigh[dim][1],0,
                world,&request);
      MPI_Send(k_buf_send.view<DeviceType>().ptr_on_device(),nsend,
               MPI_DOUBLE,procneigh[dim][0],0,world);
      MPI_Wait(&request,&status);

      if (procgrid[dim] > 2) {
        MPI_Irecv(k_buf_recv.view<DeviceType>().ptr_on_device()+nrecv1,
                  nrecv2,MPI_DOUBLE,procneigh[dim][0],0,
                  world,&request);
        MPI_Send(k_buf_send.view<DeviceType>().ptr_on_device(),nsend,
                 MPI_DOUBLE,procneigh[dim][1],0,world);
        MPI_Wait(&request,&status);
      }

      buf = buf_recv;
      if (nrecv) {
        atom->nlocal = avec->
          unpack_exchange_kokkos(k_buf_recv,nrecv,atom->nlocal,dim,lo,hi,
                                 ExecutionSpaceFromDevice<DeviceType>::space);
        DeviceType::fence();
      }
    }

    // check incoming atoms to see if they are in my box
    // if so, add to my list
  }

  atomKK->modified(ExecutionSpaceFromDevice<DeviceType>::space,ALL_MASK);

  if (atom->firstgroupname) {
    /* this is not yet implemented with Kokkos */
    atomKK->sync(Host,ALL_MASK);
    atom->first_reorder();
    atomKK->modified(Host,ALL_MASK);
  }
}
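/* BuildExchangeListFunctor (defined elsewhere in this file) builds the list
   of departing atoms in parallel on the device: each thread tests one atom
   against the sub-domain bounds along dim and, if the atom has left, appends
   its index via an atomic counter; an overflowing count triggers the
   resize-and-retry loop in exchange_device() above.  A hedged, illustrative
   sketch of that pattern follows; the struct and member names are made up
   for illustration, not the actual functor:

     struct ExchangeListSketch {
       typename ArrayTypes<LMPDeviceType>::t_x_array_randomread x;  // positions
       typename ArrayTypes<LMPDeviceType>::t_int_1d list;           // indices of leavers
       typename ArrayTypes<LMPDeviceType>::t_int_1d count;          // count(0) = # found
       int dim; double lo,hi;
       KOKKOS_INLINE_FUNCTION
       void operator()(const int i) const {
         if (x(i,dim) < lo || x(i,dim) >= hi) {
           const int k = Kokkos::atomic_fetch_add(&count(0),1);
           if (k < (int)list.dimension_0()) list(k) = i;  // overflow handled by caller
         }
       }
     };
*/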
template<class DeviceType>
void CommKokkos::forward_comm_device(int dummy)
{
  int n;
  MPI_Request request;
  AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec;
  double *buf;

  // exchange data with another proc
  // if other proc is self, just copy
  // if comm_x_only set, exchange or copy directly to x, don't unpack

  k_sendlist.sync<DeviceType>();
  atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,X_MASK);

  for (int iswap = 0; iswap < nswap; iswap++) {
    if (sendproc[iswap] != me) {
      if (comm_x_only) {
        if (size_forward_recv[iswap]) {
          buf = atomKK->k_x.view<DeviceType>().data() +
            firstrecv[iswap]*atomKK->k_x.view<DeviceType>().extent(1);
          MPI_Irecv(buf,size_forward_recv[iswap],MPI_DOUBLE,
                    recvproc[iswap],0,world,&request);
        }
        n = avec->pack_comm_kokkos(sendnum[iswap],k_sendlist,
                                   iswap,k_buf_send,pbc_flag[iswap],pbc[iswap]);
        DeviceType::fence();
        if (n) {
          MPI_Send(k_buf_send.view<DeviceType>().data(),
                   n,MPI_DOUBLE,sendproc[iswap],0,world);
        }
        if (size_forward_recv[iswap]) {
          MPI_Wait(&request,MPI_STATUS_IGNORE);
          atomKK->modified(ExecutionSpaceFromDevice<DeviceType>::space,X_MASK);
        }
      } else if (ghost_velocity) {
        if (size_forward_recv[iswap]) {
          MPI_Irecv(k_buf_recv.view<DeviceType>().data(),
                    size_forward_recv[iswap],MPI_DOUBLE,
                    recvproc[iswap],0,world,&request);
        }
        n = avec->pack_comm_vel_kokkos(sendnum[iswap],k_sendlist,iswap,
                                       k_buf_send,pbc_flag[iswap],pbc[iswap]);
        DeviceType::fence();
        if (n) {
          MPI_Send(k_buf_send.view<DeviceType>().data(),n,
                   MPI_DOUBLE,sendproc[iswap],0,world);
        }
        if (size_forward_recv[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE);
        avec->unpack_comm_vel_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_recv);
        DeviceType::fence();
      } else {
        if (size_forward_recv[iswap])
          MPI_Irecv(k_buf_recv.view<DeviceType>().data(),
                    size_forward_recv[iswap],MPI_DOUBLE,
                    recvproc[iswap],0,world,&request);
        n = avec->pack_comm_kokkos(sendnum[iswap],k_sendlist,iswap,
                                   k_buf_send,pbc_flag[iswap],pbc[iswap]);
        DeviceType::fence();
        if (n)
          MPI_Send(k_buf_send.view<DeviceType>().data(),n,
                   MPI_DOUBLE,sendproc[iswap],0,world);
        if (size_forward_recv[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE);
        avec->unpack_comm_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_recv);
        DeviceType::fence();
      }
    } else {
      if (!ghost_velocity) {
        if (sendnum[iswap])
          n = avec->pack_comm_self(sendnum[iswap],k_sendlist,iswap,
                                   firstrecv[iswap],pbc_flag[iswap],pbc[iswap]);
        DeviceType::fence();
      } else {
        n = avec->pack_comm_vel_kokkos(sendnum[iswap],k_sendlist,iswap,
                                       k_buf_send,pbc_flag[iswap],pbc[iswap]);
        DeviceType::fence();
        avec->unpack_comm_vel_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_send);
        DeviceType::fence();
      }
    }
  }
}
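/* Note: when DeviceType is a GPU execution space, the MPI_Send/MPI_Irecv calls
   in these routines pass device-resident pointers (the device views of
   k_buf_send/k_buf_recv and k_x/k_f) directly to MPI.  That pattern only works
   with a GPU-aware (e.g. CUDA-aware) MPI library; without one, the buffers
   would have to be staged through host mirrors before the MPI calls.  This is
   a general observation about the pattern, not a statement of what any
   particular build configuration requires. */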