/// The force exchange is considerably simpler than the atom exchange.
/// In the force case we only need to exchange data that is needed to
/// complete the force calculation.  Since the atoms have not moved we
/// only need to send data from local link cells and we are guaranteed
/// that the same atoms exist in the same order in corresponding halo
/// cells on remote tasks.  The only tricky part is the size of the
/// plane of local cells that needs to be sent grows in each direction.
/// This is because the y-axis send must send some of the data that was
/// received from the x-axis send, and the z-axis must send some data
/// from the y-axis send.  This accumulation of data to send is
/// responsible for data reaching neighbor cells that share only edges
/// or corners.
///
/// \see eam.c for an explanation of the requirement to exchange
/// force data.
HaloExchange* initForceHaloExchange(Domain* domain, LinkCell* boxes)
{
   HaloExchange* hh = initHaloExchange(domain);

   hh->loadBuffer = loadForceBuffer;
   hh->unloadBuffer = unloadForceBuffer;
   hh->destroy = destroyForceExchange;

   // Plane sizes for the three send directions.  The x-plane is the bare
   // local grid; the y- and z-planes grow by 2 in each earlier axis because
   // they must forward halo data received from the previous exchange.
   int size0 = (boxes->gridSize[1])*(boxes->gridSize[2]);
   int size1 = (boxes->gridSize[0]+2)*(boxes->gridSize[2]);
   int size2 = (boxes->gridSize[0]+2)*(boxes->gridSize[1]+2);
   int maxSize = MAX(size0, size1);
   // BUG FIX: was MAX(size1, size2), which discarded the size0 comparison.
   // If the x-plane (size0) is the largest, bufCapacity was under-allocated,
   // overflowing the exchange buffer for tall/deep domain decompositions.
   maxSize = MAX(maxSize, size2);
   hh->bufCapacity = (maxSize)*MAXATOMS*sizeof(ForceMsg);

   // Cast added for C++/CUDA compilation, matching the GPU variant's style.
   ForceExchangeParms* parms =
      (ForceExchangeParms*)comdMalloc(sizeof(ForceExchangeParms));

   parms->nCells[HALO_X_MINUS] = (boxes->gridSize[1]  )*(boxes->gridSize[2]  );
   parms->nCells[HALO_Y_MINUS] = (boxes->gridSize[0]+2)*(boxes->gridSize[2]  );
   parms->nCells[HALO_Z_MINUS] = (boxes->gridSize[0]+2)*(boxes->gridSize[1]+2);
   parms->nCells[HALO_X_PLUS]  = parms->nCells[HALO_X_MINUS];
   parms->nCells[HALO_Y_PLUS]  = parms->nCells[HALO_Y_MINUS];
   parms->nCells[HALO_Z_PLUS]  = parms->nCells[HALO_Z_MINUS];

   // Build the per-face send/recv cell index lists (one pair per face).
   for (int ii=0; ii<6; ++ii)
   {
      parms->sendCells[ii] = mkForceSendCellList(boxes, ii, parms->nCells[ii]);
      parms->recvCells[ii] = mkForceRecvCellList(boxes, ii, parms->nCells[ii]);
   }

   hh->parms = parms;
   return hh;
}
/// The force exchange is considerably simpler than the atom exchange.
/// In the force case we only need to exchange data that is needed to
/// complete the force calculation.  Since the atoms have not moved we
/// only need to send data from local link cells and we are guaranteed
/// that the same atoms exist in the same order in corresponding halo
/// cells on remote tasks.  The only tricky part is the size of the
/// plane of local cells that needs to be sent grows in each direction.
/// This is because the y-axis send must send some of the data that was
/// received from the x-axis send, and the z-axis must send some data
/// from the y-axis send.  This accumulation of data to send is
/// responsible for data reaching neighbor cells that share only edges
/// or corners.
///
/// \see eam.c for an explanation of the requirement to exchange
/// force data.
HaloExchange* initForceHaloExchange(Domain* domain, LinkCell* boxes, int useGPU)
{
   HaloExchange* hh = initHaloExchange(domain);

   // Select GPU or CPU pack/unpack paths; the exchange structure is identical.
   if(useGPU){
     hh->loadBuffer = loadForceBuffer;
     hh->unloadBuffer = unloadForceBuffer;
   }else{
     hh->loadBuffer = loadForceBufferCpu;
     hh->unloadBuffer = unloadForceBufferCpu;
   }
   hh->destroy = destroyForceExchange;

   // Plane sizes for the three send directions.  The x-plane is the bare
   // local grid; the y- and z-planes grow by 2 in each earlier axis because
   // they must forward halo data received from the previous exchange.
   int size0 = (boxes->gridSize[1])*(boxes->gridSize[2]);
   int size1 = (boxes->gridSize[0]+2)*(boxes->gridSize[2]);
   int size2 = (boxes->gridSize[0]+2)*(boxes->gridSize[1]+2);
   int maxSize = MAX(size0, size1);
   // BUG FIX: was MAX(size1, size2), which discarded the size0 comparison.
   // If the x-plane (size0) is the largest, bufCapacity was under-allocated,
   // overflowing the exchange buffers for tall/deep domain decompositions.
   maxSize = MAX(maxSize, size2);
   hh->bufCapacity = (maxSize)*MAXATOMS*sizeof(ForceMsg);

   hh->sendBufM = (char*)comdMalloc(hh->bufCapacity);
   hh->sendBufP = (char*)comdMalloc(hh->bufCapacity);
   hh->recvBufP = (char*)comdMalloc(hh->bufCapacity);
   hh->recvBufM = (char*)comdMalloc(hh->bufCapacity);

   // Pin the host exchange buffers so cudaMemcpyAsync can DMA directly.
   // NOTE(review): return codes of cudaHostRegister are dropped here — a
   // failure leaves the buffers pageable and silently slows every exchange.
   // Consider routing these through the project's CUDA error-check macro.
   cudaHostRegister(hh->sendBufM, hh->bufCapacity, 0);
   cudaHostRegister(hh->sendBufP, hh->bufCapacity, 0);
   cudaHostRegister(hh->recvBufP, hh->bufCapacity, 0);
   cudaHostRegister(hh->recvBufM, hh->bufCapacity, 0);

   ForceExchangeParms* parms =
      (ForceExchangeParms*)comdMalloc(sizeof(ForceExchangeParms));

   parms->nCells[HALO_X_MINUS] = (boxes->gridSize[1]  )*(boxes->gridSize[2]  );
   parms->nCells[HALO_Y_MINUS] = (boxes->gridSize[0]+2)*(boxes->gridSize[2]  );
   parms->nCells[HALO_Z_MINUS] = (boxes->gridSize[0]+2)*(boxes->gridSize[1]+2);
   parms->nCells[HALO_X_PLUS]  = parms->nCells[HALO_X_MINUS];
   parms->nCells[HALO_Y_PLUS]  = parms->nCells[HALO_Y_MINUS];
   parms->nCells[HALO_Z_PLUS]  = parms->nCells[HALO_Z_MINUS];

   for (int ii=0; ii<6; ++ii)
   {
      parms->sendCells[ii] = mkForceSendCellList(boxes, ii, parms->nCells[ii]);
      parms->recvCells[ii] = mkForceRecvCellList(boxes, ii, parms->nCells[ii]);

      // Mirror the per-face cell lists onto the device (lists are static for
      // the lifetime of the exchange, so a one-time copy suffices).
      // NOTE(review): cudaMalloc/cudaMemcpy return codes are unchecked — an
      // allocation failure here surfaces later as an illegal-address error.
      cudaMalloc((void**)&parms->sendCellsGpu[ii], parms->nCells[ii] * sizeof(int));
      cudaMalloc((void**)&parms->recvCellsGpu[ii], parms->nCells[ii] * sizeof(int));

      cudaMemcpy(parms->sendCellsGpu[ii], parms->sendCells[ii], parms->nCells[ii] * sizeof(int), cudaMemcpyHostToDevice);
      cudaMemcpy(parms->recvCellsGpu[ii], parms->recvCells[ii], parms->nCells[ii] * sizeof(int), cudaMemcpyHostToDevice);

      // Scratch space for the per-cell atom-count scan: nCells+1 entries
      // rounded up to a multiple of 256 (the scan's block size), plus one
      // partial sum per 256-entry chunk.
      int size = parms->nCells[ii]+1;
      if (size % 256 != 0) size = ((size + 255)/256)*256;
      cudaMalloc((void**)&parms->natoms_buf[ii], size * sizeof(int));
      cudaMalloc((void**)&parms->partial_sums[ii], (size/256 + 1) * sizeof(int));
   }

   // Force exchange needs no atom hash table; type 1 tags this as the
   // force variant (vs. the atom exchange).
   hh->hashTable = NULL;
   hh->type = 1;

   hh->parms = parms;
   return hh;
}