/// The force exchange is considerably simpler than the atom exchange.
/// In the force case we only need to exchange the data required to
/// complete the force calculation.  Since the atoms have not moved, we
/// only need to send data from local link cells, and we are guaranteed
/// that the same atoms exist in the same order in the corresponding
/// halo cells on remote tasks.  The only tricky part is that the size
/// of the plane of cells that needs to be sent grows with each
/// successive axis: the y-axis send must include some of the data that
/// was received from the x-axis send, and the z-axis send must include
/// some data from the y-axis send.  This accumulation of data to send
/// is what allows data to reach neighbor cells that share only edges
/// or corners.  (A sketch of the index-matched pack/unpack pair that
/// this ordering guarantee enables follows this function.)
///
/// \see eam.c for an explanation of the requirement to exchange
/// force data.
HaloExchange* initForceHaloExchange(Domain* domain, LinkCell* boxes, int useGPU)
{
   HaloExchange* hh = initHaloExchange(domain);

   if (useGPU)
   {
      hh->loadBuffer = loadForceBuffer;
      hh->unloadBuffer = unloadForceBuffer;
   }
   else
   {
      hh->loadBuffer = loadForceBufferCpu;
      hh->unloadBuffer = unloadForceBufferCpu;
   }
   hh->destroy = destroyForceExchange;

   int size0 = (boxes->gridSize[1]  )*(boxes->gridSize[2]  );
   int size1 = (boxes->gridSize[0]+2)*(boxes->gridSize[2]  );
   int size2 = (boxes->gridSize[0]+2)*(boxes->gridSize[1]+2);
   int maxSize = MAX(size0, size1);
   maxSize = MAX(maxSize, size2);
   hh->bufCapacity = maxSize*MAXATOMS*sizeof(ForceMsg);

   hh->sendBufM = (char*)comdMalloc(hh->bufCapacity);
   hh->sendBufP = (char*)comdMalloc(hh->bufCapacity);
   hh->recvBufP = (char*)comdMalloc(hh->bufCapacity);
   hh->recvBufM = (char*)comdMalloc(hh->bufCapacity);

   // pin the host buffers so device<->host copies can use DMA
   cudaHostRegister(hh->sendBufM, hh->bufCapacity, 0);
   cudaHostRegister(hh->sendBufP, hh->bufCapacity, 0);
   cudaHostRegister(hh->recvBufP, hh->bufCapacity, 0);
   cudaHostRegister(hh->recvBufM, hh->bufCapacity, 0);

   ForceExchangeParms* parms = (ForceExchangeParms*)comdMalloc(sizeof(ForceExchangeParms));

   parms->nCells[HALO_X_MINUS] = (boxes->gridSize[1]  )*(boxes->gridSize[2]  );
   parms->nCells[HALO_Y_MINUS] = (boxes->gridSize[0]+2)*(boxes->gridSize[2]  );
   parms->nCells[HALO_Z_MINUS] = (boxes->gridSize[0]+2)*(boxes->gridSize[1]+2);
   parms->nCells[HALO_X_PLUS]  = parms->nCells[HALO_X_MINUS];
   parms->nCells[HALO_Y_PLUS]  = parms->nCells[HALO_Y_MINUS];
   parms->nCells[HALO_Z_PLUS]  = parms->nCells[HALO_Z_MINUS];

   for (int ii=0; ii<6; ++ii)
   {
      parms->sendCells[ii] = mkForceSendCellList(boxes, ii, parms->nCells[ii]);
      parms->recvCells[ii] = mkForceRecvCellList(boxes, ii, parms->nCells[ii]);

      // copy the cell lists to the GPU
      cudaMalloc((void**)&parms->sendCellsGpu[ii], parms->nCells[ii] * sizeof(int));
      cudaMalloc((void**)&parms->recvCellsGpu[ii], parms->nCells[ii] * sizeof(int));
      cudaMemcpy(parms->sendCellsGpu[ii], parms->sendCells[ii], parms->nCells[ii] * sizeof(int), cudaMemcpyHostToDevice);
      cudaMemcpy(parms->recvCellsGpu[ii], parms->recvCells[ii], parms->nCells[ii] * sizeof(int), cudaMemcpyHostToDevice);

      // allocate scratch for the per-cell atom-count scan, padded up to
      // a multiple of the 256-thread scan block size
      int size = parms->nCells[ii]+1;
      if (size % 256 != 0) size = ((size + 255)/256)*256;
      cudaMalloc((void**)&parms->natoms_buf[ii], size * sizeof(int));
      cudaMalloc((void**)&parms->partial_sums[ii], (size/256 + 1) * sizeof(int));
   }

   hh->hashTable = NULL;
   hh->type = 1; // force exchange (cf. type = 0 for the atom exchange)
   hh->parms = parms;
   return hh;
}
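// What the ordering guarantee buys us: a pack/unpack pair that never
// transmits atom identifiers.  The following is a minimal CPU-side
// sketch, assuming (as in the EAM exchange described in eam.c) that
// each ForceMsg carries a single real_t dfEmbed value and that
// ForceExchangeData bundles the dfEmbed array with the LinkCell
// pointer (boxes); it also assumes <assert.h> is included.  The
// sketch restates the idea behind loadForceBufferCpu and
// unloadForceBufferCpu, with simplified signatures; it is
// illustrative, not the project's exact implementation.

// Pack: walk the send-cell list in order, appending one ForceMsg per atom.
static int sketchLoadForceBuffer(ForceExchangeData* data, int* sendCells,
                                 int nCells, char* charBuf)
{
   ForceMsg* buf = (ForceMsg*)charBuf;
   int nBuf = 0;
   for (int iCell = 0; iCell < nCells; ++iCell)
   {
      int iBox = sendCells[iCell];
      for (int ii = iBox*MAXATOMS; ii < iBox*MAXATOMS + data->boxes->nAtoms[iBox]; ++ii)
         buf[nBuf++].dfEmbed = data->dfEmbed[ii];
   }
   return nBuf * sizeof(ForceMsg); // bytes packed
}

// Unpack: walk the recv-cell list in the same order.  Entry k of the
// buffer belongs to the k-th atom visited, because the sender's cells
// hold the same atoms in the same order as the receiver's halo cells.
static void sketchUnloadForceBuffer(ForceExchangeData* data, int* recvCells,
                                    int nCells, int bufSize, char* charBuf)
{
   ForceMsg* buf = (ForceMsg*)charBuf;
   int nBuf = 0;
   for (int iCell = 0; iCell < nCells; ++iCell)
   {
      int iBox = recvCells[iCell];
      for (int ii = iBox*MAXATOMS; ii < iBox*MAXATOMS + data->boxes->nAtoms[iBox]; ++ii)
         data->dfEmbed[ii] = buf[nBuf++].dfEmbed;
   }
   assert(nBuf*sizeof(ForceMsg) == (size_t)bufSize); // every byte consumed
}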
/// \details
/// When called in the proper sequence by redistributeAtoms, the atom
/// halo exchange serves three purposes:
/// - Send ghost atom data to neighbor tasks.
/// - Shift atom coordinates by the global simulation size when they
///   cross periodic boundaries.  This shift is performed in
///   loadAtomsBuffer.
/// - Transfer ownership of atoms between tasks as the atoms move
///   across spatial domain boundaries.  This transfer of ownership
///   occurs in two places.  The former owner gives up ownership when
///   updateLinkCells moves a formerly local atom into a halo link
///   cell.  The new owner accepts ownership when unloadAtomsBuffer
///   calls putAtomInBox to place a received atom into a local link
///   cell.
///
/// This constructor does the following:
///
/// - Sets bufCapacity to hold the largest possible number of atoms
///   that can be sent across a face.
/// - Initializes the function pointers to the atom-specific versions.
/// - Sets the number of link cells to send across each face.
/// - Builds the list of link cells to send across each face.  As
///   explained in the comments for mkAtomCellList, this list must
///   include any link cell, local or halo, that could possibly contain
///   an atom that needs to be sent across the face.  Atoms that need
///   to be sent include "ghost atoms" that are located in local link
///   cells corresponding to halo link cells on receiving tasks, as
///   well as formerly local atoms that have just moved into halo link
///   cells and need to be sent to the rank that owns the spatial
///   domain the atom has moved into.
/// - Sets a coordinate shift factor for each face to account for
///   periodic boundary conditions.  For most faces the factor is zero.
///   For faces on the +x, +y, or +z side of the simulation domain the
///   factor is -1.0 (to shift the coordinates by -1 times the
///   simulation domain size).  For the -x, -y, and -z sides the factor
///   is +1.0.  (A sketch of how this factor is applied at pack time
///   follows the constructor.)
///
/// \see redistributeAtoms
HaloExchange* initAtomHaloExchange(Domain* domain, LinkCell* boxes)
{
   HaloExchange* hh = initHaloExchange(domain);

   int size0 = (boxes->gridSize[1]+2)*(boxes->gridSize[2]+2);
   int size1 = (boxes->gridSize[0]+2)*(boxes->gridSize[2]+2);
   int size2 = (boxes->gridSize[0]+2)*(boxes->gridSize[1]+2);
   int maxSize = MAX(size0, size1);
   maxSize = MAX(maxSize, size2);
   hh->bufCapacity = maxSize*2*MAXATOMS*sizeof(AtomMsg);

   hh->sendBufM = (char*)comdMalloc(hh->bufCapacity);
   hh->sendBufP = (char*)comdMalloc(hh->bufCapacity);
   hh->recvBufP = (char*)comdMalloc(hh->bufCapacity);
   hh->recvBufM = (char*)comdMalloc(hh->bufCapacity);

   // pin the host buffers so device<->host copies can use DMA
   cudaHostRegister(hh->sendBufM, hh->bufCapacity, 0);
   cudaHostRegister(hh->sendBufP, hh->bufCapacity, 0);
   cudaHostRegister(hh->recvBufP, hh->bufCapacity, 0);
   cudaHostRegister(hh->recvBufM, hh->bufCapacity, 0);

   hh->loadBuffer = loadAtomsBuffer;
   hh->unloadBuffer = unloadAtomsBuffer;
   hh->destroy = destroyAtomsExchange;

   // hash table for received atoms, sized for the worst case of every
   // halo cell arriving full
   hh->hashTable = initHashTable((boxes->nTotalBoxes - boxes->nLocalBoxes) * MAXATOMS * 2);

   AtomExchangeParms* parms = (AtomExchangeParms*)comdMalloc(sizeof(AtomExchangeParms));

   parms->nCells[HALO_X_MINUS] = 2*(boxes->gridSize[1]+2)*(boxes->gridSize[2]+2);
   parms->nCells[HALO_Y_MINUS] = 2*(boxes->gridSize[0]+2)*(boxes->gridSize[2]+2);
   parms->nCells[HALO_Z_MINUS] = 2*(boxes->gridSize[0]+2)*(boxes->gridSize[1]+2);
   parms->nCells[HALO_X_PLUS]  = parms->nCells[HALO_X_MINUS];
   parms->nCells[HALO_Y_PLUS]  = parms->nCells[HALO_Y_MINUS];
   parms->nCells[HALO_Z_PLUS]  = parms->nCells[HALO_Z_MINUS];

   for (int ii=0; ii<6; ++ii)
   {
      parms->cellList[ii] = mkAtomCellList(boxes, (enum HaloFaceOrder)ii, parms->nCells[ii]);

      // copy the cell list to the GPU
      cudaMalloc((void**)&parms->cellListGpu[ii], parms->nCells[ii] * sizeof(int));
      cudaMemcpy(parms->cellListGpu[ii], parms->cellList[ii], parms->nCells[ii] * sizeof(int), cudaMemcpyHostToDevice);
   }

   // allocate scratch for the scan of per-cell atom counts, padded up
   // to multiples of the 256-thread scan block size
   int size = boxes->nLocalBoxes+1;
   if (size % 256 != 0) size = ((size + 255)/256)*256;
   int partial_size = size/256 + 1;
   if (partial_size % 256 != 0) partial_size = ((partial_size + 255)/256)*256;
   cudaMalloc((void**)&parms->d_natoms_buf, size * sizeof(int));
   parms->h_natoms_buf = (int*)malloc(size * sizeof(int));
   cudaMalloc((void**)&parms->d_partial_sums, partial_size * sizeof(int));

   for (int ii=0; ii<6; ++ii)
   {
      parms->pbcFactor[ii] = (real_t*)comdMalloc(3*sizeof(real_t));
      for (int jj=0; jj<3; ++jj)
         parms->pbcFactor[ii][jj] = 0.0;
   }
   int* procCoord = domain->procCoord; // alias
   int* procGrid  = domain->procGrid;  // alias
   if (procCoord[HALO_X_AXIS] == 0)
      parms->pbcFactor[HALO_X_MINUS][HALO_X_AXIS] = +1.0;
   if (procCoord[HALO_X_AXIS] == procGrid[HALO_X_AXIS]-1)
      parms->pbcFactor[HALO_X_PLUS][HALO_X_AXIS]  = -1.0;
   if (procCoord[HALO_Y_AXIS] == 0)
      parms->pbcFactor[HALO_Y_MINUS][HALO_Y_AXIS] = +1.0;
   if (procCoord[HALO_Y_AXIS] == procGrid[HALO_Y_AXIS]-1)
      parms->pbcFactor[HALO_Y_PLUS][HALO_Y_AXIS]  = -1.0;
   if (procCoord[HALO_Z_AXIS] == 0)
      parms->pbcFactor[HALO_Z_MINUS][HALO_Z_AXIS] = +1.0;
   if (procCoord[HALO_Z_AXIS] == procGrid[HALO_Z_AXIS]-1)
      parms->pbcFactor[HALO_Z_PLUS][HALO_Z_AXIS]  = -1.0;

   hh->type = 0; // atom exchange (cf. type = 1 for the force exchange)
   hh->parms = parms;
   return hh;
}
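// A minimal sketch of how the pbcFactor set above is meant to be
// consumed at pack time.  It assumes the Domain provides globalExtent
// (the overall simulation box lengths per axis); the real
// loadAtomsBuffer also packs gid, type, and momenta, and on the GPU
// path runs as a kernel, so this is illustrative rather than the
// project's exact load routine.
static void sketchPbcShiftForFace(AtomExchangeParms* parms, Domain* domain,
                                  int face, real_t shift[3])
{
   // Zero on interior ranks; +/- one full box length on ranks that sit
   // on the simulation boundary for the given face.
   for (int kk = 0; kk < 3; ++kk)
      shift[kk] = parms->pbcFactor[face][kk] * domain->globalExtent[kk];
}
// At pack time each coordinate is offset by the shift, e.g.
//    buf[nBuf].rx = rx[ii] + shift[0];
// so an atom exiting the +x boundary of the simulation arrives at the
// receiving rank already wrapped to the -x side of the global box.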