/** Close gaps in the particle frames of the mapped area.
 *
 * Launches kernelFillGaps for the regular frame chain and then
 * kernelFillGapsLastFrame for the trailing frame of each supercell.
 * Both launches use the same area mapper built from this->cellDescription.
 */
void fillGaps()
{
    AreaMapping<AREA, MappingDesc> areaMapper(this->cellDescription);

    __cudaKernel(kernelFillGaps)
        (areaMapper.getGridDim(), TileSize)
        (particlesBuffer->getDeviceParticleBox(), areaMapper);

    __cudaKernel(kernelFillGapsLastFrame)
        (areaMapper.getGridDim(), TileSize)
        (particlesBuffer->getDeviceParticleBox(), areaMapper);
}
/** Create one particle in the cell addressed by a global cell index.
 *
 * Returns without any effect if the cell does not belong to the local
 * domain of this rank. Otherwise the global index is translated into a
 * supercell index (shifted by the guard) plus an in-supercell cell index,
 * kernelAddOneParticle is launched with a single thread, frame gaps are
 * closed, and the call blocks until the device work has finished.
 *
 * @param parClass particle species the particle is added to
 * @param cellDescription mapping description; supplies the guard offset
 * @param globalCell global cell index where the particle is created
 */
static void addOneParticle(ParticlesClass& parClass, MappingDesc cellDescription, DataSpace<simDim> globalCell)
{
    const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
    const DataSpace<simDim> localOffset = subGrid.getLocalDomain().offset;
    const DataSpace<simDim> localSize = subGrid.getLocalDomain().size;

    /* translate to a cell index relative to the local domain */
    DataSpace<simDim> cellIdx = globalCell - localOffset;

    /* check if the particle is inside the local simulation area */
    for (int d = 0; d < (int) simDim; ++d)
    {
        if (cellIdx[d] < 0 || cellIdx[d] >= localSize[d])
            return;
    }

    /* decompose into supercell index and cell index within the supercell */
    DataSpace<simDim> superCellIdx = (cellIdx / MappingDesc::SuperCellSize::toRT());
    DataSpace<simDim> cellInSuperCell = cellIdx - (superCellIdx * MappingDesc::SuperCellSize::toRT());

    /* shift by the guarding supercells to obtain the device-side index */
    superCellIdx = superCellIdx + cellDescription.getGuardingSuperCells();

    __cudaKernel(kernelAddOneParticle)
        (1, 1)
        (parClass.getDeviceParticlesBox(), superCellIdx, cellInSuperCell);

    parClass.fillAllGaps();

    std::cout << "Wait for add particle" << std::endl;
    __getTransactionEvent().waitForFinished();
}
/** Create one particle in the cell addressed by a global cell index
 * (legacy SubGrid::getInstance / DIM3 variant).
 *
 * Returns without any effect if the cell lies outside the local
 * simulation volume of this rank. Otherwise the global index is turned
 * into a guarded supercell index plus in-supercell cell index,
 * kernelAddOneParticle runs with a single thread, frame gaps are closed,
 * and the call blocks until the device has finished.
 *
 * @param parClass particle species the particle is added to
 * @param cellDescription mapping description; supplies the guard offset
 * @param globalCell global cell index where the particle is created
 */
static void addOneParticle(ParticlesClass& parClass, MappingDesc cellDescription, DataSpace<DIM3> globalCell)
{
    PMACC_AUTO(simBox, SubGrid<simDim>::getInstance().getSimulationBox());
    const DataSpace<DIM3> globalOffset = simBox.getGlobalOffset();
    const DataSpace<DIM3> localSize = simBox.getLocalSize();

    /* translate to a cell index relative to the local domain */
    DataSpace<DIM3> cellIdx = globalCell - globalOffset;

    /* check if the particle is inside the local simulation area */
    for (int d = 0; d < (int) DIM3; ++d)
    {
        if (cellIdx[d] < 0 || cellIdx[d] >= localSize[d])
            return;
    }

    /* decompose into supercell index and cell index within the supercell */
    DataSpace<DIM3> superCellIdx = (cellIdx / MappingDesc::SuperCellSize::getDataSpace());
    DataSpace<DIM3> cellInSuperCell = cellIdx - (superCellIdx * MappingDesc::SuperCellSize::getDataSpace());

    /* shift by the guarding supercells to obtain the device-side index */
    superCellIdx = superCellIdx + cellDescription.getGuardingSuperCells();

    __cudaKernel(kernelAddOneParticle)
        (1, 1)
        (parClass.getDeviceParticlesBox(), superCellIdx, cellInSuperCell);

    parClass.fillAllGaps();

    std::cout << "Wait for add particle" << std::endl;
    __getTransactionEvent().waitForFinished();
}
/** Move particles that left their cell into the correct supercell frames.
 *
 * Iterates the stride mapper over the mapped area, launching
 * kernelShiftParticles once per stride pass. All launches are grouped
 * into a single transaction so they appear as one event to callers.
 */
void shiftParticles()
{
    StrideMapping<AREA, DIM3, MappingDesc> strideMapper(this->cellDescription);
    ParticlesBoxType box = particlesBuffer->getDeviceParticleBox();

    __startTransaction(__getTransactionEvent());
    do
    {
        __cudaKernel(kernelShiftParticles)
            (strideMapper.getGridDim(), TileSize)
            (box, strideMapper);
    } while (strideMapper.next());
    __setTransactionEvent(__endTransaction());
}
/** Count particles in the mapped area that pass the given filter.
 *
 * A one-element device counter is created, kernelCountParticles
 * accumulates into it (one block per supercell, one thread per cell of
 * the supercell), and the value is copied back to the host.
 * NOTE(review): the counter is not explicitly zeroed here — presumably
 * GridBuffer construction or the kernel initializes it; confirm.
 *
 * @param buffer particle buffer to count in
 * @param cellDescription mapping description for the area mapper
 * @param filter predicate applied to each particle
 * @return number of particles matching the filter
 */
static uint64_cu countOnDevice(PBuffer& buffer, CellDesc cellDescription, Filter filter)
{
    GridBuffer<uint64_cu, DIM1> result(DataSpace<DIM1>(1));
    AreaMapping<AREA, CellDesc> areaMapper(cellDescription);
    dim3 threadsPerBlock(CellDesc::SuperCellSize::toRT().toDim3());

    __cudaKernel(kernelCountParticles)
        (areaMapper.getGridDim(), threadsPerBlock)
        (buffer.getDeviceParticlesBox(), result.getDeviceBuffer().getBasePointer(), filter, areaMapper);

    result.deviceToHost();
    return *(result.getHostBuffer().getDataBox());
}
/** Reduce n elements of src with func on the device and return the
 * scalar result on the host.
 *
 * Multi-pass driver: a first kernel pass reduces src into the device-side
 * reduce buffer (`dest`), then further passes fold `dest` onto itself until
 * a single value remains, which is copied to the host and returned.
 * Blocks until the device work has finished.
 *
 * @param func binary reduction functor (e.g. add/max)
 * @param src data source providing n values
 * @param n number of elements to reduce
 * @return reduced value of type GetValueType<Src>::ValueType
 *
 * NOTE(review): `byte` and `reduceBuffer` are members defined outside this
 * view — presumably the reduce buffer's size in bytes and the buffer
 * itself; confirm against the enclosing class.
 */
HINLINE typename traits::GetValueType<Src>::ValueType operator()(Functor func, Src src, uint32_t n)
{
    /* - the result of a functor can be a reference or a const value
     * - it is not allowed to create const or reference memory
     * thus we remove `references` and `const` qualifiers */
    typedef typename boost::remove_const<
        typename boost::remove_reference<
            typename traits::GetValueType<Src>::ValueType
        >::type
    >::type Type;

    uint32_t blockcount = optimalThreadsPerBlock(n, sizeof (Type));

    /* number of elements the reduce buffer can hold */
    uint32_t n_buffer = byte / sizeof (Type);

    uint32_t threads = n_buffer * blockcount * 2; /* x2 is used thus we can use all byte in Buffer, after we calculate threads/2 */
    if (threads > n) threads = n;

    /* device-side scratch/output buffer for partial results */
    Type* dest = (Type*) reduceBuffer->getDeviceBuffer().getBasePointer();

    /* each block reduces 2*blockcount elements; always launch at least one */
    uint32_t blocks = threads / 2 / blockcount;
    if (blocks == 0) blocks = 1;

    /* first pass: src -> dest, partial results assigned (not combined) */
    __cudaKernel((kernel::reduce < Type >))(blocks, blockcount, blockcount * sizeof (Type))(src, n, dest, func, PMacc::nvidia::functors::Assign());

    /* remaining problem size is one partial result per launched block */
    n = blocks;
    blockcount = optimalThreadsPerBlock(n, sizeof (Type));
    blocks = n / 2 / blockcount;
    if (blocks == 0 && n > 1) blocks = 1;

    while (blocks != 0)
    {
        if (blocks > 1)
        {
            /* reserve blockOffset leading blocks worth of elements as the
             * in-place output region; reduce the tail onto dest with func
             * so existing partials are combined, not overwritten */
            uint32_t blockOffset = ceil((double) blocks / blockcount);
            uint32_t useBlocks = blocks - blockOffset;
            uint32_t problemSize = n - (blockOffset * blockcount);
            Type* srcPtr = dest + (blockOffset * blockcount);
            __cudaKernel((kernel::reduce < Type >))(useBlocks, blockcount, blockcount * sizeof (Type))(srcPtr, problemSize, dest, func, func);
            blocks = blockOffset*blockcount;
        }
        else
        {
            /* final pass: collapse dest onto itself into a single value */
            __cudaKernel((kernel::reduce < Type >))(blocks, blockcount, blockcount * sizeof (Type))(dest, n, dest, func, PMacc::nvidia::functors::Assign());
        }

        /* recompute schedule for the next pass */
        n = blocks;
        blockcount = optimalThreadsPerBlock(n, sizeof (Type));
        blocks = n / 2 / blockcount;
        if (blocks == 0 && n > 1) blocks = 1;
    }

    /* copy the single remaining value back and wait for completion
     * before dereferencing the host pointer */
    reduceBuffer->deviceToHost();
    __getTransactionEvent().waitForFinished();
    return *((Type*) (reduceBuffer->getHostBuffer().getBasePointer()));
}