/**
 * Copies this data and the RingBuffer data from host to device.
 */
void hostToDevice()
{
    __startTransaction(__getTransactionEvent());
    ringBuffer->hostToDevice();
    EventTask ev1 = __endTransaction();

    __startTransaction(__getTransactionEvent());
    GridBuffer<VALUE, DIM1, BORDERVALUE>::hostToDevice();
    __setTransactionEvent(__endTransaction() + ev1);
}
/**
 * Copies data and additional pointers from host to device.
 */
void hostToDevice()
{
    __startTransaction(__getTransactionEvent());
    ringDataSizes->hostToDevice();
    EventTask ev1 = __endTransaction();

    __startTransaction(__getTransactionEvent());
    ringData->hostToDevice();
    __setTransactionEvent(__endTransaction() + ev1);
}
/**
 * Copies data and additional pointers from device to host.
 */
void deviceToHost()
{
    __startTransaction(__getTransactionEvent());
    ringDataSizes->deviceToHost();
    EventTask ev1 = __endTransaction();

    __startTransaction(__getTransactionEvent());
    ringData->deviceToHost();
    __setTransactionEvent(__endTransaction() + ev1);
}
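/* A minimal sketch of the fork/join idiom shared by the copy functions above,
 * assuming PMacc's transaction API (EventTask, __startTransaction,
 * __endTransaction, __setTransactionEvent); `bufferA` and `bufferB` are
 * hypothetical, independent buffers standing in for the members used above. */
void copyBothInParallel()
{
    __startTransaction(__getTransactionEvent()); // fork: first independent transfer
    bufferA->hostToDevice();
    EventTask evA = __endTransaction();

    __startTransaction(__getTransactionEvent()); // fork: second transfer, same dependency
    bufferB->hostToDevice();
    EventTask evB = __endTransaction();

    __setTransactionEvent(evA + evB); // join: later tasks wait for both transfers
}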
/**
 * Notifies registered output classes.
 *
 * This function is called automatically.
 *
 * @param currentStep simulation step
 */
virtual void dumpOneStep(uint32_t currentStep)
{
    /* trigger notification */
    Environment<DIM>::get().PluginConnector().notifyPlugins(currentStep);

    /* trigger checkpoint notification */
    if( !checkpointPeriod.empty() && pluginSystem::containsStep( seqCheckpointPeriod, currentStep ) )
    {
        /* first synchronize: if something failed, we can spare the time
         * for the checkpoint writing */
        CUDA_CHECK(cudaDeviceSynchronize());
        CUDA_CHECK(cudaGetLastError());

        // avoid deadlock between not finished PMacc tasks and MPI_Barrier
        __getTransactionEvent().waitForFinished();

        GridController<DIM> &gc = Environment<DIM>::get().GridController();
        /* can be spared for better scaling, but allows us to spare the
         * time for checkpointing if some ranks died */
        MPI_CHECK(MPI_Barrier(gc.getCommunicator().getMPIComm()));

        /* create the directory containing the checkpoints */
        if (numCheckpoints == 0)
        {
            Environment<DIM>::get().Filesystem().createDirectoryWithPermissions(checkpointDirectory);
        }

        Environment<DIM>::get().PluginConnector().checkpointPlugins(currentStep, checkpointDirectory);

        /* important synchronization: only if no errors occurred up to this
         * point is the checkpoint guaranteed to be usable */
        CUDA_CHECK(cudaDeviceSynchronize());
        CUDA_CHECK(cudaGetLastError());

        /* avoid deadlock between not finished PMacc tasks and MPI_Barrier */
        __getTransactionEvent().waitForFinished();

        /* \todo in an ideal world with MPI-3, this would be an
         * MPI_Ibarrier call and this function would return an MPI_Request
         * that could be checked */
        MPI_CHECK(MPI_Barrier(gc.getCommunicator().getMPIComm()));

        if (gc.getGlobalRank() == 0)
        {
            writeCheckpointStep(currentStep);
        }
        numCheckpoints++;
    }
}
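/* A minimal sketch of the deadlock-avoidance pattern used above and in the
 * I/O functors below, assuming PMacc's event API: flush all pending device
 * tasks before entering a blocking MPI collective, so that no rank blocks in
 * MPI while another rank is still waiting on an unfinished task. `comm` is a
 * hypothetical MPI communicator. */
void barrierWithoutDeadlock(MPI_Comm comm)
{
    __getTransactionEvent().waitForFinished(); // finish all queued PMacc tasks first
    MPI_CHECK(MPI_Barrier(comm));              // the collective can no longer deadlock
}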
/**
 * Copies this data and the RingBuffer data from device to host.
 */
void deviceToHost()
{
    __startTransaction(__getTransactionEvent());
    ringBuffer->deviceToHost();
    EventTask ev1 = __endTransaction();

    __startTransaction(__getTransactionEvent());
    GridBuffer<VALUE, DIM1, BORDERVALUE>::deviceToHost();
    EventTask ev2 = __endTransaction();

    __setTransactionEvent(ev1 + ev2);
}
/* host constructor initializing the random number generator member */
ThomasFermi_Impl(const uint32_t currentStep) : randomGen(RNGFactory::createRandom<Distribution>())
{
    /* create handle for access to host and device data */
    DataConnector &dc = Environment<>::get().DataConnector();

    /* The compiler is allowed to evaluate an expression that does not depend on a template parameter
     * even if the class is never instantiated. In that case the static assert is always
     * evaluated (e.g. with clang), which results in an error if the condition is false.
     * http://www.boost.org/doc/libs/1_60_0/doc/html/boost_staticassert.html
     *
     * A workaround is to add a template dependency to the expression.
     * `sizeof(ANY_TYPE) != 0` is always true and defers the evaluation.
     */
    PMACC_CASSERT_MSG(
        _please_allocate_at_least_two_FieldTmp_slots_in_memory_param,
        ( fieldTmpNumSlots >= 2 ) && ( sizeof( T_IonizationAlgorithm ) != 0 )
    );

    /* initialize pointers on host-side density and energy density field databoxes */
    auto density = dc.get< FieldTmp >( FieldTmp::getUniqueId( 0 ), true );
    auto eneKinDens = dc.get< FieldTmp >( FieldTmp::getUniqueId( 1 ), true );

    /* reset density and kinetic energy values to zero */
    density->getGridBuffer().getDeviceBuffer().setValue( FieldTmp::ValueType( 0. ) );
    eneKinDens->getGridBuffer().getDeviceBuffer().setValue( FieldTmp::ValueType( 0. ) );

    /* load species without copying the particle data to the host */
    auto srcSpecies = dc.get< SrcSpecies >( SrcSpecies::FrameType::getName(), true );

    /* kernel call for weighted ion density calculation */
    density->template computeValue< CORE + BORDER, DensitySolver >(*srcSpecies, currentStep);
    dc.releaseData( SrcSpecies::FrameType::getName() );
    EventTask densityEvent = density->asyncCommunication( __getTransactionEvent() );
    densityEvent += density->asyncCommunicationGather( densityEvent );

    /* load species without copying the particle data to the host */
    auto destSpecies = dc.get< DestSpecies >( DestSpecies::FrameType::getName(), true );

    /* kernel call for weighted electron energy density calculation */
    eneKinDens->template computeValue< CORE + BORDER, EnergyDensitySolver >(*destSpecies, currentStep);
    dc.releaseData( DestSpecies::FrameType::getName() );
    EventTask eneKinEvent = eneKinDens->asyncCommunication( __getTransactionEvent() );
    eneKinEvent += eneKinDens->asyncCommunicationGather( eneKinEvent );

    /* contributions from neighboring GPUs to our border area */
    __setTransactionEvent( densityEvent + eneKinEvent );

    /* initialize device-side density and energy density field databox pointers */
    rhoBox = density->getDeviceDataBox();
    eneBox = eneKinDens->getDeviceDataBox();
}
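/* A minimal sketch of the static-assert deferral trick documented above:
 * adding an always-true, template-dependent term (`sizeof(T) != 0`) keeps the
 * compiler from evaluating the condition before instantiation. `T_Param` and
 * `minSlots` (a non-dependent compile-time constant) are hypothetical names. */
template< typename T_Param >
struct RequiresTwoSlots
{
    PMACC_CASSERT_MSG(
        _please_allocate_at_least_two_slots,
        ( minSlots >= 2 ) && ( sizeof( T_Param ) != 0 ) // dependent term defers evaluation
    );
};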
static void addOneParticle(ParticlesClass& parClass, MappingDesc cellDescription, DataSpace<DIM3> globalCell)
{
    PMACC_AUTO(simBox, SubGrid<simDim>::getInstance().getSimulationBox());
    const DataSpace<DIM3> globalTopLeft = simBox.getGlobalOffset();
    const DataSpace<DIM3> localSimulationArea = simBox.getLocalSize();
    DataSpace<DIM3> localParCell = globalCell - globalTopLeft;

    for (int i = 0; i < (int) DIM3; ++i)
    {
        // check if the particle is in the local simulation area
        if (localParCell[i] < 0 || localParCell[i] >= localSimulationArea[i])
            return;
    }

    // calculate the supercell and the cell index inside it
    DataSpace<DIM3> localSuperCell = (localParCell / MappingDesc::SuperCellSize::getDataSpace());
    DataSpace<DIM3> cellInSuperCell = localParCell - (localSuperCell * MappingDesc::SuperCellSize::getDataSpace());

    // add guarding supercells to the supercell index
    localSuperCell = localSuperCell + cellDescription.getGuardingSuperCells();

    __cudaKernel(kernelAddOneParticle)
        (1, 1)
        (parClass.getDeviceParticlesBox(), localSuperCell, cellInSuperCell);

    parClass.fillAllGaps();

    std::cout << "Wait for add particle" << std::endl;
    __getTransactionEvent().waitForFinished();
}
void operator()(ThreadParams& params,
                const std::string& name, T_Scalar* value,
                const std::string& attrName = "", T_Attribute* attribute = nullptr)
{
    log<picLog::INPUT_OUTPUT>("HDF5: read %1%D scalars: %2%") % simDim % name;

    Dimensions domain_offset(0, 0, 0);
    for (uint32_t d = 0; d < simDim; ++d)
        domain_offset[d] = Environment<simDim>::get().GridController().getPosition()[d];

    // avoid deadlock between not finished pmacc tasks and blocking MPI calls
    __getTransactionEvent().waitForFinished();

    DomainCollector::DomDataClass data_class;
    DataContainer *dataContainer =
        params.dataCollector->readDomain(params.currentStep,
                                         name.c_str(),
                                         Domain(domain_offset, Dimensions(1, 1, 1)),
                                         &data_class);

    typename traits::PICToSplash<T_Scalar>::type splashType;
    *value = *static_cast<T_Scalar*>(dataContainer->getIndex(0)->getData());
    __delete(dataContainer);

    if(!attrName.empty())
    {
        log<picLog::INPUT_OUTPUT>("HDF5: read attribute %1% for scalars: %2%") % attrName % name;
        params.dataCollector->readAttributeInfo(params.currentStep, name.c_str(), attrName.c_str())
            .read(attribute, sizeof(T_Attribute));
        log<picLog::INPUT_OUTPUT>("HDF5: attribute %1% = %2%") % attrName % *attribute;
    }
}
TaskFieldReceiveAndInsertExchange(Field &buffer, uint32_t exchange) :
    m_buffer(buffer),
    m_exchange(exchange),
    m_state(Constructor),
    initDependency(__getTransactionEvent())
{
}
static void addOneParticle(ParticlesClass& parClass, MappingDesc cellDescription, DataSpace<simDim> globalCell)
{
    const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
    const DataSpace<simDim> globalTopLeft = subGrid.getLocalDomain().offset;
    const DataSpace<simDim> localSimulationArea = subGrid.getLocalDomain().size;
    DataSpace<simDim> localParCell = globalCell - globalTopLeft;

    for (int i = 0; i < (int) simDim; ++i)
    {
        // check if the particle is in the local simulation area
        if (localParCell[i] < 0 || localParCell[i] >= localSimulationArea[i])
            return;
    }

    // calculate the supercell and the cell index inside it
    DataSpace<simDim> localSuperCell = (localParCell / MappingDesc::SuperCellSize::toRT());
    DataSpace<simDim> cellInSuperCell = localParCell - (localSuperCell * MappingDesc::SuperCellSize::toRT());

    // add guarding supercells to the supercell index
    localSuperCell = localSuperCell + cellDescription.getGuardingSuperCells();

    __cudaKernel(kernelAddOneParticle)
        (1, 1)
        (parClass.getDeviceParticlesBox(), localSuperCell, cellInSuperCell);

    parClass.fillAllGaps();

    std::cout << "Wait for add particle" << std::endl;
    __getTransactionEvent().waitForFinished();
}
void oneStep(uint32_t currentStep, Buffer* read, Buffer* write)
{
    PMACC_AUTO(splitEvent, __getTransactionEvent());

    /* GridBuffer 'read' will use 'splitEvent' to schedule transaction
     * tasks from the Guard of this local Area to the Borders of the
     * neighboring areas added by 'addExchange'. All transactions in
     * the Transaction Manager will then be done in parallel to the
     * calculations in the core. In order to synchronize the data
     * transfer for the case that the core calculation finishes earlier,
     * GridBuffer::asyncCommunication returns a transaction handle we can check.
     */
    PMACC_AUTO(send, read->asyncCommunication(splitEvent));
    evo.run<CORE>(read->getDeviceBuffer().getDataBox(),
                  write->getDeviceBuffer().getDataBox());

    /* join communication with the worker tasks; all subsequent tasks run sequentially */
    __setTransactionEvent(send);

    /* calculate the borders */
    evo.run<BORDER>(read->getDeviceBuffer().getDataBox(),
                    write->getDeviceBuffer().getDataBox());
    write->deviceToHost();

    /* gather::operator() gathers all the buffers and assembles them into
     * a complete picture, discarding the guards.
     */
    PMACC_AUTO(picture, gather(write->getHostBuffer().getDataBox()));

    PngCreator png;
    if (isMaster)
        png(currentStep, picture, gridSize);
}
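/* The skeleton of the overlap idiom used in oneStep() above, assuming PMacc's
 * event API: start the guard exchange, compute the CORE region that does not
 * depend on it, then join before computing the BORDER. `computeCore` and
 * `computeBorder` are hypothetical stand-ins for the evo.run calls above. */
EventTask split = __getTransactionEvent();
EventTask send = buffer->asyncCommunication(split); // exchange runs concurrently
computeCore();                                      // independent of the exchange
__setTransactionEvent(send);                        // join: wait for the exchange
computeBorder();                                    // needs the exchanged guard data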
TaskReceiveParticlesExchange(ParBase &parBase, uint32_t exchange) :
    parBase(parBase),
    exchange(exchange),
    state(Constructor),
    maxSize(parBase.getParticlesBuffer().getReceiveExchangeStack(exchange).getMaxParticlesCount()),
    initDependency(__getTransactionEvent()),
    lastSize(0)
{
}
TaskFieldSendExchange(Field &buffer, uint32_t exchange) :
    buffer(buffer),
    exchange(exchange),
    state(Constructor),
    initDependency(__getTransactionEvent())
{
}
/** Functor
 *
 * @param currentStep the current time step
 * @param speciesGroup naming for the group of species in T_SpeciesList
 */
void operator()(
    uint32_t currentStep,
    std::string const & speciesGroup
)
{
    // generating a density requires at least one slot in FieldTmp
    PMACC_CASSERT_MSG(
        _please_allocate_at_least_one_FieldTmp_in_memory_param,
        fieldTmpNumSlots > 0
    );

    DataConnector &dc = Environment<>::get().DataConnector();

    // load FieldTmp without copying data to the host, and zero it
    auto fieldTmp = dc.get< FieldTmp >( FieldTmp::getUniqueId( 0 ), true );
    using DensityValueType = typename FieldTmp::ValueType;
    fieldTmp->getGridBuffer().getDeviceBuffer().setValue( DensityValueType::create(0.0) );

    // add the density of each species in the list to FieldTmp
    ForEach< SpeciesList, detail::AddSingleDensity< bmpl::_1 > > addSingleDensity;
    addSingleDensity( currentStep, forward( fieldTmp ) );

    /* create a valid density in the BORDER region
     * note: for averaging windows that are not supercell multiples, the GUARD
     * of fieldTmp also needs to be filled in the communication above
     */
    EventTask fieldTmpEvent = fieldTmp->asyncCommunication(__getTransactionEvent());
    __setTransactionEvent(fieldTmpEvent);

    /* average the summed density in FieldTmp down to the local resolution and
     * write it into a new field
     */
    auto nlocal = dc.get< LocalDensity >( helperFields::LocalDensity::getName( speciesGroup ), true );
    constexpr uint32_t numWorkers = pmacc::traits::GetNumWorkers<
        pmacc::math::CT::volume< SuperCellSize >::type::value
    >::value;
    PMACC_KERNEL( helperFields::KernelAverageDensity< numWorkers >{ } )
    (
        // one block per averaged density value
        nlocal->getGridBuffer().getGridLayout().getDataSpaceWithoutGuarding(),
        numWorkers
    )
    (
        // start in the border (jump over the GUARD area)
        fieldTmp->getDeviceDataBox().shift( SuperCellSize::toRT() * GuardSize::toRT() ),
        // start in the border (has no GUARD area)
        nlocal->getGridBuffer().getDeviceBuffer( ).getDataBox( )
    );

    // release fields
    dc.releaseData( FieldTmp::getUniqueId( 0 ) );
    dc.releaseData( helperFields::LocalDensity::getName( speciesGroup ) );
}
/**
 * Resets all internal buffers.
 */
void reset()
{
    __startTransaction(__getTransactionEvent());
    frames->reset(false);
    frames->initialFillBuffer();
    EventTask ev1 = __endTransaction();

    __startTransaction(__getTransactionEvent());
    superCells->getDeviceBuffer().setValue(SuperCell<vint_t > ());
    superCells->getHostBuffer().setValue(SuperCell<vint_t > ());

    /* \todo: is this needed? On the device we set any new frame values to INVALID_INDEX
    nextFrames->getDeviceBuffer().setValue(INV_IDX);
    prevFrames->getDeviceBuffer().setValue(INV_IDX);
    nextFrames->getHostBuffer().setValue(INV_IDX);
    prevFrames->getHostBuffer().setValue(INV_IDX);
    */

    __setTransactionEvent(__endTransaction() + ev1);
}
HINLINE void operator()(Functor, Type* dest, Type* src, const size_t count,
                        MPI_Datatype type, MPI_Op op, MPI_Comm comm) const
{
    // avoid deadlock between not finished pmacc tasks and blocking MPI collectives
    __getTransactionEvent().waitForFinished();
    MPI_CHECK(MPI_Allreduce((void*) src, (void*) dest, count, type, op, comm));
}
/**
 * Starts copying data from device to host.
 */
void deviceToHost()
{
    __startTransaction(__getTransactionEvent());
    frames->deviceToHost();
    EventTask ev1 = __endTransaction();

    __startTransaction(__getTransactionEvent());
    superCells->deviceToHost();
    EventTask ev2 = __endTransaction();

    __startTransaction(__getTransactionEvent());
    nextFrames->deviceToHost();
    EventTask ev3 = __endTransaction();

    __startTransaction(__getTransactionEvent());
    prevFrames->deviceToHost();
    EventTask ev4 = __endTransaction();

    __setTransactionEvent(ev1 + ev2 + ev3 + ev4);
}
void operator()(ThreadParams& params,
                const std::string& name, T_Scalar value,
                const std::string& attrName = "", T_Attribute attribute = T_Attribute())
{
    log<picLog::INPUT_OUTPUT>("HDF5: write %1%D scalars: %2%") % simDim % name;

    // size over all processes
    Dimensions globalSize(1, 1, 1);
    // offset for this process
    Dimensions localOffset(0, 0, 0);
    // offset for all processes
    Dimensions globalOffset(0, 0, 0);

    for (uint32_t d = 0; d < simDim; ++d)
    {
        globalSize[d] = Environment<simDim>::get().GridController().getGpuNodes()[d];
        localOffset[d] = Environment<simDim>::get().GridController().getPosition()[d];
    }

    Dimensions localSize(1, 1, 1);

    // avoid deadlock between not finished pmacc tasks and blocking MPI calls
    __getTransactionEvent().waitForFinished();

    typename traits::PICToSplash<T_Scalar>::type splashType;
    params.dataCollector->writeDomain(params.currentStep,           /* id == time step */
                                      globalSize,                   /* total size of dataset over all processes */
                                      localOffset,                  /* write offset for this process */
                                      splashType,                   /* data type */
                                      simDim,                       /* NDims spatial dimensionality of the field */
                                      splash::Selection(localSize), /* data size of this process */
                                      name.c_str(),                 /* data set name */
                                      splash::Domain(
                                          globalOffset,             /* offset of the global domain */
                                          globalSize                /* size of the global domain */
                                      ),
                                      DomainCollector::GridType,
                                      &value);

    if(!attrName.empty())
    {
        /* simulation attribute for the data */
        typename traits::PICToSplash<T_Attribute>::type attType;

        log<picLog::INPUT_OUTPUT>("HDF5: write attribute %1% for scalars: %2%") % attrName % name;
        params.dataCollector->writeAttribute(params.currentStep,
                                             attType, name.c_str(), attrName.c_str(),
                                             &attribute);
    }
}
virtual void init()
{
    state = Init;
    EventTask serialEvent = __getTransactionEvent();

    for (int i = 1; i < Exchanges; ++i)
    {
        if (buffer.getGridBuffer().hasSendExchange(i))
        {
            __startAtomicTransaction(serialEvent);
            FieldFactory::getInstance().createTaskFieldSendExchange(buffer, i);
            tmpEvent += __endTransaction();
        }
    }
    state = WaitForSend;
}
void shiftParticles()
{
    StrideMapping<AREA, DIM3, MappingDesc> mapper(this->cellDescription);
    ParticlesBoxType pBox = particlesBuffer->getDeviceParticleBox();

    __startTransaction(__getTransactionEvent());
    do
    {
        __cudaKernel(kernelShiftParticles)
            (mapper.getGridDim(), TileSize)
            (pBox, mapper);
    }
    while (mapper.next());

    __setTransactionEvent(__endTransaction());
}
/** Read the scalar field and optionally the attribute into the values referenced by the pointers */
void operator()(ThreadParams& params,
                const std::string& name, T_Scalar* value,
                const std::string& attrName = "", T_Attribute* attribute = nullptr)
{
    log<picLog::INPUT_OUTPUT>("ADIOS: read %1%D scalars: %2%") % simDim % name;
    std::string datasetName = params.adiosBasePath + name;

    ADIOS_VARINFO* varInfo;
    ADIOS_CMD_EXPECT_NONNULL( varInfo = adios_inq_var(params.fp, datasetName.c_str()) );
    if(varInfo->ndim != simDim)
        throw std::runtime_error(std::string("Invalid dimensionality for ") + name);
    if(varInfo->type != traits::PICToAdios<T_Scalar>().type)
        throw std::runtime_error(std::string("Invalid type for ") + name);

    DataSpace<simDim> gridPos = Environment<simDim>::get().GridController().getPosition();
    uint64_t start[varInfo->ndim];
    uint64_t count[varInfo->ndim];
    for(int d = 0; d < varInfo->ndim; ++d)
    {
        /* \see adios_define_var: z,y,x in C-order */
        start[d] = gridPos.revert()[d];
        count[d] = 1;
    }

    ADIOS_SELECTION* fSel = adios_selection_boundingbox(varInfo->ndim, start, count);

    // avoid deadlock between not finished pmacc tasks and mpi calls in adios
    __getTransactionEvent().waitForFinished();

    /* specify what we want to read, but start reading below at `adios_perform_reads` */
    /* magic parameters (0, 1): `from_step` (not used in streams), `nsteps` to read (must be 1 for streams) */
    log<picLog::INPUT_OUTPUT>("ADIOS: schedule read of scalar %1%") % datasetName;
    ADIOS_CMD( adios_schedule_read(params.fp, fSel, datasetName.c_str(), 0, 1, (void*)value) );

    /* start a blocking read of all scheduled variables */
    ADIOS_CMD( adios_perform_reads(params.fp, 1) );

    adios_selection_delete(fSel);
    adios_free_varinfo(varInfo);

    if(!attrName.empty())
    {
        log<picLog::INPUT_OUTPUT>("ADIOS: read attribute %1% for scalars: %2%") % attrName % name;
        *attribute = readAttribute<T_Attribute>(params.fp, datasetName, attrName);
    }
}
void setCurrentSize(const size_t size)
{
    // set the host- and device-side current sizes in parallel
    EventTask split = __getTransactionEvent();

    __startTransaction(split);
    stackIndexer.getHostBuffer().setCurrentSize(size);
    stack.getHostBuffer().setCurrentSize(size);
    EventTask e1 = __endTransaction();

    __startTransaction(split);
    stackIndexer.getDeviceBuffer().setCurrentSize(size);
    EventTask e2 = __endTransaction();

    __startTransaction(split);
    stack.getDeviceBuffer().setCurrentSize(size);
    EventTask e3 = __endTransaction();

    __setTransactionEvent(e1 + e2 + e3);
}
HINLINE void operator()(T_StorageTuple& tuple,
                        const uint32_t currentStep,
                        const T_Event eventInt,
                        T_Event& updateEvent,
                        T_Event& commEvent) const
{
    typedef typename HasFlag<FrameType, particlePusher<> >::type hasPusher;
    if (hasPusher::value)
    {
        PMACC_AUTO(speciesPtr, tuple[SpeciesName()]);

        __startTransaction(eventInt);
        speciesPtr->update(currentStep);
        commEvent += speciesPtr->asyncCommunication(__getTransactionEvent());
        updateEvent += __endTransaction();
    }
}
/**
 * Starts synchronizing data from the own device buffer to the neighbor's device buffer.
 *
 * Asynchronously starts the synchronization of data from the internal DeviceBuffer
 * using the added Exchange buffers.
 * This operation runs sequentially relative to other code, but is internally asynchronous.
 */
EventTask communication()
{
    EventTask ev = this->asyncCommunication(__getTransactionEvent());
    __setTransactionEvent(ev);
    return ev;
}
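/* A hedged usage sketch contrasting the two entry points: communication()
 * serializes the exchange against subsequent tasks, while asyncCommunication()
 * allows manual overlap. `buffer` and `doIndependentWork` are hypothetical. */
buffer.communication();                            // later tasks wait for the exchange
/* versus manual overlap: */
EventTask ev = buffer.asyncCommunication(__getTransactionEvent());
doIndependentWork();                               // may run while data is exchanged
__setTransactionEvent(ev);                         // join once the data is needed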
HINLINE typename traits::GetValueType<Src>::ValueType operator()(Functor func, Src src, uint32_t n)
{
    /* - the result of a functor can be a reference or a const value
     * - it is not allowed to create const or reference memory
     * thus we remove the `reference` and `const` qualifiers
     */
    typedef typename boost::remove_const<
        typename boost::remove_reference<
            typename traits::GetValueType<Src>::ValueType
        >::type
    >::type Type;

    uint32_t blockcount = optimalThreadsPerBlock(n, sizeof (Type));
    uint32_t n_buffer = byte / sizeof (Type);
    /* the factor 2 lets us use every byte of the buffer, since the number of
     * blocks is later computed from threads / 2 */
    uint32_t threads = n_buffer * blockcount * 2;

    if (threads > n) threads = n;
    Type* dest = (Type*) reduceBuffer->getDeviceBuffer().getBasePointer();

    uint32_t blocks = threads / 2 / blockcount;
    if (blocks == 0) blocks = 1;
    __cudaKernel((kernel::reduce < Type >))(blocks, blockcount, blockcount * sizeof (Type))
        (src, n, dest, func, PMacc::nvidia::functors::Assign());
    n = blocks;
    blockcount = optimalThreadsPerBlock(n, sizeof (Type));
    blocks = n / 2 / blockcount;
    if (blocks == 0 && n > 1) blocks = 1;

    while (blocks != 0)
    {
        if (blocks > 1)
        {
            uint32_t blockOffset = ceil((double) blocks / blockcount);
            uint32_t useBlocks = blocks - blockOffset;
            uint32_t problemSize = n - (blockOffset * blockcount);
            Type* srcPtr = dest + (blockOffset * blockcount);
            __cudaKernel((kernel::reduce < Type >))(useBlocks, blockcount, blockcount * sizeof (Type))
                (srcPtr, problemSize, dest, func, func);
            blocks = blockOffset * blockcount;
        }
        else
        {
            __cudaKernel((kernel::reduce < Type >))(blocks, blockcount, blockcount * sizeof (Type))
                (dest, n, dest, func, PMacc::nvidia::functors::Assign());
        }

        n = blocks;
        blockcount = optimalThreadsPerBlock(n, sizeof (Type));
        blocks = n / 2 / blockcount;
        if (blocks == 0 && n > 1) blocks = 1;
    }

    reduceBuffer->deviceToHost();
    __getTransactionEvent().waitForFinished();
    return *((Type*) (reduceBuffer->getHostBuffer().getBasePointer()));
}
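/* A hedged usage sketch for the reduce operator() above, assuming it belongs
 * to PMacc's nvidia::reduce::Reduce class whose constructor takes the scratch
 * buffer size in bytes; `devicePtr` and `n` are hypothetical inputs, and the
 * exact namespace and functor name should be treated as assumptions. */
PMacc::nvidia::reduce::Reduce reduce( 1024 );
float sum = reduce( PMacc::nvidia::functors::Add(), devicePtr, n ); // blocks until the result is on the host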