Exemple #1
0
 /**
  * Copies this data and the RingBuffer data from host to device.
  */
 void hostToDevice()
 {
     __startTransaction(__getTransactionEvent());
     ringBuffer->hostToDevice();
     EventTask ev1 = __endTransaction();
     __startTransaction(__getTransactionEvent());
     GridBuffer<VALUE, DIM1, BORDERVALUE>::hostToDevice();
     __setTransactionEvent(__endTransaction() + ev1);
 }
Exemple #2
0
 /**
  * Copies data and additional pointers from host to device.
  */
 void hostToDevice()
 {
     __startTransaction(__getTransactionEvent());
     ringDataSizes->hostToDevice();
     EventTask ev1 = __endTransaction();
     __startTransaction(__getTransactionEvent());
     ringData->hostToDevice();
     __setTransactionEvent(__endTransaction() + ev1);
 }
Exemple #3
0
 /**
  * Copies data and additional pointers from device to host.
  */
 void deviceToHost()
 {
     __startTransaction(__getTransactionEvent());
     ringDataSizes->deviceToHost();
     EventTask ev1 = __endTransaction();
     __startTransaction(__getTransactionEvent());
     ringData->deviceToHost();
     __setTransactionEvent(__endTransaction() + ev1);
 }
Exemple #4
0
    /**
     * Notifies registered output classes.
     *
     * This function is called automatically.
     *
     *  @param currentStep simulation step
     */
    virtual void dumpOneStep(uint32_t currentStep)
    {
        /* trigger notification */
        Environment<DIM>::get().PluginConnector().notifyPlugins(currentStep);

        /* trigger checkpoint notification */
        if(
            !checkpointPeriod.empty() &&
            pluginSystem::containsStep(
                seqCheckpointPeriod,
                currentStep
            )
        )
        {
            /* first synchronize: if something failed, we can spare the time
             * for the checkpoint writing */
            CUDA_CHECK(cudaDeviceSynchronize());
            CUDA_CHECK(cudaGetLastError());

            // avoid deadlock between not finished PMacc tasks and MPI_Barrier
            __getTransactionEvent().waitForFinished();

            GridController<DIM> &gc = Environment<DIM>::get().GridController();
            /* can be spared for better scalings, but allows to spare the
             * time for checkpointing if some ranks died */
            MPI_CHECK(MPI_Barrier(gc.getCommunicator().getMPIComm()));

            /* create directory containing checkpoints  */
            if (numCheckpoints == 0)
            {
                Environment<DIM>::get().Filesystem().createDirectoryWithPermissions(checkpointDirectory);
            }

            Environment<DIM>::get().PluginConnector().checkpointPlugins(currentStep,
                                                                        checkpointDirectory);

            /* important synchronize: only if no errors occured until this
             * point guarantees that a checkpoint is usable */
            CUDA_CHECK(cudaDeviceSynchronize());
            CUDA_CHECK(cudaGetLastError());

            /* avoid deadlock between not finished PMacc tasks and MPI_Barrier */
            __getTransactionEvent().waitForFinished();

            /* \todo in an ideal world with MPI-3, this would be an
             * MPI_Ibarrier call and this function would return a MPI_Request
             * that could be checked */
            MPI_CHECK(MPI_Barrier(gc.getCommunicator().getMPIComm()));

            if (gc.getGlobalRank() == 0)
            {
                writeCheckpointStep(currentStep);
            }
            numCheckpoints++;
        }
    }
Exemple #5
0
    /**
     * Copies this data and the RingBuffer data from device to host.
     */
    void deviceToHost()
    {
        __startTransaction(__getTransactionEvent());
        ringBuffer->deviceToHost();
        EventTask ev1 = __endTransaction();

        __startTransaction(__getTransactionEvent());
        GridBuffer<VALUE, DIM1, BORDERVALUE>::deviceToHost();
        EventTask ev2 = __endTransaction();

        __setTransactionEvent(ev1 + ev2);
    }
            /* host constructor initializing member : random number generator */
            ThomasFermi_Impl(const uint32_t currentStep) : randomGen(RNGFactory::createRandom<Distribution>())
            {
                /* create handle for access to host and device data */
                DataConnector &dc = Environment<>::get().DataConnector();

                /* The compiler is allowed to evaluate an expression that does not depend on a template parameter
                 * even if the class is never instantiated. In that case static assert is always
                 * evaluated (e.g. with clang), this results in an error if the condition is false.
                 * http://www.boost.org/doc/libs/1_60_0/doc/html/boost_staticassert.html
                 *
                 * A workaround is to add a template dependency to the expression.
                 * `sizeof(ANY_TYPE) != 0` is always true and defers the evaluation.
                 */
                PMACC_CASSERT_MSG(
                    _please_allocate_at_least_two_FieldTmp_slots_in_memory_param,
                    ( fieldTmpNumSlots >= 2 ) && ( sizeof( T_IonizationAlgorithm ) != 0 )
                );
                /* initialize pointers on host-side density-/energy density field databoxes */
                auto density = dc.get< FieldTmp >( FieldTmp::getUniqueId( 0 ), true );
                auto eneKinDens = dc.get< FieldTmp >( FieldTmp::getUniqueId( 1 ), true );

                /* reset density and kinetic energy values to zero */
                density->getGridBuffer().getDeviceBuffer().setValue( FieldTmp::ValueType( 0. ) );
                eneKinDens->getGridBuffer().getDeviceBuffer().setValue( FieldTmp::ValueType( 0. ) );

                /* load species without copying the particle data to the host */
                auto srcSpecies = dc.get< SrcSpecies >( SrcSpecies::FrameType::getName(), true );

                /* kernel call for weighted ion density calculation */
                density->template computeValue< CORE + BORDER, DensitySolver >(*srcSpecies, currentStep);
                dc.releaseData( SrcSpecies::FrameType::getName() );
                EventTask densityEvent = density->asyncCommunication( __getTransactionEvent() );
                densityEvent += density->asyncCommunicationGather( densityEvent );

                /* load species without copying the particle data to the host */
                auto destSpecies = dc.get< DestSpecies >( DestSpecies::FrameType::getName(), true );

                /* kernel call for weighted electron energy density calculation */
                eneKinDens->template computeValue< CORE + BORDER, EnergyDensitySolver >(*destSpecies, currentStep);
                dc.releaseData( DestSpecies::FrameType::getName() );
                EventTask eneKinEvent = eneKinDens->asyncCommunication( __getTransactionEvent() );
                eneKinEvent += eneKinDens->asyncCommunicationGather( eneKinEvent );

                /* contributions from neighboring GPUs to our border area */
                __setTransactionEvent( densityEvent + eneKinEvent );

                /* initialize device-side density- and energy density field databox pointers */
                rhoBox = density->getDeviceDataBox();
                eneBox = eneKinDens->getDeviceDataBox();

            }
    static void addOneParticle(ParticlesClass& parClass, MappingDesc cellDescription, DataSpace<DIM3> globalCell)
    {

        PMACC_AUTO(simBox, SubGrid<simDim>::getInstance().getSimulationBox());
        const DataSpace<DIM3> globalTopLeft = simBox.getGlobalOffset();
        const DataSpace<DIM3> localSimulationArea = simBox.getLocalSize();
        DataSpace<DIM3> localParCell = globalCell - globalTopLeft;


        for (int i = 0; i < (int) DIM3; ++i)
        {
            //chek if particle is in the simulation area
            if (localParCell[i] < 0 || localParCell[i] >= localSimulationArea[i])
                return;
        }

        //calculate supercell 
        DataSpace<DIM3> localSuperCell = (localParCell / MappingDesc::SuperCellSize::getDataSpace());
        DataSpace<DIM3> cellInSuperCell = localParCell - (localSuperCell * MappingDesc::SuperCellSize::getDataSpace());
        //add garding blocks to supercell 
        localSuperCell = localSuperCell + cellDescription.getGuardingSuperCells();


        __cudaKernel(kernelAddOneParticle)
            (1, 1)
            (parClass.getDeviceParticlesBox(),
             localSuperCell, cellInSuperCell);

        parClass.fillAllGaps();

        std::cout << "Wait for add particle" << std::endl;
        __getTransactionEvent().waitForFinished();
    }
Exemple #8
0
    void operator()(ThreadParams& params,
                const std::string& name, T_Scalar* value,
                const std::string& attrName = "", T_Attribute* attribute = nullptr)
    {
        log<picLog::INPUT_OUTPUT>("HDF5: read %1%D scalars: %2%") % simDim % name;

        Dimensions domain_offset(0, 0, 0);
        for (uint32_t d = 0; d < simDim; ++d)
            domain_offset[d] = Environment<simDim>::get().GridController().getPosition()[d];

        // avoid deadlock between not finished pmacc tasks and mpi calls in adios
        __getTransactionEvent().waitForFinished();

        DomainCollector::DomDataClass data_class;
        DataContainer *dataContainer =
            params.dataCollector->readDomain(params.currentStep,
                                               name.c_str(),
                                               Domain(domain_offset, Dimensions(1, 1, 1)),
                                               &data_class);

        typename traits::PICToSplash<T_Scalar>::type splashType;
        *value = *static_cast<T_Scalar*>(dataContainer->getIndex(0)->getData());
        __delete(dataContainer);

        if(!attrName.empty())
        {
            log<picLog::INPUT_OUTPUT>("HDF5: read attribute %1% for scalars: %2%") % attrName % name;
            params.dataCollector->readAttributeInfo(params.currentStep, name.c_str(), attrName.c_str()).read(attribute, sizeof(T_Attribute));
            log<picLog::INPUT_OUTPUT>("HDF5: attribute %1% = %2%") % attrName % *attribute;
        }
    }
 TaskFieldReceiveAndInsertExchange(Field &buffer, uint32_t exchange) :
 m_buffer(buffer),
 m_exchange(exchange),
 m_state(Constructor),
 initDependency(__getTransactionEvent())
 {
 }
    static void addOneParticle(ParticlesClass& parClass, MappingDesc cellDescription, DataSpace<simDim> globalCell)
    {

        const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
        const DataSpace<simDim> globalTopLeft = subGrid.getLocalDomain().offset;
        const DataSpace<simDim> localSimulationArea = subGrid.getLocalDomain().size;
        DataSpace<simDim> localParCell = globalCell - globalTopLeft;


        for (int i = 0; i < (int) simDim; ++i)
        {
            //chek if particle is in the simulation area
            if (localParCell[i] < 0 || localParCell[i] >= localSimulationArea[i])
                return;
        }

        //calculate supercell
        DataSpace<simDim> localSuperCell = (localParCell / MappingDesc::SuperCellSize::toRT());
        DataSpace<simDim> cellInSuperCell = localParCell - (localSuperCell * MappingDesc::SuperCellSize::toRT());
        //add garding blocks to supercell
        localSuperCell = localSuperCell + cellDescription.getGuardingSuperCells();


        __cudaKernel(kernelAddOneParticle)
            (1, 1)
            (parClass.getDeviceParticlesBox(),
             localSuperCell, cellInSuperCell);

        parClass.fillAllGaps();

        std::cout << "Wait for add particle" << std::endl;
        __getTransactionEvent().waitForFinished();
    }
    void oneStep(uint32_t currentStep, Buffer* read, Buffer* write)
    {
        PMACC_AUTO(splitEvent, __getTransactionEvent());
        /* GridBuffer 'read' will use 'splitEvent' to schedule transaction    *
         * tasks from the Guard of this local Area to the Borders of the      *
         * neighboring areas added by 'addExchange'. All transactions in      *
         * Transaction Manager will then be done in parallel to the           *
         * calculations in the core. In order to synchronize the data         *
         * transfer for the case the core calculation is finished earlier,    *
         * GridBuffer.asyncComm returns a transaction handle we can check     */
        PMACC_AUTO(send, read->asyncCommunication(splitEvent));
        evo.run<CORE>( read->getDeviceBuffer().getDataBox(),
                       write->getDeviceBuffer().getDataBox() );
        /* Join communication with worker tasks, Now all next tasks run sequential */
        __setTransactionEvent(send);
        /* Calculate Borders */
        evo.run<BORDER>( read->getDeviceBuffer().getDataBox(),
                         write->getDeviceBuffer().getDataBox() );
        write->deviceToHost();

        /* gather::operator() gathers all the buffers and assembles those to  *
         * a complete picture discarding the guards.                          */
        PMACC_AUTO(picture, gather(write->getHostBuffer().getDataBox()));
        PngCreator png;
        if (isMaster) png(currentStep, picture, gridSize);

    }
 TaskReceiveParticlesExchange(ParBase &parBase, uint32_t exchange) :
 parBase(parBase),
 exchange(exchange),
 state(Constructor),
 maxSize(parBase.getParticlesBuffer().getReceiveExchangeStack(exchange).getMaxParticlesCount()),
 initDependency(__getTransactionEvent()),
 lastSize(0) { }
 TaskFieldSendExchange(Field &buffer, uint32_t exchange) :
 buffer(buffer),
 exchange(exchange),
 state(Constructor),
 initDependency(__getTransactionEvent())
 {
 }
        /** Functor
         *
         * @param currentStep the current time step
         * @param speciesGroup naming for the group of species in T_SpeciesList
         */
        void operator()(
            uint32_t currentStep,
            std::string const & speciesGroup
        )
        {
            // generating a density requires at least one slot in FieldTmp
            PMACC_CASSERT_MSG(
                _please_allocate_at_least_one_FieldTmp_in_memory_param,
                fieldTmpNumSlots > 0
            );

            DataConnector &dc = Environment<>::get().DataConnector();

            // load FieldTmp without copy data to host and zero it
            auto fieldTmp = dc.get< FieldTmp >(
                FieldTmp::getUniqueId( 0 ),
                true
            );
            using DensityValueType = typename FieldTmp::ValueType;
            fieldTmp->getGridBuffer().getDeviceBuffer().setValue( DensityValueType::create(0.0) );

            // add density of each species in list to FieldTmp
            ForEach< SpeciesList, detail::AddSingleDensity< bmpl::_1 > > addSingleDensity;
            addSingleDensity( currentStep, forward( fieldTmp ) );

            /* create valid density in the BORDER region
             * note: for average != supercell multiples the GUARD of fieldTmp
             *       also needs to be filled in the communication above
             */
            EventTask fieldTmpEvent = fieldTmp->asyncCommunication(__getTransactionEvent());
            __setTransactionEvent(fieldTmpEvent);

            /* average summed density in FieldTmp down to local resolution and
             * write in new field
             */
            auto nlocal = dc.get< LocalDensity >(
                helperFields::LocalDensity::getName( speciesGroup ),
                true
            );
            constexpr uint32_t numWorkers = pmacc::traits::GetNumWorkers<
                pmacc::math::CT::volume< SuperCellSize >::type::value
            >::value;
            PMACC_KERNEL( helperFields::KernelAverageDensity< numWorkers >{ } )
            (
                // one block per averaged density value
                nlocal->getGridBuffer().getGridLayout().getDataSpaceWithoutGuarding(),
                numWorkers
            )
            (
                // start in border (jump over GUARD area)
                fieldTmp->getDeviceDataBox().shift( SuperCellSize::toRT() * GuardSize::toRT() ),
                // start in border (has no GUARD area)
                nlocal->getGridBuffer().getDeviceBuffer( ).getDataBox( )
            );

            // release fields
            dc.releaseData( FieldTmp::getUniqueId( 0 ) );
            dc.releaseData( helperFields::LocalDensity::getName( speciesGroup ) );
        }
Exemple #15
0
    /**
     * Resets all internal buffers.
     */
    void reset()
    {
        __startTransaction(__getTransactionEvent());
        frames->reset(false);
        frames->initialFillBuffer();
        EventTask ev1 = __endTransaction();
        __startTransaction(__getTransactionEvent());
        superCells->getDeviceBuffer().setValue(SuperCell<vint_t > ());
        superCells->getHostBuffer().setValue(SuperCell<vint_t > ());

        /*nextFrames->getDeviceBuffer().setValue(INV_IDX);//!\todo: is this needed? On device we set any new frame values to INVALID_INDEX
        prevFrames->getDeviceBuffer().setValue(INV_IDX);//!\todo: is this needed? On device we set any new frame values to INVALID_INDEX
        nextFrames->getHostBuffer().setValue(INV_IDX);//!\todo: is this needed? On device we set any new frame values to INVALID_INDEX
        prevFrames->getHostBuffer().setValue(INV_IDX);//!\todo: is this needed? On device we set any new frame values to INVALID_INDEX
         */
        __setTransactionEvent(__endTransaction() + ev1);
    }
Exemple #16
0
 HINLINE void operator()(Functor, Type* dest, Type* src, const size_t count, MPI_Datatype type, MPI_Op op, MPI_Comm comm) const
 {
     // avoid deadlock between not finished pmacc tasks and mpi blocking collectives
     __getTransactionEvent().waitForFinished();
     MPI_CHECK(MPI_Allreduce((void*) src,
                             (void*) dest,
                             count,
                             type,
                             op, comm));
 }
Exemple #17
0
    /**
     * Starts copying data from device to host.
     */
    void deviceToHost()
    {

        __startTransaction(__getTransactionEvent());
        frames->deviceToHost();
        EventTask ev1 = __endTransaction();

        __startTransaction(__getTransactionEvent());
        superCells->deviceToHost();
        EventTask ev2 = __endTransaction();

        __startTransaction(__getTransactionEvent());
        nextFrames->deviceToHost();
        EventTask ev3 = __endTransaction();

        __startTransaction(__getTransactionEvent());
        prevFrames->deviceToHost();
        EventTask ev4 = __endTransaction();

        __setTransactionEvent(ev1 + ev2 + ev3 + ev4);
    }
Exemple #18
0
    void operator()(ThreadParams& params,
            const std::string& name, T_Scalar value,
            const std::string& attrName = "", T_Attribute attribute = T_Attribute())
    {
        log<picLog::INPUT_OUTPUT>("HDF5: write %1%D scalars: %2%") % simDim % name;

        // Size over all processes
        Dimensions globalSize(1, 1, 1);
        // Offset for this process
        Dimensions localOffset(0, 0, 0);
        // Offset for all processes
        Dimensions globalOffset(0, 0, 0);

        for (uint32_t d = 0; d < simDim; ++d)
        {
            globalSize[d] = Environment<simDim>::get().GridController().getGpuNodes()[d];
            localOffset[d] = Environment<simDim>::get().GridController().getPosition()[d];
        }

        Dimensions localSize(1, 1, 1);

        // avoid deadlock between not finished pmacc tasks and mpi calls in adios
        __getTransactionEvent().waitForFinished();

        typename traits::PICToSplash<T_Scalar>::type splashType;
        params.dataCollector->writeDomain(params.currentStep,            /* id == time step */
                                           globalSize,                   /* total size of dataset over all processes */
                                           localOffset,                  /* write offset for this process */
                                           splashType,                   /* data type */
                                           simDim,                       /* NDims spatial dimensionality of the field */
                                           splash::Selection(localSize), /* data size of this process */
                                           name.c_str(),                 /* data set name */
                                           splash::Domain(
                                                  globalOffset,          /* offset of the global domain */
                                                  globalSize             /* size of the global domain */
                                           ),
                                           DomainCollector::GridType,
                                           &value);

        if(!attrName.empty())
        {
            /*simulation attribute for data*/
            typename traits::PICToSplash<T_Attribute>::type attType;

            log<picLog::INPUT_OUTPUT>("HDF5: write attribute %1% for scalars: %2%") % attrName % name;
            params.dataCollector->writeAttribute(params.currentStep,
                                                  attType, name.c_str(),
                                                  attrName.c_str(), &attribute);
        }
    }
        virtual void init()
        {
            state = Init;
            EventTask serialEvent = __getTransactionEvent();

            for (int i = 1; i < Exchanges; ++i)
            {
                if (buffer.getGridBuffer().hasSendExchange(i))
                {
                    __startAtomicTransaction(serialEvent);
                    FieldFactory::getInstance().createTaskFieldSendExchange(buffer, i);
                    tmpEvent += __endTransaction();
                }
            }
            state = WaitForSend;
        }
Exemple #20
0
    void shiftParticles()
    {
        StrideMapping<AREA, DIM3, MappingDesc> mapper(this->cellDescription);
        ParticlesBoxType pBox = particlesBuffer->getDeviceParticleBox();

        __startTransaction(__getTransactionEvent());
        do
        {
            __cudaKernel(kernelShiftParticles)
                (mapper.getGridDim(), TileSize)
                (pBox, mapper);
        }
        while (mapper.next());

        __setTransactionEvent(__endTransaction());

    }
Exemple #21
0
    /** Read the skalar field and optionally the attribute into the values referenced by the pointers */
    void operator()(ThreadParams& params,
                const std::string& name, T_Scalar* value,
                const std::string& attrName = "", T_Attribute* attribute = nullptr)
    {
        log<picLog::INPUT_OUTPUT> ("ADIOS: read %1%D scalars: %2%") % simDim % name;
        std::string datasetName = params.adiosBasePath + name;

        ADIOS_VARINFO* varInfo;
        ADIOS_CMD_EXPECT_NONNULL( varInfo = adios_inq_var(params.fp, datasetName.c_str()) );
        if(varInfo->ndim != simDim)
            throw std::runtime_error(std::string("Invalid dimensionality for ") + name);
        if(varInfo->type != traits::PICToAdios<T_Scalar>().type)
            throw std::runtime_error(std::string("Invalid type for ") + name);

        DataSpace<simDim> gridPos = Environment<simDim>::get().GridController().getPosition();
        uint64_t start[varInfo->ndim];
        uint64_t count[varInfo->ndim];
        for(int d = 0; d < varInfo->ndim; ++d)
        {
            /* \see adios_define_var: z,y,x in C-order */
            start[d] = gridPos.revert()[d];
            count[d] = 1;
        }

        ADIOS_SELECTION* fSel = adios_selection_boundingbox(varInfo->ndim, start, count);

        // avoid deadlock between not finished pmacc tasks and mpi calls in adios
        __getTransactionEvent().waitForFinished();

        /* specify what we want to read, but start reading at below at `adios_perform_reads` */
        /* magic parameters (0, 1): `from_step` (not used in streams), `nsteps` to read (must be 1 for stream) */
        log<picLog::INPUT_OUTPUT > ("ADIOS: Schedule read skalar %1%)") % datasetName;
        ADIOS_CMD( adios_schedule_read(params.fp, fSel, datasetName.c_str(), 0, 1, (void*)value) );

        /* start a blocking read of all scheduled variables */
        ADIOS_CMD( adios_perform_reads(params.fp, 1) );

        adios_selection_delete(fSel);
        adios_free_varinfo(varInfo);

        if(!attrName.empty())
        {
            log<picLog::INPUT_OUTPUT> ("ADIOS: read attribute %1% for scalars: %2%") % attrName % name;
            *attribute = readAttribute<T_Attribute>(params.fp, datasetName, attrName);
        }
    }
    void setCurrentSize(const size_t size)
    {
        // do host and device setCurrentSize parallel
        EventTask split = __getTransactionEvent();
        __startTransaction(split);
        stackIndexer.getHostBuffer().setCurrentSize(size);
        stack.getHostBuffer().setCurrentSize(size);
        EventTask e1 = __endTransaction();

        __startTransaction(split);
        stackIndexer.getDeviceBuffer().setCurrentSize(size);
        EventTask e2 = __endTransaction();
        __startTransaction(split);
        stack.getDeviceBuffer().setCurrentSize(size);
        EventTask e3 = __endTransaction();

        __setTransactionEvent(e1 + e2 + e3);
    }
    HINLINE void operator()(
                            T_StorageTuple& tuple,
                            const uint32_t currentStep,
                            const T_Event eventInt,
                            T_Event& updateEvent,
                            T_Event& commEvent
                            ) const
    {
        typedef typename HasFlag<FrameType, particlePusher<> >::type hasPusher;
        if (hasPusher::value)
        {
            PMACC_AUTO(speciesPtr, tuple[SpeciesName()]);

            __startTransaction(eventInt);
            speciesPtr->update(currentStep);
            commEvent += speciesPtr->asyncCommunication(__getTransactionEvent());
            updateEvent += __endTransaction();
        }
    }
Exemple #24
0
 /**
  * Starts sync data from own device buffer to neigbhor device buffer.
  *
  * Asynchronously starts syncronization data from internal DeviceBuffer using added
  * Exchange buffers.
  * This operation runs sequential to other code but intern asyncron
  *
  */
 EventTask communication()
 {
     EventTask ev = this->asyncCommunication(__getTransactionEvent());
     __setTransactionEvent(ev);
     return ev;
 }
Exemple #25
0
                HINLINE typename traits::GetValueType<Src>::ValueType operator()(Functor func, Src src, uint32_t n)
                {
                   /* - the result of a functor can be a reference or a const value
                    * - it is not allowed to create const or reference memory
                    *   thus we remove `references` and `const` qualifiers */
                   typedef typename boost::remove_const<
                               typename boost::remove_reference<
                                   typename traits::GetValueType<Src>::ValueType
                               >::type
                           >::type Type;

                    uint32_t blockcount = optimalThreadsPerBlock(n, sizeof (Type));

                    uint32_t n_buffer = byte / sizeof (Type);

                    uint32_t threads = n_buffer * blockcount * 2; /* x2 is used thus we can use all byte in Buffer, after we calculate threads/2 */



                    if (threads > n) threads = n;
                    Type* dest = (Type*) reduceBuffer->getDeviceBuffer().getBasePointer();

                    uint32_t blocks = threads / 2 / blockcount;
                    if (blocks == 0) blocks = 1;
                    __cudaKernel((kernel::reduce < Type >))(blocks, blockcount, blockcount * sizeof (Type))(src, n, dest, func,
                                                                                                            PMacc::nvidia::functors::Assign());
                    n = blocks;
                    blockcount = optimalThreadsPerBlock(n, sizeof (Type));
                    blocks = n / 2 / blockcount;
                    if (blocks == 0 && n > 1) blocks = 1;


                    while (blocks != 0)
                    {
                        if (blocks > 1)
                        {
                            uint32_t blockOffset = ceil((double) blocks / blockcount);
                            uint32_t useBlocks = blocks - blockOffset;
                            uint32_t problemSize = n - (blockOffset * blockcount);
                            Type* srcPtr = dest + (blockOffset * blockcount);

                            __cudaKernel((kernel::reduce < Type >))(useBlocks, blockcount, blockcount * sizeof (Type))(srcPtr, problemSize, dest, func, func);
                            blocks = blockOffset*blockcount;
                        }
                        else
                        {

                            __cudaKernel((kernel::reduce < Type >))(blocks, blockcount, blockcount * sizeof (Type))(dest, n, dest, func,
                                                                                                                    PMacc::nvidia::functors::Assign());
                        }

                        n = blocks;
                        blockcount = optimalThreadsPerBlock(n, sizeof (Type));
                        blocks = n / 2 / blockcount;
                        if (blocks == 0 && n > 1) blocks = 1;
                    }

                    reduceBuffer->deviceToHost();
                    __getTransactionEvent().waitForFinished();
                    return *((Type*) (reduceBuffer->getHostBuffer().getBasePointer()));

                }