/** Close gaps in the particle frames of the mapped area.
 *
 * Launches kernelFillGaps for the regular frame chain and then
 * kernelFillGapsLastFrame for the trailing frame of each supercell.
 * Both launches use the same area mapper built from this->cellDescription.
 */
void fillGaps()
{
    AreaMapping<AREA, MappingDesc> areaMapper(this->cellDescription);

    __cudaKernel(kernelFillGaps)
        (areaMapper.getGridDim(), TileSize)
        (particlesBuffer->getDeviceParticleBox(), areaMapper);

    __cudaKernel(kernelFillGapsLastFrame)
        (areaMapper.getGridDim(), TileSize)
        (particlesBuffer->getDeviceParticleBox(), areaMapper);
}
/** Create one particle in the cell addressed by a global cell index.
 *
 * Returns without any effect if the cell does not belong to the local
 * domain of this rank. Otherwise the global index is translated into a
 * supercell index (shifted by the guard) plus an in-supercell cell index,
 * kernelAddOneParticle is launched with a single thread, frame gaps are
 * closed, and the call blocks until the device work has finished.
 *
 * @param parClass particle species the particle is added to
 * @param cellDescription mapping description; supplies the guard offset
 * @param globalCell global cell index where the particle is created
 */
static void addOneParticle(ParticlesClass& parClass, MappingDesc cellDescription, DataSpace<simDim> globalCell)
{
    const SubGrid<simDim>& subGrid = Environment<simDim>::get().SubGrid();
    const DataSpace<simDim> localOffset = subGrid.getLocalDomain().offset;
    const DataSpace<simDim> localSize = subGrid.getLocalDomain().size;

    /* translate to a cell index relative to the local domain */
    DataSpace<simDim> cellIdx = globalCell - localOffset;

    /* check if the particle is inside the local simulation area */
    for (int d = 0; d < (int) simDim; ++d)
    {
        if (cellIdx[d] < 0 || cellIdx[d] >= localSize[d])
            return;
    }

    /* decompose into supercell index and cell index within the supercell */
    DataSpace<simDim> superCellIdx = (cellIdx / MappingDesc::SuperCellSize::toRT());
    DataSpace<simDim> cellInSuperCell = cellIdx - (superCellIdx * MappingDesc::SuperCellSize::toRT());

    /* shift by the guarding supercells to obtain the device-side index */
    superCellIdx = superCellIdx + cellDescription.getGuardingSuperCells();

    __cudaKernel(kernelAddOneParticle)
        (1, 1)
        (parClass.getDeviceParticlesBox(), superCellIdx, cellInSuperCell);

    parClass.fillAllGaps();

    std::cout << "Wait for add particle" << std::endl;
    __getTransactionEvent().waitForFinished();
}
/** Create one particle in the cell addressed by a global cell index
 * (legacy SubGrid::getInstance / DIM3 variant).
 *
 * Returns without any effect if the cell lies outside the local
 * simulation volume of this rank. Otherwise the global index is turned
 * into a guarded supercell index plus in-supercell cell index,
 * kernelAddOneParticle runs with a single thread, frame gaps are closed,
 * and the call blocks until the device has finished.
 *
 * @param parClass particle species the particle is added to
 * @param cellDescription mapping description; supplies the guard offset
 * @param globalCell global cell index where the particle is created
 */
static void addOneParticle(ParticlesClass& parClass, MappingDesc cellDescription, DataSpace<DIM3> globalCell)
{
    PMACC_AUTO(simBox, SubGrid<simDim>::getInstance().getSimulationBox());
    const DataSpace<DIM3> globalOffset = simBox.getGlobalOffset();
    const DataSpace<DIM3> localSize = simBox.getLocalSize();

    /* translate to a cell index relative to the local domain */
    DataSpace<DIM3> cellIdx = globalCell - globalOffset;

    /* check if the particle is inside the local simulation area */
    for (int d = 0; d < (int) DIM3; ++d)
    {
        if (cellIdx[d] < 0 || cellIdx[d] >= localSize[d])
            return;
    }

    /* decompose into supercell index and cell index within the supercell */
    DataSpace<DIM3> superCellIdx = (cellIdx / MappingDesc::SuperCellSize::getDataSpace());
    DataSpace<DIM3> cellInSuperCell = cellIdx - (superCellIdx * MappingDesc::SuperCellSize::getDataSpace());

    /* shift by the guarding supercells to obtain the device-side index */
    superCellIdx = superCellIdx + cellDescription.getGuardingSuperCells();

    __cudaKernel(kernelAddOneParticle)
        (1, 1)
        (parClass.getDeviceParticlesBox(), superCellIdx, cellInSuperCell);

    parClass.fillAllGaps();

    std::cout << "Wait for add particle" << std::endl;
    __getTransactionEvent().waitForFinished();
}
/** Move particles that left their cell into the correct supercell frames.
 *
 * Iterates the stride mapper over the mapped area, launching
 * kernelShiftParticles once per stride pass. All launches are grouped
 * into a single transaction so they appear as one event to callers.
 */
void shiftParticles()
{
    StrideMapping<AREA, DIM3, MappingDesc> strideMapper(this->cellDescription);
    ParticlesBoxType box = particlesBuffer->getDeviceParticleBox();

    __startTransaction(__getTransactionEvent());
    do
    {
        __cudaKernel(kernelShiftParticles)
            (strideMapper.getGridDim(), TileSize)
            (box, strideMapper);
    } while (strideMapper.next());
    __setTransactionEvent(__endTransaction());
}
/** Count particles in the mapped area that pass the given filter.
 *
 * A one-element device counter is created, kernelCountParticles
 * accumulates into it (one block per supercell, one thread per cell of
 * the supercell), and the value is copied back to the host.
 * NOTE(review): the counter is not explicitly zeroed here — presumably
 * GridBuffer construction or the kernel initializes it; confirm.
 *
 * @param buffer particle buffer to count in
 * @param cellDescription mapping description for the area mapper
 * @param filter predicate applied to each particle
 * @return number of particles matching the filter
 */
static uint64_cu countOnDevice(PBuffer& buffer, CellDesc cellDescription, Filter filter)
{
    GridBuffer<uint64_cu, DIM1> result(DataSpace<DIM1>(1));
    AreaMapping<AREA, CellDesc> areaMapper(cellDescription);
    dim3 threadsPerBlock(CellDesc::SuperCellSize::toRT().toDim3());

    __cudaKernel(kernelCountParticles)
        (areaMapper.getGridDim(), threadsPerBlock)
        (buffer.getDeviceParticlesBox(), result.getDeviceBuffer().getBasePointer(), filter, areaMapper);

    result.deviceToHost();
    return *(result.getHostBuffer().getDataBox());
}
/** Reduce n elements of src with func on the device and return the
 * scalar result on the host.
 *
 * Multi-pass driver: a first kernel pass reduces src into the device-side
 * reduce buffer (`dest`), then further passes fold `dest` onto itself until
 * a single value remains, which is copied to the host and returned.
 * Blocks until the device work has finished.
 *
 * @param func binary reduction functor (e.g. add/max)
 * @param src data source providing n values
 * @param n number of elements to reduce
 * @return reduced value of type GetValueType<Src>::ValueType
 *
 * NOTE(review): `byte` and `reduceBuffer` are members defined outside this
 * view — presumably the reduce buffer's size in bytes and the buffer
 * itself; confirm against the enclosing class.
 */
HINLINE typename traits::GetValueType<Src>::ValueType operator()(Functor func, Src src, uint32_t n)
{
    /* - the result of a functor can be a reference or a const value
     * - it is not allowed to create const or reference memory
     * thus we remove `references` and `const` qualifiers */
    typedef typename boost::remove_const<
        typename boost::remove_reference<
            typename traits::GetValueType<Src>::ValueType
        >::type
    >::type Type;

    uint32_t blockcount = optimalThreadsPerBlock(n, sizeof (Type));

    /* number of elements the reduce buffer can hold */
    uint32_t n_buffer = byte / sizeof (Type);

    uint32_t threads = n_buffer * blockcount * 2; /* x2 is used thus we can use all byte in Buffer, after we calculate threads/2 */
    if (threads > n) threads = n;

    /* device-side scratch/output buffer for partial results */
    Type* dest = (Type*) reduceBuffer->getDeviceBuffer().getBasePointer();

    /* each block reduces 2*blockcount elements; always launch at least one */
    uint32_t blocks = threads / 2 / blockcount;
    if (blocks == 0) blocks = 1;

    /* first pass: src -> dest, partial results assigned (not combined) */
    __cudaKernel((kernel::reduce < Type >))(blocks, blockcount, blockcount * sizeof (Type))(src, n, dest, func, PMacc::nvidia::functors::Assign());

    /* remaining problem size is one partial result per launched block */
    n = blocks;
    blockcount = optimalThreadsPerBlock(n, sizeof (Type));
    blocks = n / 2 / blockcount;
    if (blocks == 0 && n > 1) blocks = 1;

    while (blocks != 0)
    {
        if (blocks > 1)
        {
            /* reserve blockOffset leading blocks worth of elements as the
             * in-place output region; reduce the tail onto dest with func
             * so existing partials are combined, not overwritten */
            uint32_t blockOffset = ceil((double) blocks / blockcount);
            uint32_t useBlocks = blocks - blockOffset;
            uint32_t problemSize = n - (blockOffset * blockcount);
            Type* srcPtr = dest + (blockOffset * blockcount);
            __cudaKernel((kernel::reduce < Type >))(useBlocks, blockcount, blockcount * sizeof (Type))(srcPtr, problemSize, dest, func, func);
            blocks = blockOffset*blockcount;
        }
        else
        {
            /* final pass: collapse dest onto itself into a single value */
            __cudaKernel((kernel::reduce < Type >))(blocks, blockcount, blockcount * sizeof (Type))(dest, n, dest, func, PMacc::nvidia::functors::Assign());
        }

        /* recompute schedule for the next pass */
        n = blocks;
        blockcount = optimalThreadsPerBlock(n, sizeof (Type));
        blocks = n / 2 / blockcount;
        if (blocks == 0 && n > 1) blocks = 1;
    }

    /* copy the single remaining value back and wait for completion
     * before dereferencing the host pointer */
    reduceBuffer->deviceToHost();
    __getTransactionEvent().waitForFinished();
    return *((Type*) (reduceBuffer->getHostBuffer().getBasePointer()));
}