__global__ void
kernelParticleDensity(ParBox pb,
                      DataBox<PitchedBox<Type_, DIM2> > image,
                      DataSpace<DIM2> transpose,
                      int slice,
                      uint32_t globalOffset,
                      uint32_t sliceDim,
                      Mapping mapper)
{
    typedef typename ParBox::FrameType FRAME;
    typedef typename MappingDesc::SuperCellSize Block;
    __shared__ FRAME *frame;
    __shared__ bool isValid;
    __syncthreads(); /*wait that all shared memory is initialised*/

    bool isImageThread = false;

    const DataSpace<simDim> threadId(threadIdx);
    const DataSpace<DIM2> localCell(threadId[transpose.x()], threadId[transpose.y()]);
    const DataSpace<simDim> block = mapper.getSuperCellIndex(DataSpace<simDim > (blockIdx));
    const DataSpace<simDim> blockOffset((block - 1) * Block::getDataSpace());


    int localId = threadIdx.z * Block::x * Block::y + threadIdx.y * Block::x + threadIdx.x;


    if (localId == 0)
        isValid = false;
    __syncthreads();

    //\todo: guard size should not be set to (fixed) 1 here
    const DataSpace<simDim> realCell(blockOffset + threadId); //delete guard from cell idx


    uint32_t globalCell = realCell[sliceDim] + globalOffset;

    if (globalCell == slice)
    {
        isValid = true;
        isImageThread = true;
    }
    __syncthreads();

    if (!isValid)
        return;

    /*index in image*/
    DataSpace<DIM2> imageCell(
                              realCell[transpose.x()],
                              realCell[transpose.y()]);


    // counter is always DIM2
    typedef DataBox < PitchedBox< float_X, DIM2 > > SharedMem;
    extern __shared__ float_X shBlock[];
    __syncthreads(); /*wait that all shared memory is initialised*/

    const DataSpace<simDim> blockSize(blockDim);
    SharedMem counter(PitchedBox<float_X, DIM2 > ((float_X*) shBlock,
                                              DataSpace<DIM2 > (),
                                              blockSize[transpose.x()] * sizeof (float_X)));

    if (isImageThread)
    {
        counter(localCell) = float_X(0.0);
    }


    if (localId == 0)
    {
        frame = &(pb.getFirstFrame(block, isValid));
    }
    __syncthreads();

    while (isValid) //move over all Frames
    {
        PMACC_AUTO(particle,(*frame)[localId]);
        if (particle[multiMask_] == 1)
        {
            int cellIdx = particle[localCellIdx_];
            // we only draw the first slice of cells in the super cell (z == 0)
            const DataSpace<DIM3> particleCellId(DataSpaceOperations<DIM3>::template map<Block > (cellIdx));
            uint32_t globalParticleCell = particleCellId[sliceDim] + globalOffset + blockOffset[sliceDim];
            if (globalParticleCell == slice)
            {
                const DataSpace<DIM2> reducedCell(particleCellId[transpose.x()], particleCellId[transpose.y()]);
                atomicAddWrapper(&(counter(reducedCell)), particle[weighting_] / NUM_EL_PER_PARTICLE);
            }
        }
        __syncthreads();

        if (localId == 0)
        {
            frame = &(pb.getNextFrame(*frame, isValid));
        }
        __syncthreads();
    }


    if (isImageThread)
    {
        image(imageCell) = (Type_) counter(localCell);
    }
}
Example #2
0
__global__ void
kernelPaintParticles3D(ParBox pb,
                       DataBox<PitchedBox<float3_X, DIM2> > image,
                       DataSpace<DIM2> transpose,
                       int slice,
                       uint32_t globalOffset,
                       uint32_t sliceDim,
                       Mapping mapper)
{
    typedef typename ParBox::FrameType FRAME;
    typedef typename MappingDesc::SuperCellSize Block;
    __shared__ FRAME *frame;
    __shared__ bool isValid;
    __syncthreads(); /*wait that all shared memory is initialised*/
    bool isImageThread = false;

    const DataSpace<simDim> threadId(threadIdx);
    const DataSpace<DIM2> localCell(threadId[transpose.x()], threadId[transpose.y()]);
    const DataSpace<simDim> block = mapper.getSuperCellIndex(DataSpace<simDim > (blockIdx));
    const DataSpace<simDim> blockOffset((block - 1) * Block::getDataSpace());


    int localId = threadIdx.z * Block::x * Block::y + threadIdx.y * Block::x + threadIdx.x;


    if (localId == 0)
        isValid = false;
    __syncthreads();

    //\todo: guard size should not be set to (fixed) 1 here
    const DataSpace<simDim> realCell(blockOffset + threadId); //delete guard from cell idx

#if(SIMDIM==DIM3)
    uint32_t globalCell = realCell[sliceDim] + globalOffset;

    if (globalCell == slice)
#endif
    {
        atomicExch((int*) &isValid, 1); /*WAW Error in cuda-memcheck racecheck*/
        isImageThread = true;
    }
    __syncthreads();

    if (!isValid)
        return;

    /*index in image*/
    DataSpace<DIM2> imageCell(
                              realCell[transpose.x()],
                              realCell[transpose.y()]);


    // counter is always DIM2
    typedef DataBox < PitchedBox< float_X, DIM2 > > SharedMem;
    extern __shared__ float_X shBlock[];
    __syncthreads(); /*wait that all shared memory is initialised*/

    const DataSpace<simDim> blockSize(blockDim);
    SharedMem counter(PitchedBox<float_X, DIM2 > ((float_X*) shBlock,
                                                  DataSpace<DIM2 > (),
                                                  blockSize[transpose.x()] * sizeof (float_X)));

    if (isImageThread)
    {
        counter(localCell) = float_X(0.0);
    }


    if (localId == 0)
    {
        frame = &(pb.getFirstFrame(block, isValid));
    }
    __syncthreads();

    while (isValid) //move over all Frames
    {
        PMACC_AUTO(particle,(*frame)[localId]);
        if (particle[multiMask_] == 1)
        {
            int cellIdx = particle[localCellIdx_];
            // we only draw the first slice of cells in the super cell (z == 0)
            const DataSpace<simDim> particleCellId(DataSpaceOperations<simDim>::template map<Block > (cellIdx));
#if(SIMDIM==DIM3)
            uint32_t globalParticleCell = particleCellId[sliceDim] + globalOffset + blockOffset[sliceDim];
            if (globalParticleCell == slice)
#endif
            {
                const DataSpace<DIM2> reducedCell(particleCellId[transpose.x()], particleCellId[transpose.y()]);
                atomicAddWrapper(&(counter(reducedCell)), particle[weighting_] / NUM_EL_PER_PARTICLE);
            }
        }
        __syncthreads();

        if (localId == 0)
        {
            frame = &(pb.getNextFrame(*frame, isValid));
        }
        __syncthreads();
    }


    if (isImageThread)
    {
        /** Note: normally, we would multiply by NUM_EL_PER_PARTICLE again.
         *  BUT: since we are interested in a simple value between 0 and 1,
         *       we stay with this number (normalized to the order of macro
         *       particles) and devide by the number of typical macro particles
         *       per cell
         */
        float_X value = counter(localCell)
            / float_X(particleInit::NUM_PARTICLES_PER_CELL); // * NUM_EL_PER_PARTICLE;
        if (value > 1.0) value = 1.0;

        //image(imageCell).x() = value;
        visPreview::preParticleDensCol::addRGB(image(imageCell),
                                               value,
                                               visPreview::preParticleDens_opacity);

        // cut to [0, 1]
        if (image(imageCell).x() < float_X(0.0)) image(imageCell).x() = float_X(0.0);
        if (image(imageCell).x() > float_X(1.0)) image(imageCell).x() = float_X(1.0);
        if (image(imageCell).y() < float_X(0.0)) image(imageCell).y() = float_X(0.0);
        if (image(imageCell).y() > float_X(1.0)) image(imageCell).y() = float_X(1.0);
        if (image(imageCell).z() < float_X(0.0)) image(imageCell).z() = float_X(0.0);
        if (image(imageCell).z() > float_X(1.0)) image(imageCell).z() = float_X(1.0);
    }
}