__global__ void kernelParticleDensity(ParBox pb, DataBox<PitchedBox<Type_, DIM2> > image, DataSpace<DIM2> transpose, int slice, uint32_t globalOffset, uint32_t sliceDim, Mapping mapper) { typedef typename ParBox::FrameType FRAME; typedef typename MappingDesc::SuperCellSize Block; __shared__ FRAME *frame; __shared__ bool isValid; __syncthreads(); /*wait that all shared memory is initialised*/ bool isImageThread = false; const DataSpace<simDim> threadId(threadIdx); const DataSpace<DIM2> localCell(threadId[transpose.x()], threadId[transpose.y()]); const DataSpace<simDim> block = mapper.getSuperCellIndex(DataSpace<simDim > (blockIdx)); const DataSpace<simDim> blockOffset((block - 1) * Block::getDataSpace()); int localId = threadIdx.z * Block::x * Block::y + threadIdx.y * Block::x + threadIdx.x; if (localId == 0) isValid = false; __syncthreads(); //\todo: guard size should not be set to (fixed) 1 here const DataSpace<simDim> realCell(blockOffset + threadId); //delete guard from cell idx uint32_t globalCell = realCell[sliceDim] + globalOffset; if (globalCell == slice) { isValid = true; isImageThread = true; } __syncthreads(); if (!isValid) return; /*index in image*/ DataSpace<DIM2> imageCell( realCell[transpose.x()], realCell[transpose.y()]); // counter is always DIM2 typedef DataBox < PitchedBox< float_X, DIM2 > > SharedMem; extern __shared__ float_X shBlock[]; __syncthreads(); /*wait that all shared memory is initialised*/ const DataSpace<simDim> blockSize(blockDim); SharedMem counter(PitchedBox<float_X, DIM2 > ((float_X*) shBlock, DataSpace<DIM2 > (), blockSize[transpose.x()] * sizeof (float_X))); if (isImageThread) { counter(localCell) = float_X(0.0); } if (localId == 0) { frame = &(pb.getFirstFrame(block, isValid)); } __syncthreads(); while (isValid) //move over all Frames { PMACC_AUTO(particle,(*frame)[localId]); if (particle[multiMask_] == 1) { int cellIdx = particle[localCellIdx_]; // we only draw the first slice of cells in the super cell (z == 0) const DataSpace<DIM3> particleCellId(DataSpaceOperations<DIM3>::template map<Block > (cellIdx)); uint32_t globalParticleCell = particleCellId[sliceDim] + globalOffset + blockOffset[sliceDim]; if (globalParticleCell == slice) { const DataSpace<DIM2> reducedCell(particleCellId[transpose.x()], particleCellId[transpose.y()]); atomicAddWrapper(&(counter(reducedCell)), particle[weighting_] / NUM_EL_PER_PARTICLE); } } __syncthreads(); if (localId == 0) { frame = &(pb.getNextFrame(*frame, isValid)); } __syncthreads(); } if (isImageThread) { image(imageCell) = (Type_) counter(localCell); } }
__global__ void kernelPaintParticles3D(ParBox pb, DataBox<PitchedBox<float3_X, DIM2> > image, DataSpace<DIM2> transpose, int slice, uint32_t globalOffset, uint32_t sliceDim, Mapping mapper) { typedef typename ParBox::FrameType FRAME; typedef typename MappingDesc::SuperCellSize Block; __shared__ FRAME *frame; __shared__ bool isValid; __syncthreads(); /*wait that all shared memory is initialised*/ bool isImageThread = false; const DataSpace<simDim> threadId(threadIdx); const DataSpace<DIM2> localCell(threadId[transpose.x()], threadId[transpose.y()]); const DataSpace<simDim> block = mapper.getSuperCellIndex(DataSpace<simDim > (blockIdx)); const DataSpace<simDim> blockOffset((block - 1) * Block::getDataSpace()); int localId = threadIdx.z * Block::x * Block::y + threadIdx.y * Block::x + threadIdx.x; if (localId == 0) isValid = false; __syncthreads(); //\todo: guard size should not be set to (fixed) 1 here const DataSpace<simDim> realCell(blockOffset + threadId); //delete guard from cell idx #if(SIMDIM==DIM3) uint32_t globalCell = realCell[sliceDim] + globalOffset; if (globalCell == slice) #endif { atomicExch((int*) &isValid, 1); /*WAW Error in cuda-memcheck racecheck*/ isImageThread = true; } __syncthreads(); if (!isValid) return; /*index in image*/ DataSpace<DIM2> imageCell( realCell[transpose.x()], realCell[transpose.y()]); // counter is always DIM2 typedef DataBox < PitchedBox< float_X, DIM2 > > SharedMem; extern __shared__ float_X shBlock[]; __syncthreads(); /*wait that all shared memory is initialised*/ const DataSpace<simDim> blockSize(blockDim); SharedMem counter(PitchedBox<float_X, DIM2 > ((float_X*) shBlock, DataSpace<DIM2 > (), blockSize[transpose.x()] * sizeof (float_X))); if (isImageThread) { counter(localCell) = float_X(0.0); } if (localId == 0) { frame = &(pb.getFirstFrame(block, isValid)); } __syncthreads(); while (isValid) //move over all Frames { PMACC_AUTO(particle,(*frame)[localId]); if (particle[multiMask_] == 1) { int cellIdx = particle[localCellIdx_]; // we only draw the first slice of cells in the super cell (z == 0) const DataSpace<simDim> particleCellId(DataSpaceOperations<simDim>::template map<Block > (cellIdx)); #if(SIMDIM==DIM3) uint32_t globalParticleCell = particleCellId[sliceDim] + globalOffset + blockOffset[sliceDim]; if (globalParticleCell == slice) #endif { const DataSpace<DIM2> reducedCell(particleCellId[transpose.x()], particleCellId[transpose.y()]); atomicAddWrapper(&(counter(reducedCell)), particle[weighting_] / NUM_EL_PER_PARTICLE); } } __syncthreads(); if (localId == 0) { frame = &(pb.getNextFrame(*frame, isValid)); } __syncthreads(); } if (isImageThread) { /** Note: normally, we would multiply by NUM_EL_PER_PARTICLE again. * BUT: since we are interested in a simple value between 0 and 1, * we stay with this number (normalized to the order of macro * particles) and devide by the number of typical macro particles * per cell */ float_X value = counter(localCell) / float_X(particleInit::NUM_PARTICLES_PER_CELL); // * NUM_EL_PER_PARTICLE; if (value > 1.0) value = 1.0; //image(imageCell).x() = value; visPreview::preParticleDensCol::addRGB(image(imageCell), value, visPreview::preParticleDens_opacity); // cut to [0, 1] if (image(imageCell).x() < float_X(0.0)) image(imageCell).x() = float_X(0.0); if (image(imageCell).x() > float_X(1.0)) image(imageCell).x() = float_X(1.0); if (image(imageCell).y() < float_X(0.0)) image(imageCell).y() = float_X(0.0); if (image(imageCell).y() > float_X(1.0)) image(imageCell).y() = float_X(1.0); if (image(imageCell).z() < float_X(0.0)) image(imageCell).z() = float_X(0.0); if (image(imageCell).z() > float_X(1.0)) image(imageCell).z() = float_X(1.0); } }