/**
 * Builds the volume information for the data source: derives the LOD tree
 * depth and block dimensions from the event source's bounding box and the
 * configured resolution, then fills _volumeInfo accordingly.
 *
 * @param pluginData plugin initialization data forwarded to the Impl.
 * @throws std::runtime_error if the regular tree setup fails.
 */
DataSource::DataSource( const livre::DataSourcePluginData& pluginData )
    : _impl( new DataSource::Impl( pluginData ))
{
    // Sampling rate (voxels per data unit) and the voxel budget of one block.
    const float voxelsPerUnit = _impl->params.getResolution();
    const size_t maxBlockVoxels = _impl->params.getMaxBlockSize( );

    FunctorPtr functor = _impl->source->getFunctor();
    ::fivox::EventSourcePtr eventSource = functor->getSource();
    const ::fivox::AABBf& bbox = eventSource->getBoundingBox();

    // Full volume extent in voxels: the data bounding box padded by the
    // cut-off distance on both sides, scaled by the resolution.
    const Vector3f volumeSize =
        ( bbox.getSize() + eventSource->getCutOffDistance() * 2.0f ) * voxelsPerUnit;

    // maxTextureSize value should be retrieved from OpenGL. But at this
    // point in time there may be no GL context. So a general object is
    // needed in Livre to query the OpenGL device properties.
    const size_t maxTextureSize = 2048;

    // Halve the per-level extent until one block fits both the voxel budget
    // and the texture limit; each halving adds one level to the LOD tree.
    uint32_t treeDepth = 0;
    Vector3f levelSize = volumeSize;
    while( levelSize.product() >= maxBlockVoxels ||
           levelSize.x() >= maxTextureSize ||
           levelSize.y() >= maxTextureSize ||
           levelSize.z() >= maxTextureSize )
    {
        levelSize = levelSize / 2.0f;
        ++treeDepth;
    }

    // Round each block dimension up to an integer, then trim it down to a
    // multiple of 8 (dimensions of 8 or less are kept as-is).
    vmml::Vector3ui blockSize( std::ceil( levelSize.x( )),
                               std::ceil( levelSize.y( )),
                               std::ceil( levelSize.z( )));
    for( size_t i = 0; i < 3; ++i )
        if( blockSize[i] > 8 )
            blockSize[i] -= blockSize[i] % 8;

    // A tree of depth treeDepth spans blockSize * 2^treeDepth voxels per axis.
    const size_t leavesPerAxis = 1 << treeDepth;
    const vmml::Vector3ui totalTreeSize = blockSize * leavesPerAxis;

    // The rounded tree may overshoot the data extent; keep the overshoot
    // (converted back to data units) as borders.
    _impl->_borders = vmml::Vector3f( totalTreeSize ) / voxelsPerUnit - bbox.getSize();

    _volumeInfo.voxels = totalTreeSize;
    _volumeInfo.maximumBlockSize = blockSize;
    if( !livre::fillRegularVolumeInfo( _volumeInfo ))
        LBTHROW( std::runtime_error( "Cannot setup the regular tree" ));

    // SDK uses microns, volume information uses meters
    _volumeInfo.boundingBox = ::fivox::AABBf( bbox.getMin() / 1000000.f,
                                              bbox.getMax() / 1000000.f );
}
/* Lynx/Ocelot-style kernel instrumentation specification: accumulates, per
   thread, the number of instructions executed by each basic block (a dynamic
   instruction counter), and at kernel exit records per-CTA elapsed cycles
   and the executing SM id.
   NOTE(review): this block appears truncated in this chunk -- the closing
   braces of the ON_KERNEL_EXIT section are not visible here; confirm against
   the full file. */
{
    unsigned long start;            /* clock value at kernel entry */
    unsigned long stop;             /* clock value at kernel exit */
    unsigned long currentBlockDim;  /* threads per CTA */
    unsigned long currentThreadId;  /* thread index within the CTA */
    unsigned long currentBlockId;   /* CTA index within the grid */

    start = clockCounter();
    syncThreads();
    currentBlockDim = blockDim();
    currentThreadId = blockThreadId();
    currentBlockId = blockId();

    /* Instrumentation point: runs when a basic block exits. Bumps this
       thread's slot for the current basic block by the number of
       instructions the block executed; the offset arithmetic presumably
       flattens a [basicBlock][cta][thread] counter layout -- confirm against
       the consumer of globalMem. */
    ON_BASIC_BLOCK_EXIT:
    {
        unsigned long offset = basicBlockCount() * basicBlockId() +
            currentBlockDim * currentBlockId + currentThreadId;
        globalMem[offset] = globalMem[offset] + basicBlockExecutedInstructionCount();
    }

    /* Instrumentation point: runs once when the kernel finishes. */
    ON_KERNEL_EXIT:
    {
        syncThreads();
        stop = clockCounter();
        if (threadIndexX() == 0)
        {
            /* Past the per-thread counter region ('total' slots), store
               {elapsed cycles, SM id} pairs indexed by CTA. */
            unsigned long total = basicBlockCount() * gridDim() * currentBlockDim;
            globalMem[currentBlockId * 2 + total] = stop - start;
            globalMem[currentBlockId * 2 + 1 + total] = smId();
/**
 * Builds the volume information for the data source: derives the LOD tree
 * depth and block dimensions from the source's size and resolution, fills
 * _volumeInfo, and sets up the data-to-Livre normalization transform.
 *
 * @param pluginData plugin initialization data forwarded to the Impl.
 * @throws std::runtime_error if the regular tree setup fails.
 */
DataSource::DataSource( const livre::DataSourcePluginData& pluginData )
    : _impl( new DataSource::Impl( pluginData ))
{
    // We assume that the data's units are micrometers
    _volumeInfo.meterToDataUnitRatio = 1e6;
    _volumeInfo.description = _impl->params.getDescription();

    const AABBf& bbox = _impl->source->getBoundingBox();
    const Vector3f voxelsPerUnit = _impl->source->getResolution();

    // Full data set extent expressed in voxels.
    const Vector3f volumeSize = _impl->source->getSizeInMicrometer() * voxelsPerUnit;

    // maxTextureSize value should be retrieved from OpenGL. But at this
    // point in time there may be no GL context. So a general object is
    // needed in Livre to query the OpenGL device properties.
    const size_t maxTextureSize = 2048;
    const size_t maxBlockVoxels = _impl->params.getMaxBlockSize();

    // Halve the per-level extent until one block fits both the voxel budget
    // and the texture limit; each halving adds one level to the LOD tree.
    Vector3f levelSize = volumeSize;
    size_t treeDepth = 0;
    while( levelSize.product() >= maxBlockVoxels ||
           levelSize.x() >= maxTextureSize ||
           levelSize.y() >= maxTextureSize ||
           levelSize.z() >= maxTextureSize )
    {
        levelSize = levelSize / 2.0f;
        ++treeDepth;
    }

    // Round each block dimension up to an integer, then trim it down to a
    // multiple of 8 (dimensions of 8 or less are kept as-is).
    vmml::Vector3ui blockSize( std::ceil( levelSize.x( )),
                               std::ceil( levelSize.y( )),
                               std::ceil( levelSize.z( )));
    for( size_t i = 0; i < 3; ++i )
        if( blockSize[i] > 8 )
            blockSize[i] -= blockSize[i] % 8;

    // A tree of depth treeDepth spans blockSize * 2^treeDepth voxels per axis.
    const size_t leavesPerAxis = 1 << treeDepth;
    const vmml::Vector3ui totalTreeSize = blockSize * leavesPerAxis;

    // The rounded tree may overshoot the data extent; keep the overshoot
    // (converted back to data units) as borders.
    _impl->_borders = vmml::Vector3f( totalTreeSize ) / voxelsPerUnit - bbox.getSize();

    _volumeInfo.voxels = totalTreeSize;
    _volumeInfo.maximumBlockSize = blockSize;
    if( !livre::fillRegularVolumeInfo( _volumeInfo ))
        LBTHROW( std::runtime_error( "Cannot setup the regular tree" ));

    // Normalize the largest axis (data extent plus borders) to unit length
    // and center the data in Livre space.
    const Vector3f paddedSize = _impl->_borders + bbox.getSize();
    const float maxExtent = std::max( paddedSize.x(),
                                      std::max( paddedSize.y(), paddedSize.z( )));
    const Vector3f scale( 1.0f / maxExtent );

    vmml::Matrix4f& transform = _volumeInfo.dataToLivreTransform;
    transform.setTranslation( -bbox.getCenter( ));
    transform.scale( scale );
    transform.scaleTranslation( scale );

    _volumeInfo.resolution = voxelsPerUnit;
}
/* Lynx/Ocelot-style branch-divergence instrumentation specification: at
   every branch instruction, one representative thread per warp accumulates
     globalMem[warpId * 2]     += divergentWarp()  (divergence events),
     globalMem[warpId * 2 + 1] += 1                (branches observed),
   so the ratio of the two slots estimates the warp's branch divergence. */
{
    /* Global warp index; >> 5 divides the flat thread id by the warp size
       of 32. */
    unsigned long warpId = (blockId() * blockDim() + blockThreadId()) >> 5;

    /* Instrumentation point: applied at every BRANCH instruction. */
    ON_INSTRUCTION:
    BRANCH:
    {
        /* Only one active thread per warp updates the counters to avoid
           multiplying the count by the number of active lanes. */
        if(leastActiveThreadInWarp())
        {
            globalMem[warpId * 2] = globalMem[warpId * 2] + divergentWarp();
            globalMem[warpId * 2 + 1] = globalMem[warpId * 2 + 1] + 1;
        }
    }
}
/* Host-side driver for fat-link ("llfat") construction in QUDA.
 *
 * Launches the one-link kernel, then for every direction pair (dir, nu) --
 * and nested (rho, sig) combinations -- accumulates staple contributions
 * into cudaFatLink, overlapping boundary (ghost-zone) exchanges with
 * interior computation on multiple CUDA streams.
 *
 * cudaFatLink     output fat links
 * cudaSiteLink    input thin links
 * cudaStaple,
 * cudaStaple1     temporary staple fields
 * param           lattice dimensions (X[0..3]) and gauge parameters
 * act_path_coeff  action path coefficients; indices 2/3/4/5 are used here,
 *                 presumably the 3/5/7-staple and Lepage coefficients in
 *                 MILC ordering -- confirm against the action definition.
 */
void llfat_cuda(FullGauge cudaFatLink, FullGauge cudaSiteLink,
                FullStaple cudaStaple, FullStaple cudaStaple1,
                QudaGaugeParam* param, double* act_path_coeff)
{
  int volume = param->X[0]*param->X[1]*param->X[2]*param->X[3];
  int Vh = volume/2; /* half-lattice (single parity) volume */

  /* NOTE(review): gridDim and blockDim below are not referenced later in
     this function -- presumably consumed by kernel wrappers elsewhere or
     dead; confirm before removing. */
  dim3 gridDim(volume/BLOCK_DIM,1,1);
  dim3 halfGridDim(Vh/BLOCK_DIM,1,1);
  dim3 blockDim(BLOCK_DIM , 1, 1);

  QudaPrecision prec = cudaSiteLink.precision;
  QudaReconstructType recon = cudaSiteLink.reconstruct;

  /* 12-reconstruct requires even lattice extents in every dimension. */
  if( ((param->X[0] % 2 != 0)
       ||(param->X[1] % 2 != 0)
       ||(param->X[2] % 2 != 0)
       ||(param->X[3] % 2 != 0)) && (recon == QUDA_RECONSTRUCT_12)){
    errorQuda("12 reconstruct and odd dimensionsize is not supported by link fattening code (yet)\n");
  }

  /* Streams 2*k and 2*k+1 carry the backward/forward exchange of partitioned
     dimension k (k = 0..3); stream nStream-1 runs the interior kernels.
     NOTE(review): cudaStream_t stream[nStream] is a variable-length array on
     a non-const int -- a compiler extension, not standard C++. */
  int nStream=9;
  cudaStream_t stream[nStream];
  for(int i = 0;i < nStream; i++){
    cudaStreamCreate(&stream[i]);
  }

  /* One-link contribution to the fat links. */
  llfatOneLinkKernel(cudaFatLink, cudaSiteLink,cudaStaple, cudaStaple1,
                     param, act_path_coeff); CUERR;

  llfat_kernel_param_t kparam;
  for(int i=0;i < 4;i++){
    kparam.ghostDim[i] = commDimPartitioned(i);
  }

  /* Exterior kernel ids, indexed as ktype[2*k + (0 = back, 1 = fwd)]. */
  int ktype[8] = {
    LLFAT_EXTERIOR_KERNEL_BACK_X,
    LLFAT_EXTERIOR_KERNEL_FWD_X,
    LLFAT_EXTERIOR_KERNEL_BACK_Y,
    LLFAT_EXTERIOR_KERNEL_FWD_Y,
    LLFAT_EXTERIOR_KERNEL_BACK_Z,
    LLFAT_EXTERIOR_KERNEL_FWD_Z,
    LLFAT_EXTERIOR_KERNEL_BACK_T,
    LLFAT_EXTERIOR_KERNEL_FWD_T,
  };

  for(int dir = 0;dir < 4; dir++){
    for(int nu = 0; nu < 4; nu++){
      if (nu != dir){

        /* First staple (coefficient act_path_coeff[2]): compute boundary
           staples per partitioned dimension, start their ghost exchange,
           then run the interior kernel on the dedicated stream. */
        //start of one call
        for(int k=3; k >= 0 ;k--){
          if(!commDimPartitioned(k)) continue;
          kparam.kernel_type = ktype[2*k];
          siteComputeGenStapleParityKernel((void*)cudaStaple.even, (void*)cudaStaple.odd,
                                           (void*)cudaSiteLink.even, (void*)cudaSiteLink.odd,
                                           (void*)cudaFatLink.even, (void*)cudaFatLink.odd,
                                           dir, nu, act_path_coeff[2], recon, prec,
                                           halfGridDim, kparam, &stream[2*k]); CUERR;
          exchange_gpu_staple_start(param->X, &cudaStaple, k, (int)QUDA_BACKWARDS, &stream[2*k]); CUERR;
          kparam.kernel_type = ktype[2*k+1];
          siteComputeGenStapleParityKernel((void*)cudaStaple.even, (void*)cudaStaple.odd,
                                           (void*)cudaSiteLink.even, (void*)cudaSiteLink.odd,
                                           (void*)cudaFatLink.even, (void*)cudaFatLink.odd,
                                           dir, nu, act_path_coeff[2], recon, prec,
                                           halfGridDim, kparam, &stream[2*k+1]); CUERR;
          exchange_gpu_staple_start(param->X, &cudaStaple, k, (int)QUDA_FORWARDS, &stream[2*k+1]); CUERR;
        }
        kparam.kernel_type = LLFAT_INTERIOR_KERNEL;
        siteComputeGenStapleParityKernel((void*)cudaStaple.even, (void*)cudaStaple.odd,
                                         (void*)cudaSiteLink.even, (void*)cudaSiteLink.odd,
                                         (void*)cudaFatLink.even, (void*)cudaFatLink.odd,
                                         dir, nu, act_path_coeff[2], recon, prec,
                                         halfGridDim, kparam, &stream[nStream-1]); CUERR;

        /* Complete the cudaStaple ghost exchange before it is consumed below.
           NOTE(review): unlike the analogous cudaStaple1 exchange further
           down, these loops are NOT wrapped in #ifdef MULTI_GPU -- confirm
           whether the exchange_gpu_staple_* symbols exist (or are no-ops)
           in single-GPU builds. */
        for(int k=3; k >= 0 ;k--){
          if(!commDimPartitioned(k)) continue;
          exchange_gpu_staple_comms(param->X, &cudaStaple, k, (int)QUDA_BACKWARDS, &stream[2*k]); CUERR;
          exchange_gpu_staple_comms(param->X, &cudaStaple, k, (int)QUDA_FORWARDS, &stream[2*k+1]); CUERR;
        }
        for(int k=3; k >= 0 ;k--){
          if(!commDimPartitioned(k)) continue;
          exchange_gpu_staple_wait(param->X, &cudaStaple, k, (int)QUDA_BACKWARDS, &stream[2*k]); CUERR;
          exchange_gpu_staple_wait(param->X, &cudaStaple, k, (int)QUDA_FORWARDS, &stream[2*k+1]); CUERR;
        }
        for(int k=3; k >= 0 ;k--){
          if(!commDimPartitioned(k)) continue;
          cudaStreamSynchronize(stream[2*k]);
          cudaStreamSynchronize(stream[2*k+1]);
        }
        //end

        /* Staple-of-staple contribution with coefficient act_path_coeff[5],
           interior only (no temporary staple output: first two args NULL). */
        //start of one call
        kparam.kernel_type = LLFAT_INTERIOR_KERNEL;
        computeGenStapleFieldParityKernel((void*)NULL, (void*)NULL,
                                          (void*)cudaSiteLink.even, (void*)cudaSiteLink.odd,
                                          (void*)cudaFatLink.even, (void*)cudaFatLink.odd,
                                          (void*)cudaStaple.even, (void*)cudaStaple.odd,
                                          dir, nu, 0, act_path_coeff[5], recon, prec,
                                          halfGridDim, kparam, &stream[nStream-1]); CUERR;
        //end

        for(int rho = 0; rho < 4; rho++){
          if (rho != dir && rho != nu){

            /* Next staple level (coefficient act_path_coeff[3]) into
               cudaStaple1: boundary kernels + exchange start, then interior. */
            //start of one call
            for(int k=3; k >= 0 ;k--){
              if(!commDimPartitioned(k)) continue;
              kparam.kernel_type = ktype[2*k];
              computeGenStapleFieldParityKernel((void*)cudaStaple1.even, (void*)cudaStaple1.odd,
                                                (void*)cudaSiteLink.even, (void*)cudaSiteLink.odd,
                                                (void*)cudaFatLink.even, (void*)cudaFatLink.odd,
                                                (void*)cudaStaple.even, (void*)cudaStaple.odd,
                                                dir, rho, 1, act_path_coeff[3], recon, prec,
                                                halfGridDim, kparam, &stream[2*k]); CUERR;
              exchange_gpu_staple_start(param->X, &cudaStaple1, k, (int)QUDA_BACKWARDS, &stream[2*k]); CUERR;
              kparam.kernel_type = ktype[2*k+1];
              computeGenStapleFieldParityKernel((void*)cudaStaple1.even, (void*)cudaStaple1.odd,
                                                (void*)cudaSiteLink.even, (void*)cudaSiteLink.odd,
                                                (void*)cudaFatLink.even, (void*)cudaFatLink.odd,
                                                (void*)cudaStaple.even, (void*)cudaStaple.odd,
                                                dir, rho, 1, act_path_coeff[3], recon, prec,
                                                halfGridDim, kparam, &stream[2*k+1]); CUERR;
              exchange_gpu_staple_start(param->X, &cudaStaple1, k, (int)QUDA_FORWARDS, &stream[2*k+1]); CUERR;
            }
            kparam.kernel_type = LLFAT_INTERIOR_KERNEL;
            computeGenStapleFieldParityKernel((void*)cudaStaple1.even, (void*)cudaStaple1.odd,
                                              (void*)cudaSiteLink.even, (void*)cudaSiteLink.odd,
                                              (void*)cudaFatLink.even, (void*)cudaFatLink.odd,
                                              (void*)cudaStaple.even, (void*)cudaStaple.odd,
                                              dir, rho, 1, act_path_coeff[3], recon, prec,
                                              halfGridDim, kparam, &stream[nStream-1]); CUERR;

            /* Complete the cudaStaple1 ghost exchange.
               NOTE(review): the wait calls here pass QUDA_BACKWARDS/FORWARDS
               without the (int) cast used everywhere else -- cosmetic
               inconsistency worth unifying. */
#ifdef MULTI_GPU
            for(int k=3; k >= 0 ;k--){
              if(!commDimPartitioned(k)) continue;
              exchange_gpu_staple_comms(param->X, &cudaStaple1, k, (int)QUDA_BACKWARDS, &stream[2*k]); CUERR;
              exchange_gpu_staple_comms(param->X, &cudaStaple1, k, (int)QUDA_FORWARDS, &stream[2*k+1]); CUERR;
            }
            for(int k=3; k >= 0 ;k--){
              if(!commDimPartitioned(k)) continue;
              exchange_gpu_staple_wait(param->X, &cudaStaple1, k, QUDA_BACKWARDS, &stream[2*k]); CUERR;
              exchange_gpu_staple_wait(param->X, &cudaStaple1, k, QUDA_FORWARDS, &stream[2*k+1]); CUERR;
            }
            for(int k=3; k >= 0 ;k--){
              if(!commDimPartitioned(k)) continue;
              cudaStreamSynchronize(stream[2*k]);
              cudaStreamSynchronize(stream[2*k+1]);
            }
#endif
            //end

            for(int sig = 0; sig < 4; sig++){
              if (sig != dir && sig != nu && sig != rho){

                /* Deepest staple level (coefficient act_path_coeff[4]),
                   interior only, consuming cudaStaple1. */
                //start of one call
                kparam.kernel_type = LLFAT_INTERIOR_KERNEL;
                computeGenStapleFieldParityKernel((void*)NULL, (void*)NULL,
                                                  (void*)cudaSiteLink.even, (void*)cudaSiteLink.odd,
                                                  (void*)cudaFatLink.even, (void*)cudaFatLink.odd,
                                                  (void*)cudaStaple1.even, (void*)cudaStaple1.odd,
                                                  dir, sig, 0, act_path_coeff[4], recon, prec,
                                                  halfGridDim, kparam, &stream[nStream-1]); CUERR;
                //end

              }
            }//sig
          }
        }//rho
      }
    }//nu
  }//dir

  /* NOTE(review): cudaThreadSynchronize() is deprecated in the CUDA runtime;
     cudaDeviceSynchronize() is the modern equivalent. */
  cudaThreadSynchronize();
  checkCudaError();

  for(int i=0;i < nStream; i++){
    cudaStreamDestroy(stream[i]);
  }

  return;
}
/* Lynx/Ocelot-style memory-efficiency instrumentation specification: at
   every global-memory read or write, each thread publishes the base address
   it computes; one representative thread per warp then accumulates
     globalMem[warpId * 2]     += uniqueElementCount(...)  (distinct values
                                                            among the
                                                            published
                                                            addresses),
     globalMem[warpId * 2 + 1] += 1                        (accesses seen),
   which together estimate how well the warp's accesses coalesce. */
{
    unsigned long threadId = blockThreadId();
    /* Global warp index; >> 5 divides the flat thread id by the warp size
       of 32. */
    unsigned long warpId = (blockId() * blockDim() + threadId) >> 5;

    /* Instrumentation point: applied at every global-memory load/store. */
    ON_INSTRUCTION:
    MEM_READ:
    MEM_WRITE:
    GLOBAL:
    {
        /* Each thread exposes its access's base address via shared memory
           so the warp leader can inspect all lanes. */
        sharedMem[threadId] = computeBaseAddress();
        if(leastActiveThreadInWarp())
        {
            /* Second argument's meaning (presumably a per-warp/stride flag)
               is defined by the instrumentation runtime -- confirm there. */
            unsigned long uniqueCount = uniqueElementCount(sharedMem, 1);
            globalMem[warpId * 2] = globalMem[warpId * 2] + uniqueCount;
            globalMem[warpId * 2 + 1] = globalMem[warpId * 2 + 1] + 1;
        }
    }
}