예제 #1
0
DataSource::DataSource( const livre::DataSourcePluginData& pluginData )
    : _impl( new DataSource::Impl( pluginData ))
{
    const float resolution = _impl->params.getResolution();
    const size_t maxBlockByteSize = _impl->params.getMaxBlockSize( );

    FunctorPtr functor = _impl->source->getFunctor();
    ::fivox::EventSourcePtr loader = functor->getSource();

    const ::fivox::AABBf& bbox = loader->getBoundingBox();
    uint32_t depth = 0;
    const Vector3f fullResolution =
        ( bbox.getSize() + loader->getCutOffDistance() * 2.0f ) * resolution;
    Vector3f blockResolution = fullResolution;

    // maxTextureSize value should be retrieved from OpenGL. But at this
    // point in time there may be no GL context. So a general object is
    // needed in Livre to query the OpenGL device properties.
    const size_t maxTextureSize = 2048;
    while( true )
    {
        if( blockResolution.product() < maxBlockByteSize &&
            blockResolution.x() < maxTextureSize &&
            blockResolution.y() < maxTextureSize &&
            blockResolution.z() < maxTextureSize )
        {
            break;
        }
        blockResolution = blockResolution / 2.0f;
        ++depth;
    }

    vmml::Vector3ui blockDim( std::ceil( blockResolution.x( )),
                              std::ceil( blockResolution.y( )),
                              std::ceil( blockResolution.z( )));
    if( blockDim.x() > 8 )
        blockDim.x() -= (blockDim.x() % 8);
    if( blockDim.y() > 8 )
        blockDim.y() -= (blockDim.y() % 8);
    if( blockDim.z() > 8 )
        blockDim.z() -= (blockDim.z() % 8);

    const size_t treeQuotient = 1 << depth;
    const vmml::Vector3ui totalTreeSize = blockDim * treeQuotient;
    _impl->_borders = vmml::Vector3f( totalTreeSize ) / resolution -
                      bbox.getSize();

    _volumeInfo.voxels = totalTreeSize;
    _volumeInfo.maximumBlockSize = blockDim;

    if( !livre::fillRegularVolumeInfo( _volumeInfo ))
        LBTHROW( std::runtime_error( "Cannot setup the regular tree" ));

    // SDK uses microns, volume information uses meters
    _volumeInfo.boundingBox = ::fivox::AABBf( bbox.getMin() / 1000000.f,
                                              bbox.getMax() / 1000000.f );
}
예제 #2
0
{
    unsigned long start;
    unsigned long stop;
    unsigned long currentBlockDim;
    unsigned long currentThreadId;
    unsigned long currentBlockId;
    
    start = clockCounter();
    syncThreads();    
    

    currentBlockDim = blockDim();
    currentThreadId = blockThreadId();
    currentBlockId = blockId();
        
    ON_BASIC_BLOCK_EXIT:
    {
            unsigned long offset = basicBlockCount() * basicBlockId() + currentBlockDim * currentBlockId + currentThreadId;
            globalMem[offset] = globalMem[offset] + basicBlockExecutedInstructionCount();
    }

    
   
    ON_KERNEL_EXIT:
    {
        syncThreads();
        stop = clockCounter();
        if (threadIndexX() == 0) {
            unsigned long total = basicBlockCount() * gridDim() * currentBlockDim;
            globalMem[currentBlockId * 2 + total] = stop - start;
            globalMem[currentBlockId * 2 + 1 + total] = smId();
예제 #3
0
DataSource::DataSource( const livre::DataSourcePluginData& pluginData )
    : _impl( new DataSource::Impl( pluginData ))
{
    // We assume that the data's units are micrometers
    _volumeInfo.meterToDataUnitRatio = 1e6;

    _volumeInfo.description = _impl->params.getDescription();

    const AABBf& bbox = _impl->source->getBoundingBox();
    const Vector3f resolution = _impl->source->getResolution();
    const Vector3f fullResolution =
            _impl->source->getSizeInMicrometer() * resolution;

    // maxTextureSize value should be retrieved from OpenGL. But at this
    // point in time there may be no GL context. So a general object is
    // needed in Livre to query the OpenGL device properties.
    const size_t maxTextureSize = 2048;
    const size_t maxBlockByteSize = _impl->params.getMaxBlockSize();
    Vector3f blockResolution = fullResolution;
    size_t depth = 0;
    while( true )
    {
        if( blockResolution.product() < maxBlockByteSize &&
            blockResolution.x() < maxTextureSize &&
            blockResolution.y() < maxTextureSize &&
            blockResolution.z() < maxTextureSize )
        {
            break;
        }
        blockResolution = blockResolution / 2.0f;
        ++depth;
    }

    vmml::Vector3ui blockDim( std::ceil( blockResolution.x( )),
                              std::ceil( blockResolution.y( )),
                              std::ceil( blockResolution.z( )));
    if( blockDim.x() > 8 )
        blockDim.x() -= blockDim.x() % 8;
    if( blockDim.y() > 8 )
        blockDim.y() -= blockDim.y() % 8;
    if( blockDim.z() > 8 )
        blockDim.z() -= blockDim.z() % 8;

    const size_t treeQuotient = 1 << depth;
    const vmml::Vector3ui totalTreeSize = blockDim * treeQuotient;
    _impl->_borders = vmml::Vector3f( totalTreeSize ) / resolution -
                      bbox.getSize();

    _volumeInfo.voxels = totalTreeSize;
    _volumeInfo.maximumBlockSize = blockDim;

    if( !livre::fillRegularVolumeInfo( _volumeInfo ))
        LBTHROW( std::runtime_error( "Cannot setup the regular tree" ));

    const float maxDim = std::max( _impl->_borders.x() + bbox.getSize().x(),
                         std::max( _impl->_borders.y() + bbox.getSize().y(),
                                   _impl->_borders.z() + bbox.getSize().z( )));
    const Vector3f scale( 1.0f / maxDim );
    vmml::Matrix4f& transform = _volumeInfo.dataToLivreTransform;
    transform.setTranslation( -bbox.getCenter( ));
    transform.scale( scale );
    transform.scaleTranslation( scale );
    _volumeInfo.resolution = resolution;
}
예제 #4
0
{   
    unsigned long warpId = (blockId() * blockDim() + blockThreadId()) >> 5;

    ON_INSTRUCTION:
    BRANCH:
    {
        
        if(leastActiveThreadInWarp())
        {
            globalMem[warpId * 2] = globalMem[warpId * 2] + divergentWarp();
            globalMem[warpId * 2 + 1] = globalMem[warpId * 2 + 1] + 1;   
        }
    }
}
예제 #5
0
void
llfat_cuda(FullGauge cudaFatLink, FullGauge cudaSiteLink, 
	   FullStaple cudaStaple, FullStaple cudaStaple1,
	   QudaGaugeParam* param, double* act_path_coeff)
{
  int volume = param->X[0]*param->X[1]*param->X[2]*param->X[3];
  int Vh = volume/2;
  dim3 gridDim(volume/BLOCK_DIM,1,1);
  dim3 halfGridDim(Vh/BLOCK_DIM,1,1);
  dim3 blockDim(BLOCK_DIM , 1, 1);
  
  QudaPrecision prec = cudaSiteLink.precision;
  QudaReconstructType recon = cudaSiteLink.reconstruct;
  
  if( ((param->X[0] % 2 != 0)
       ||(param->X[1] % 2 != 0)
       ||(param->X[2] % 2 != 0)
       ||(param->X[3] % 2 != 0))
      && (recon  == QUDA_RECONSTRUCT_12)){
    errorQuda("12 reconstruct and odd dimensionsize is not supported by link fattening code (yet)\n");
    
  }
      
  int nStream=9;
  cudaStream_t stream[nStream];
  for(int i = 0;i < nStream; i++){
    cudaStreamCreate(&stream[i]);
  }

  
  llfatOneLinkKernel(cudaFatLink, cudaSiteLink,cudaStaple, cudaStaple1,
		     param, act_path_coeff); CUERR;
  
  llfat_kernel_param_t kparam;
  for(int i=0;i < 4;i++){
     kparam.ghostDim[i] = commDimPartitioned(i);
  }
  int ktype[8] = {
		LLFAT_EXTERIOR_KERNEL_BACK_X, 
		LLFAT_EXTERIOR_KERNEL_FWD_X, 
		LLFAT_EXTERIOR_KERNEL_BACK_Y, 
		LLFAT_EXTERIOR_KERNEL_FWD_Y, 
		LLFAT_EXTERIOR_KERNEL_BACK_Z, 
		LLFAT_EXTERIOR_KERNEL_FWD_Z, 
		LLFAT_EXTERIOR_KERNEL_BACK_T, 
		LLFAT_EXTERIOR_KERNEL_FWD_T, 
  };


  for(int dir = 0;dir < 4; dir++){
    for(int nu = 0; nu < 4; nu++){
      if (nu != dir){

	//start of one call
 	for(int k=3; k >= 0 ;k--){
	  if(!commDimPartitioned(k)) continue;
	  
	  kparam.kernel_type = ktype[2*k];
	  siteComputeGenStapleParityKernel((void*)cudaStaple.even, (void*)cudaStaple.odd,
					   (void*)cudaSiteLink.even, (void*)cudaSiteLink.odd,
					   (void*)cudaFatLink.even, (void*)cudaFatLink.odd,
					   dir, nu,
					   act_path_coeff[2],
					   recon, prec, halfGridDim,
					   kparam, &stream[2*k]); CUERR;	  
	  
	  exchange_gpu_staple_start(param->X, &cudaStaple, k, (int)QUDA_BACKWARDS, &stream[2*k]);  CUERR;
	  
	  kparam.kernel_type = ktype[2*k+1];
	  siteComputeGenStapleParityKernel((void*)cudaStaple.even, (void*)cudaStaple.odd,
					   (void*)cudaSiteLink.even, (void*)cudaSiteLink.odd,
					   (void*)cudaFatLink.even, (void*)cudaFatLink.odd,
					   dir, nu,
					   act_path_coeff[2],
					   recon, prec, halfGridDim,
					   kparam, &stream[2*k+1]); CUERR;
	  exchange_gpu_staple_start(param->X, &cudaStaple, k, (int)QUDA_FORWARDS, &stream[2*k+1]);  CUERR;
	}
        kparam.kernel_type = LLFAT_INTERIOR_KERNEL;
	siteComputeGenStapleParityKernel((void*)cudaStaple.even, (void*)cudaStaple.odd,
					 (void*)cudaSiteLink.even, (void*)cudaSiteLink.odd,
					 (void*)cudaFatLink.even, (void*)cudaFatLink.odd, 
					 dir, nu,
					 act_path_coeff[2],
					 recon, prec, halfGridDim, 
					 kparam, &stream[nStream-1]); CUERR;
	
 	for(int k=3; k >= 0 ;k--){
	  if(!commDimPartitioned(k)) continue;
	  exchange_gpu_staple_comms(param->X, &cudaStaple, k, (int)QUDA_BACKWARDS, &stream[2*k]); CUERR;
	  exchange_gpu_staple_comms(param->X, &cudaStaple, k, (int)QUDA_FORWARDS, &stream[2*k+1]); CUERR;
	}	
 	for(int k=3; k >= 0 ;k--){
	  if(!commDimPartitioned(k)) continue;
	  exchange_gpu_staple_wait(param->X, &cudaStaple, k, (int)QUDA_BACKWARDS, &stream[2*k]); CUERR;
	  exchange_gpu_staple_wait(param->X, &cudaStaple, k, (int)QUDA_FORWARDS, &stream[2*k+1]); CUERR;
	}
 	for(int k=3; k >= 0 ;k--){
	  if(!commDimPartitioned(k)) continue;
	  cudaStreamSynchronize(stream[2*k]);
	  cudaStreamSynchronize(stream[2*k+1]);
	}	
	//end

	//start of one call
        kparam.kernel_type = LLFAT_INTERIOR_KERNEL;
	computeGenStapleFieldParityKernel((void*)NULL, (void*)NULL,
					  (void*)cudaSiteLink.even, (void*)cudaSiteLink.odd,
					  (void*)cudaFatLink.even, (void*)cudaFatLink.odd, 
					  (void*)cudaStaple.even, (void*)cudaStaple.odd,
					  dir, nu, 0,
					  act_path_coeff[5],
					  recon, prec,  halfGridDim, kparam, &stream[nStream-1]); CUERR;
	//end
	for(int rho = 0; rho < 4; rho++){
	  if (rho != dir && rho != nu){

	    //start of one call
	    for(int k=3; k >= 0 ;k--){
	      if(!commDimPartitioned(k)) continue;
	      kparam.kernel_type = ktype[2*k];	    
	      computeGenStapleFieldParityKernel((void*)cudaStaple1.even, (void*)cudaStaple1.odd,
						(void*)cudaSiteLink.even, (void*)cudaSiteLink.odd,
						(void*)cudaFatLink.even, (void*)cudaFatLink.odd, 
						(void*)cudaStaple.even, (void*)cudaStaple.odd,
						dir, rho, 1,
						act_path_coeff[3],
						recon, prec, halfGridDim, kparam, &stream[2*k]); CUERR;	      
	      exchange_gpu_staple_start(param->X, &cudaStaple1, k, (int)QUDA_BACKWARDS, &stream[2*k]);  CUERR;
	      kparam.kernel_type = ktype[2*k+1];	    
	      computeGenStapleFieldParityKernel((void*)cudaStaple1.even, (void*)cudaStaple1.odd,
						(void*)cudaSiteLink.even, (void*)cudaSiteLink.odd,
						(void*)cudaFatLink.even, (void*)cudaFatLink.odd, 
						(void*)cudaStaple.even, (void*)cudaStaple.odd,
						dir, rho, 1,
						act_path_coeff[3],
						recon, prec, halfGridDim, kparam, &stream[2*k+1]); CUERR;
	      exchange_gpu_staple_start(param->X, &cudaStaple1, k, (int)QUDA_FORWARDS, &stream[2*k+1]);  CUERR;
	    }	    

	    kparam.kernel_type = LLFAT_INTERIOR_KERNEL;
	    computeGenStapleFieldParityKernel((void*)cudaStaple1.even, (void*)cudaStaple1.odd,
					      (void*)cudaSiteLink.even, (void*)cudaSiteLink.odd,
					      (void*)cudaFatLink.even, (void*)cudaFatLink.odd, 
					      (void*)cudaStaple.even, (void*)cudaStaple.odd,
					      dir, rho, 1,
					      act_path_coeff[3],
					      recon, prec, halfGridDim, kparam, &stream[nStream-1]); CUERR;

#ifdef MULTI_GPU
	    for(int k=3; k >= 0 ;k--){
	      if(!commDimPartitioned(k)) continue;
	      exchange_gpu_staple_comms(param->X, &cudaStaple1, k, (int)QUDA_BACKWARDS, &stream[2*k]); CUERR;
	      exchange_gpu_staple_comms(param->X, &cudaStaple1, k, (int)QUDA_FORWARDS, &stream[2*k+1]); CUERR;
	    }
	    for(int k=3; k >= 0 ;k--){
	      if(!commDimPartitioned(k)) continue;
	      exchange_gpu_staple_wait(param->X, &cudaStaple1, k, QUDA_BACKWARDS, &stream[2*k]); CUERR;
	      exchange_gpu_staple_wait(param->X, &cudaStaple1, k, QUDA_FORWARDS, &stream[2*k+1]); CUERR;
	    }
	    for(int k=3; k >= 0 ;k--){
	      if(!commDimPartitioned(k)) continue;
	      cudaStreamSynchronize(stream[2*k]);
	      cudaStreamSynchronize(stream[2*k+1]);
	    }	
#endif	    
	    //end

	    
	    for(int sig = 0; sig < 4; sig++){
	      if (sig != dir && sig != nu && sig != rho){						
		
		//start of one call
		kparam.kernel_type = LLFAT_INTERIOR_KERNEL;
		computeGenStapleFieldParityKernel((void*)NULL, (void*)NULL, 
						  (void*)cudaSiteLink.even, (void*)cudaSiteLink.odd,
						  (void*)cudaFatLink.even, (void*)cudaFatLink.odd, 
						  (void*)cudaStaple1.even, (void*)cudaStaple1.odd,
						  dir, sig, 0,
						  act_path_coeff[4],
						  recon, prec, halfGridDim, kparam, &stream[nStream-1]);	 CUERR;

		//end
		
	      }			    
	    }//sig
	  }
	}//rho	
      }
    }//nu
  }//dir
  
  
  cudaThreadSynchronize(); 
  checkCudaError();
  
  for(int i=0;i < nStream; i++){
    cudaStreamDestroy(stream[i]);
  }

  return;
}
예제 #6
0
{
    unsigned long threadId = blockThreadId();
    unsigned long warpId = (blockId() * blockDim() + threadId) >> 5;
    
    ON_INSTRUCTION:
    MEM_READ:
    MEM_WRITE:
    GLOBAL:
    {
        sharedMem[threadId] = computeBaseAddress();

        if(leastActiveThreadInWarp())
        {
            unsigned long uniqueCount = uniqueElementCount(sharedMem, 1);
            globalMem[warpId * 2] = globalMem[warpId * 2] + uniqueCount;
            globalMem[warpId * 2 + 1] = globalMem[warpId * 2 + 1] + 1;
        }
    }    
}