Code Example #1
static void
display_test_info()
{
  printfQuda("running the following test:\n");
    
  printfQuda("link_precision           link_reconstruct           space_dimension        T_dimension       algorithm     max allowed error\n");
  printfQuda("%s                       %s                         %d/%d/%d/                  %d               %s             %g \n", 
	     get_prec_str(prec),
	     get_recon_str(link_recon), 
	     xdim, ydim, zdim, tdim, 
	     get_unitarization_str(reunit_svd_only),
	     max_allowed_error);

#ifdef MULTI_GPU
  printfQuda("Grid partition info:     X  Y  Z  T\n");
  printfQuda("                         %d  %d  %d  %d\n",
             commDimPartitioned(0),
             commDimPartitioned(1),
             commDimPartitioned(2),
             commDimPartitioned(3));
#endif

}
Code Example #2
void cudaColorSpinorField::allocateGhostBuffer(void) {
  int nFace = (nSpin == 1) ? 3 : 1; // 3 faces for staggered (asqtad) fermions
  int Nint = nColor * nSpin * 2; // number of internal degrees of freedom
  if (nSpin == 4) Nint /= 2; // spin projection for Wilson

  if(this->initGhostFaceBuffer == 0 || precision > facePrecision){    
    for (int i=0; i<4; i++) {
      if(!commDimPartitioned(i)){
	continue;
      }
      size_t faceBytes = nFace*ghostFace[i]*Nint*precision;
      // add extra space for the norms for half precision
      if (precision == QUDA_HALF_PRECISION) faceBytes += nFace*ghostFace[i]*sizeof(float);
      
      if (this->initGhostFaceBuffer) { // only free-ed if precision is higher than previous allocation
	cudaFree(this->fwdGhostFaceBuffer[i]); this->fwdGhostFaceBuffer[i] = NULL;
	cudaFree(this->backGhostFaceBuffer[i]); this->backGhostFaceBuffer[i] = NULL;
      }
      cudaMalloc((void**)&this->fwdGhostFaceBuffer[i], faceBytes);
      cudaMalloc((void**)&this->backGhostFaceBuffer[i], faceBytes);
    }   
    CUERR;
    
    this->facePrecision = precision;
    this->initGhostFaceBuffer = 1;
  }
}
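
As a sanity check on the buffer sizes above: for a half-precision staggered field (nSpin == 1, nColor == 3), nFace is 3 and Nint is 6, and each face additionally carries one float norm per ghost site. Below is a minimal standalone sketch of the same arithmetic; the face volume is a hypothetical value chosen for illustration, not something from QUDA.

#include <cstdio>
#include <cstddef>

int main() {
  const int nSpin = 1, nColor = 3;
  const int nFace = (nSpin == 1) ? 3 : 1;         // 3 faces for staggered/asqtad
  const int Nint = nColor * nSpin * 2;            // 6 reals per site
  const size_t precision = 2;                     // QUDA_HALF_PRECISION: 2 bytes per real
  const size_t ghostFace = 16 * 16 * 16;          // hypothetical face volume

  size_t faceBytes = nFace * ghostFace * Nint * precision;
  faceBytes += nFace * ghostFace * sizeof(float); // extra norm array at half precision

  printf("bytes per ghost-face buffer: %zu\n", faceBytes);
  return 0;
}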
Code Example #3
void display_test_info()
{
  printfQuda("running the following test:\n");
 
  printfQuda("prec recon   test_type     dagger   S_dim         T_dimension\n");
  printfQuda("%s   %s       %d           %d       %d/%d/%d        %d \n", 
	     get_prec_str(prec), get_recon_str(link_recon), 
	     test_type, dagger, xdim, ydim, zdim, tdim);
  printfQuda("Grid partition info:     X  Y  Z  T\n"); 
  printfQuda("                         %d  %d  %d  %d\n", 
	     commDimPartitioned(0),
	     commDimPartitioned(1),
	     commDimPartitioned(2),
	     commDimPartitioned(3));

}
Code Example #4
void cudaColorSpinorField::freeGhostBuffer(void) {
  if (!initGhostFaceBuffer) return;
  
  for(int i=0;i < 4; i++){
    if(!commDimPartitioned(i)){
      continue;
    }
    cudaFree(fwdGhostFaceBuffer[i]); fwdGhostFaceBuffer[i] = NULL;
    cudaFree(backGhostFaceBuffer[i]); backGhostFaceBuffer[i] = NULL;
  } 

  initGhostFaceBuffer = 0;  
}
Code Example #5
File: dirac_domain_wall.cpp  Project: alexstrel/quda
// Find the best block size parameters for the Dslash and DslashXpay kernels
void DiracDomainWall::Tune(cudaColorSpinorField &out, const cudaColorSpinorField &in, 
			   const cudaColorSpinorField &x) {

  setDslashTuning(QUDA_TUNE_YES);

  { // Tune Dslash
    TuneDiracDomainWallDslash dslashTune(*this, out, in);
    dslashTune.Benchmark(tuneDslash[0]);
    for (int i=0; i<4; i++) 
      if (commDimPartitioned(i)) 
	dslashTune.Benchmark(tuneDslash[i+1]);
  }

  { // Tune DslashXpay
    TuneDiracDomainWallDslashXpay dslashXpayTune(*this, out, in, x);
    dslashXpayTune.Benchmark(tuneDslashXpay[0]);
    for (int i=0; i<4; i++) 
      if (commDimPartitioned(i)) 
	dslashXpayTune.Benchmark(tuneDslashXpay[i+1]);
  }

  setDslashTuning(QUDA_TUNE_NO);
}
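
The pattern in Tune() is benchmark-and-select: each kernel is timed once for the interior case and once per partitioned direction, and the best launch parameters are recorded. Below is a rough standalone sketch of that idea, timing a placeholder kernel over a few candidate block sizes with CUDA events; the kernel and candidate list are illustrative assumptions, not QUDA code.

#include <cstdio>
#include <cuda_runtime.h>

__global__ void dummyKernel(float *out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = out[i] * 2.0f + 1.0f;
}

int main() {
  const int n = 1 << 20;
  float *d_out;
  cudaMalloc(&d_out, n * sizeof(float));

  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  const int candidates[4] = {64, 128, 256, 512}; // hypothetical block sizes
  int bestBlock = 0;
  float bestMs = 1e30f;

  for (int c = 0; c < 4; c++) {
    const int b = candidates[c];
    cudaEventRecord(start);
    dummyKernel<<<(n + b - 1) / b, b>>>(d_out, n);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    if (ms < bestMs) { bestMs = ms; bestBlock = b; } // keep the fastest variant
  }

  printf("best block size: %d (%.3f ms)\n", bestBlock, bestMs);
  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  cudaFree(d_out);
  return 0;
}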
Code Example #6
File: color_spinor_field.cpp  Project: knippsch/quda
  void ColorSpinorField::createGhostZone() {

    if (verbose == QUDA_DEBUG_VERBOSE) 
      printfQuda("Precision = %d, Subset = %d\n", precision, siteSubset);

    int num_faces = 1;
    int num_norm_faces=2;
    if (nSpin == 1) { //staggered
      num_faces=6;
      num_norm_faces=6;
    }

    // calculate size of ghost zone required
    int ghostVolume = 0;
    // temporary hack: treat the fifth (domain-wall) dimension separately
    int dims = nDim == 5 ? (nDim - 1) : nDim;
    int x5   = nDim == 5 ? x[4] : 1; // includes DW ghosts
    for (int i=0; i<dims; i++) {
      ghostFace[i] = 0;
      if (commDimPartitioned(i)) {
	ghostFace[i] = 1;
	for (int j=0; j<dims; j++) {
	  if (i==j) continue;
	  ghostFace[i] *= x[j];
	}
	ghostFace[i] *= x5; // temporary hack: extra dimension for DW ghosts
	// faces are defined on checkerboarded (half) volumes
	if (i==0 && siteSubset != QUDA_FULL_SITE_SUBSET) ghostFace[i] /= 2;
	if (siteSubset == QUDA_FULL_SITE_SUBSET) ghostFace[i] /= 2;
	ghostVolume += ghostFace[i];
      }
      if(i==0){
	ghostOffset[i] = 0;
	ghostNormOffset[i] = 0;
      }else{
	ghostOffset[i] = ghostOffset[i-1] + num_faces*ghostFace[i-1];
	ghostNormOffset[i] = ghostNormOffset[i-1] + num_norm_faces*ghostFace[i-1];
      }

#ifdef MULTI_GPU
      if (verbose == QUDA_DEBUG_VERBOSE) 
	printfQuda("face %d = %6d commDimPartitioned = %6d ghostOffset = %6d ghostNormOffset = %6d\n", 
		   i, ghostFace[i], commDimPartitioned(i), ghostOffset[i], ghostNormOffset[i]);
#endif
    } // end of outermost loop
    int ghostNormVolume = num_norm_faces * ghostVolume;
    ghostVolume *= num_faces;

    if (verbose == QUDA_DEBUG_VERBOSE) 
      printfQuda("Allocated ghost volume = %d, ghost norm volume %d\n", ghostVolume, ghostNormVolume);

    // ghost zones are calculated on c/b volumes
#ifdef MULTI_GPU
    ghost_length = ghostVolume*nColor*nSpin*2; 
    ghost_norm_length = (precision == QUDA_HALF_PRECISION) ? ghostNormVolume : 0;
#else
    ghost_length = 0;
    ghost_norm_length = 0;
#endif

    if (siteSubset == QUDA_FULL_SITE_SUBSET) {
      total_length = length + 2*ghost_length; // 2 ghost zones in a full field
      total_norm_length = 2*(stride + ghost_norm_length); // norm length = 2*stride
    } else {
      total_length = length + ghost_length;
      total_norm_length = (precision == QUDA_HALF_PRECISION) ? stride + ghost_norm_length : 0; // norm length = stride
    }

    if (precision != QUDA_HALF_PRECISION) total_norm_length = 0;

    if (verbose == QUDA_DEBUG_VERBOSE) {
      printfQuda("ghost length = %d, ghost norm length = %d\n", ghost_length, ghost_norm_length);
      printfQuda("total length = %d, total norm length = %d\n", total_length, total_norm_length);
    }
  }
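
The face volume computed in the loop above is just the product of the local lattice extents orthogonal to the partitioned direction, halved because ghost zones are calculated on checkerboarded volumes. A standalone sketch for a hypothetical 16^3 x 32 local lattice partitioned only in T (the lattice extents and partitioning are assumptions for illustration):

#include <cstdio>

int main() {
  const int nDim = 4;
  const int x[nDim] = {16, 16, 16, 32};                    // hypothetical local extents
  const bool partitioned[nDim] = {false, false, false, true}; // only T is partitioned

  for (int i = 0; i < nDim; i++) {
    if (!partitioned[i]) continue;
    int face = 1;
    for (int j = 0; j < nDim; j++)
      if (j != i) face *= x[j];  // product of the orthogonal extents
    face /= 2;                   // checkerboard: even/odd sites only
    printf("ghostFace[%d] = %d\n", i, face);
  }
  return 0;
}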
Code Example #7
File: llfat_quda_itf.cpp  Project: fwinter/quda
void
llfat_cuda(FullGauge cudaFatLink, FullGauge cudaSiteLink, 
	   FullStaple cudaStaple, FullStaple cudaStaple1,
	   QudaGaugeParam* param, double* act_path_coeff)
{
  int volume = param->X[0]*param->X[1]*param->X[2]*param->X[3];
  int Vh = volume/2;
  dim3 gridDim(volume/BLOCK_DIM,1,1);
  dim3 halfGridDim(Vh/BLOCK_DIM,1,1);
  dim3 blockDim(BLOCK_DIM , 1, 1);
  
  QudaPrecision prec = cudaSiteLink.precision;
  QudaReconstructType recon = cudaSiteLink.reconstruct;
  
  if (((param->X[0] % 2 != 0)
       || (param->X[1] % 2 != 0)
       || (param->X[2] % 2 != 0)
       || (param->X[3] % 2 != 0))
      && (recon == QUDA_RECONSTRUCT_12)) {
    errorQuda("12-reconstruct with an odd dimension size is not yet supported by the link fattening code\n");
  }
      
  const int nStream = 9; // const: a variable-length array is non-standard C++
  cudaStream_t stream[nStream];
  for (int i = 0; i < nStream; i++) {
    cudaStreamCreate(&stream[i]);
  }

  
  llfatOneLinkKernel(cudaFatLink, cudaSiteLink,cudaStaple, cudaStaple1,
		     param, act_path_coeff); CUERR;
  
  llfat_kernel_param_t kparam;
  for(int i=0;i < 4;i++){
     kparam.ghostDim[i] = commDimPartitioned(i);
  }
  int ktype[8] = {
		LLFAT_EXTERIOR_KERNEL_BACK_X, 
		LLFAT_EXTERIOR_KERNEL_FWD_X, 
		LLFAT_EXTERIOR_KERNEL_BACK_Y, 
		LLFAT_EXTERIOR_KERNEL_FWD_Y, 
		LLFAT_EXTERIOR_KERNEL_BACK_Z, 
		LLFAT_EXTERIOR_KERNEL_FWD_Z, 
		LLFAT_EXTERIOR_KERNEL_BACK_T, 
		LLFAT_EXTERIOR_KERNEL_FWD_T, 
  };


  for(int dir = 0;dir < 4; dir++){
    for(int nu = 0; nu < 4; nu++){
      if (nu != dir){

	//start of one call
 	for(int k=3; k >= 0 ;k--){
	  if(!commDimPartitioned(k)) continue;
	  
	  kparam.kernel_type = ktype[2*k];
	  siteComputeGenStapleParityKernel((void*)cudaStaple.even, (void*)cudaStaple.odd,
					   (void*)cudaSiteLink.even, (void*)cudaSiteLink.odd,
					   (void*)cudaFatLink.even, (void*)cudaFatLink.odd,
					   dir, nu,
					   act_path_coeff[2],
					   recon, prec, halfGridDim,
					   kparam, &stream[2*k]); CUERR;	  
	  
	  exchange_gpu_staple_start(param->X, &cudaStaple, k, (int)QUDA_BACKWARDS, &stream[2*k]);  CUERR;
	  
	  kparam.kernel_type = ktype[2*k+1];
	  siteComputeGenStapleParityKernel((void*)cudaStaple.even, (void*)cudaStaple.odd,
					   (void*)cudaSiteLink.even, (void*)cudaSiteLink.odd,
					   (void*)cudaFatLink.even, (void*)cudaFatLink.odd,
					   dir, nu,
					   act_path_coeff[2],
					   recon, prec, halfGridDim,
					   kparam, &stream[2*k+1]); CUERR;
	  exchange_gpu_staple_start(param->X, &cudaStaple, k, (int)QUDA_FORWARDS, &stream[2*k+1]);  CUERR;
	}
        kparam.kernel_type = LLFAT_INTERIOR_KERNEL;
	siteComputeGenStapleParityKernel((void*)cudaStaple.even, (void*)cudaStaple.odd,
					 (void*)cudaSiteLink.even, (void*)cudaSiteLink.odd,
					 (void*)cudaFatLink.even, (void*)cudaFatLink.odd, 
					 dir, nu,
					 act_path_coeff[2],
					 recon, prec, halfGridDim, 
					 kparam, &stream[nStream-1]); CUERR;
	
 	for(int k=3; k >= 0 ;k--){
	  if(!commDimPartitioned(k)) continue;
	  exchange_gpu_staple_comms(param->X, &cudaStaple, k, (int)QUDA_BACKWARDS, &stream[2*k]); CUERR;
	  exchange_gpu_staple_comms(param->X, &cudaStaple, k, (int)QUDA_FORWARDS, &stream[2*k+1]); CUERR;
	}	
 	for(int k=3; k >= 0 ;k--){
	  if(!commDimPartitioned(k)) continue;
	  exchange_gpu_staple_wait(param->X, &cudaStaple, k, (int)QUDA_BACKWARDS, &stream[2*k]); CUERR;
	  exchange_gpu_staple_wait(param->X, &cudaStaple, k, (int)QUDA_FORWARDS, &stream[2*k+1]); CUERR;
	}
 	for(int k=3; k >= 0 ;k--){
	  if(!commDimPartitioned(k)) continue;
	  cudaStreamSynchronize(stream[2*k]);
	  cudaStreamSynchronize(stream[2*k+1]);
	}	
	//end

	//start of one call
        kparam.kernel_type = LLFAT_INTERIOR_KERNEL;
	computeGenStapleFieldParityKernel((void*)NULL, (void*)NULL,
					  (void*)cudaSiteLink.even, (void*)cudaSiteLink.odd,
					  (void*)cudaFatLink.even, (void*)cudaFatLink.odd, 
					  (void*)cudaStaple.even, (void*)cudaStaple.odd,
					  dir, nu, 0,
					  act_path_coeff[5],
					  recon, prec,  halfGridDim, kparam, &stream[nStream-1]); CUERR;
	//end
	for(int rho = 0; rho < 4; rho++){
	  if (rho != dir && rho != nu){

	    //start of one call
	    for(int k=3; k >= 0 ;k--){
	      if(!commDimPartitioned(k)) continue;
	      kparam.kernel_type = ktype[2*k];	    
	      computeGenStapleFieldParityKernel((void*)cudaStaple1.even, (void*)cudaStaple1.odd,
						(void*)cudaSiteLink.even, (void*)cudaSiteLink.odd,
						(void*)cudaFatLink.even, (void*)cudaFatLink.odd, 
						(void*)cudaStaple.even, (void*)cudaStaple.odd,
						dir, rho, 1,
						act_path_coeff[3],
						recon, prec, halfGridDim, kparam, &stream[2*k]); CUERR;	      
	      exchange_gpu_staple_start(param->X, &cudaStaple1, k, (int)QUDA_BACKWARDS, &stream[2*k]);  CUERR;
	      kparam.kernel_type = ktype[2*k+1];	    
	      computeGenStapleFieldParityKernel((void*)cudaStaple1.even, (void*)cudaStaple1.odd,
						(void*)cudaSiteLink.even, (void*)cudaSiteLink.odd,
						(void*)cudaFatLink.even, (void*)cudaFatLink.odd, 
						(void*)cudaStaple.even, (void*)cudaStaple.odd,
						dir, rho, 1,
						act_path_coeff[3],
						recon, prec, halfGridDim, kparam, &stream[2*k+1]); CUERR;
	      exchange_gpu_staple_start(param->X, &cudaStaple1, k, (int)QUDA_FORWARDS, &stream[2*k+1]);  CUERR;
	    }	    

	    kparam.kernel_type = LLFAT_INTERIOR_KERNEL;
	    computeGenStapleFieldParityKernel((void*)cudaStaple1.even, (void*)cudaStaple1.odd,
					      (void*)cudaSiteLink.even, (void*)cudaSiteLink.odd,
					      (void*)cudaFatLink.even, (void*)cudaFatLink.odd, 
					      (void*)cudaStaple.even, (void*)cudaStaple.odd,
					      dir, rho, 1,
					      act_path_coeff[3],
					      recon, prec, halfGridDim, kparam, &stream[nStream-1]); CUERR;

#ifdef MULTI_GPU
	    for(int k=3; k >= 0 ;k--){
	      if(!commDimPartitioned(k)) continue;
	      exchange_gpu_staple_comms(param->X, &cudaStaple1, k, (int)QUDA_BACKWARDS, &stream[2*k]); CUERR;
	      exchange_gpu_staple_comms(param->X, &cudaStaple1, k, (int)QUDA_FORWARDS, &stream[2*k+1]); CUERR;
	    }
	    for(int k=3; k >= 0 ;k--){
	      if(!commDimPartitioned(k)) continue;
	      exchange_gpu_staple_wait(param->X, &cudaStaple1, k, QUDA_BACKWARDS, &stream[2*k]); CUERR;
	      exchange_gpu_staple_wait(param->X, &cudaStaple1, k, QUDA_FORWARDS, &stream[2*k+1]); CUERR;
	    }
	    for(int k=3; k >= 0 ;k--){
	      if(!commDimPartitioned(k)) continue;
	      cudaStreamSynchronize(stream[2*k]);
	      cudaStreamSynchronize(stream[2*k+1]);
	    }	
#endif	    
	    //end

	    
	    for(int sig = 0; sig < 4; sig++){
	      if (sig != dir && sig != nu && sig != rho){						
		
		//start of one call
		kparam.kernel_type = LLFAT_INTERIOR_KERNEL;
		computeGenStapleFieldParityKernel((void*)NULL, (void*)NULL, 
						  (void*)cudaSiteLink.even, (void*)cudaSiteLink.odd,
						  (void*)cudaFatLink.even, (void*)cudaFatLink.odd, 
						  (void*)cudaStaple1.even, (void*)cudaStaple1.odd,
						  dir, sig, 0,
						  act_path_coeff[4],
						  recon, prec, halfGridDim, kparam, &stream[nStream-1]);	 CUERR;

		//end
		
	      }			    
	    }//sig
	  }
	}//rho	
      }
    }//nu
  }//dir
  
  
  cudaDeviceSynchronize(); // cudaThreadSynchronize() is deprecated
  checkCudaError();

  for (int i = 0; i < nStream; i++) {
    cudaStreamDestroy(stream[i]);
  }
}
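
The overall structure of llfat_cuda is the usual interior/exterior overlap: exterior (boundary) kernels and their staple exchanges are issued on per-direction streams, the interior kernel runs on a separate stream, and all streams are joined before the next step. Below is a stripped-down sketch of that stream pattern with placeholder kernels and sizes; none of the names are QUDA's.

#include <cstdio>
#include <cuda_runtime.h>

__global__ void interiorKernel(float *d, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) d[i] += 1.0f;
}

__global__ void exteriorKernel(float *d, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) d[i] += 2.0f;
}

int main() {
  const int nInterior = 1 << 20, nExterior = 1 << 12;
  float *dInterior, *dExterior;
  cudaMalloc(&dInterior, nInterior * sizeof(float));
  cudaMalloc(&dExterior, nExterior * sizeof(float));

  cudaStream_t interiorStream, exteriorStream;
  cudaStreamCreate(&interiorStream);
  cudaStreamCreate(&exteriorStream);

  // boundary first, so its result can be exchanged while the bulk runs
  exteriorKernel<<<(nExterior + 255) / 256, 256, 0, exteriorStream>>>(dExterior, nExterior);
  // overlap: the bulk of the work proceeds concurrently on its own stream
  interiorKernel<<<(nInterior + 255) / 256, 256, 0, interiorStream>>>(dInterior, nInterior);

  cudaStreamSynchronize(exteriorStream); // boundary work (and any comms) done
  cudaStreamSynchronize(interiorStream); // bulk done

  cudaStreamDestroy(interiorStream);
  cudaStreamDestroy(exteriorStream);
  cudaFree(dInterior);
  cudaFree(dExterior);
  printf("done\n");
  return 0;
}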