static void display_test_info() { printfQuda("running the following test:\n"); printfQuda("link_precision link_reconstruct space_dimension T_dimension algorithm max allowed error\n"); printfQuda("%s %s %d/%d/%d/ %d %s %g \n", get_prec_str(prec), get_recon_str(link_recon), xdim, ydim, zdim, tdim, get_unitarization_str(reunit_svd_only), max_allowed_error); #ifdef MULTI_GPU printfQuda("Grid partition info: X Y Z T\n"); printfQuda(" %d %d %d %d\n", commDimPartitioned(0), commDimPartitioned(1), commDimPartitioned(2), commDimPartitioned(3)); #endif return ; }
void cudaColorSpinorField::allocateGhostBuffer(void) { int nFace = (nSpin == 1) ? 3 : 1; //3 faces for asqtad int Nint = nColor * nSpin * 2; // number of internal degrees of freedom if (nSpin == 4) Nint /= 2; // spin projection for Wilson if(this->initGhostFaceBuffer == 0 || precision > facePrecision){ for (int i=0; i<4; i++) { if(!commDimPartitioned(i)){ continue; } size_t faceBytes = nFace*ghostFace[i]*Nint*precision; // add extra space for the norms for half precision if (precision == QUDA_HALF_PRECISION) faceBytes += nFace*ghostFace[i]*sizeof(float); if (this->initGhostFaceBuffer) { // only free-ed if precision is higher than previous allocation cudaFree(this->fwdGhostFaceBuffer[i]); this->fwdGhostFaceBuffer[i] = NULL; cudaFree(this->backGhostFaceBuffer[i]); this->backGhostFaceBuffer[i] = NULL; } cudaMalloc((void**)&this->fwdGhostFaceBuffer[i], faceBytes); cudaMalloc((void**)&this->backGhostFaceBuffer[i], faceBytes); } CUERR; this->facePrecision = precision; this->initGhostFaceBuffer = 1; } }
void display_test_info() { printfQuda("running the following test:\n"); printfQuda("prec recon test_type dagger S_dim T_dimension\n"); printfQuda("%s %s %d %d %d/%d/%d %d \n", get_prec_str(prec), get_recon_str(link_recon), test_type, dagger, xdim, ydim, zdim, tdim); printfQuda("Grid partition info: X Y Z T\n"); printfQuda(" %d %d %d %d\n", commDimPartitioned(0), commDimPartitioned(1), commDimPartitioned(2), commDimPartitioned(3)); return ; }
void cudaColorSpinorField::freeGhostBuffer(void) { if (!initGhostFaceBuffer) return; for(int i=0;i < 4; i++){ if(!commDimPartitioned(i)){ continue; } cudaFree(fwdGhostFaceBuffer[i]); fwdGhostFaceBuffer[i] = NULL; cudaFree(backGhostFaceBuffer[i]); backGhostFaceBuffer[i] = NULL; } initGhostFaceBuffer = 0; }
// Find the best block size parameters for the Dslash and DslashXpay kernels void DiracDomainWall::Tune(cudaColorSpinorField &out, const cudaColorSpinorField &in, const cudaColorSpinorField &x) { setDslashTuning(QUDA_TUNE_YES); { // Tune Dslash TuneDiracDomainWallDslash dslashTune(*this, out, in); dslashTune.Benchmark(tuneDslash[0]); for (int i=0; i<4; i++) if (commDimPartitioned(i)) dslashTune.Benchmark(tuneDslash[i+1]); } { // Tune DslashXpay TuneDiracDomainWallDslashXpay dslashXpayTune(*this, out, in, x); dslashXpayTune.Benchmark(tuneDslashXpay[0]); for (int i=0; i<4; i++) if (commDimPartitioned(i)) dslashXpayTune.Benchmark(tuneDslashXpay[i+1]); } setDslashTuning(QUDA_TUNE_NO); }
void ColorSpinorField::createGhostZone() { if (verbose == QUDA_DEBUG_VERBOSE) printfQuda("Precision = %d, Subset = %d\n", precision, siteSubset); int num_faces = 1; int num_norm_faces=2; if (nSpin == 1) { //staggered num_faces=6; num_norm_faces=6; } // calculate size of ghost zone required int ghostVolume = 0; //BEGIN NEW: //temporal hack int dims = nDim == 5 ? (nDim - 1) : nDim; int x5 = nDim == 5 ? x[4] : 1; ///includes DW ghosts for (int i=0; i<dims; i++) { ghostFace[i] = 0; if (commDimPartitioned(i)) { ghostFace[i] = 1; for (int j=0; j<dims; j++) { if (i==j) continue; ghostFace[i] *= x[j]; } ghostFace[i] *= x5; ///temporal hack : extra dimension for DW ghosts if (i==0 && siteSubset != QUDA_FULL_SITE_SUBSET) ghostFace[i] /= 2; if (siteSubset == QUDA_FULL_SITE_SUBSET) ghostFace[i] /= 2; ghostVolume += ghostFace[i]; } if(i==0){ ghostOffset[i] = 0; ghostNormOffset[i] = 0; }else{ ghostOffset[i] = ghostOffset[i-1] + num_faces*ghostFace[i-1]; ghostNormOffset[i] = ghostNormOffset[i-1] + num_norm_faces*ghostFace[i-1]; } #ifdef MULTI_GPU if (verbose == QUDA_DEBUG_VERBOSE) printfQuda("face %d = %6d commDimPartitioned = %6d ghostOffset = %6d ghostNormOffset = %6d\n", i, ghostFace[i], commDimPartitioned(i), ghostOffset[i], ghostNormOffset[i]); #endif }//end of outmost for loop //END NEW int ghostNormVolume = num_norm_faces * ghostVolume; ghostVolume *= num_faces; if (verbose == QUDA_DEBUG_VERBOSE) printfQuda("Allocated ghost volume = %d, ghost norm volume %d\n", ghostVolume, ghostNormVolume); // ghost zones are calculated on c/b volumes #ifdef MULTI_GPU ghost_length = ghostVolume*nColor*nSpin*2; ghost_norm_length = (precision == QUDA_HALF_PRECISION) ? ghostNormVolume : 0; #else ghost_length = 0; ghost_norm_length = 0; #endif if (siteSubset == QUDA_FULL_SITE_SUBSET) { total_length = length + 2*ghost_length; // 2 ghost zones in a full field total_norm_length = 2*(stride + ghost_norm_length); // norm length = 2*stride } else { total_length = length + ghost_length; total_norm_length = (precision == QUDA_HALF_PRECISION) ? stride + ghost_norm_length : 0; // norm length = stride } if (precision != QUDA_HALF_PRECISION) total_norm_length = 0; if (verbose == QUDA_DEBUG_VERBOSE) { printfQuda("ghost length = %d, ghost norm length = %d\n", ghost_length, ghost_norm_length); printfQuda("total length = %d, total norm length = %d\n", total_length, total_norm_length); } }
void llfat_cuda(FullGauge cudaFatLink, FullGauge cudaSiteLink, FullStaple cudaStaple, FullStaple cudaStaple1, QudaGaugeParam* param, double* act_path_coeff) { int volume = param->X[0]*param->X[1]*param->X[2]*param->X[3]; int Vh = volume/2; dim3 gridDim(volume/BLOCK_DIM,1,1); dim3 halfGridDim(Vh/BLOCK_DIM,1,1); dim3 blockDim(BLOCK_DIM , 1, 1); QudaPrecision prec = cudaSiteLink.precision; QudaReconstructType recon = cudaSiteLink.reconstruct; if( ((param->X[0] % 2 != 0) ||(param->X[1] % 2 != 0) ||(param->X[2] % 2 != 0) ||(param->X[3] % 2 != 0)) && (recon == QUDA_RECONSTRUCT_12)){ errorQuda("12 reconstruct and odd dimensionsize is not supported by link fattening code (yet)\n"); } int nStream=9; cudaStream_t stream[nStream]; for(int i = 0;i < nStream; i++){ cudaStreamCreate(&stream[i]); } llfatOneLinkKernel(cudaFatLink, cudaSiteLink,cudaStaple, cudaStaple1, param, act_path_coeff); CUERR; llfat_kernel_param_t kparam; for(int i=0;i < 4;i++){ kparam.ghostDim[i] = commDimPartitioned(i); } int ktype[8] = { LLFAT_EXTERIOR_KERNEL_BACK_X, LLFAT_EXTERIOR_KERNEL_FWD_X, LLFAT_EXTERIOR_KERNEL_BACK_Y, LLFAT_EXTERIOR_KERNEL_FWD_Y, LLFAT_EXTERIOR_KERNEL_BACK_Z, LLFAT_EXTERIOR_KERNEL_FWD_Z, LLFAT_EXTERIOR_KERNEL_BACK_T, LLFAT_EXTERIOR_KERNEL_FWD_T, }; for(int dir = 0;dir < 4; dir++){ for(int nu = 0; nu < 4; nu++){ if (nu != dir){ //start of one call for(int k=3; k >= 0 ;k--){ if(!commDimPartitioned(k)) continue; kparam.kernel_type = ktype[2*k]; siteComputeGenStapleParityKernel((void*)cudaStaple.even, (void*)cudaStaple.odd, (void*)cudaSiteLink.even, (void*)cudaSiteLink.odd, (void*)cudaFatLink.even, (void*)cudaFatLink.odd, dir, nu, act_path_coeff[2], recon, prec, halfGridDim, kparam, &stream[2*k]); CUERR; exchange_gpu_staple_start(param->X, &cudaStaple, k, (int)QUDA_BACKWARDS, &stream[2*k]); CUERR; kparam.kernel_type = ktype[2*k+1]; siteComputeGenStapleParityKernel((void*)cudaStaple.even, (void*)cudaStaple.odd, (void*)cudaSiteLink.even, (void*)cudaSiteLink.odd, (void*)cudaFatLink.even, (void*)cudaFatLink.odd, dir, nu, act_path_coeff[2], recon, prec, halfGridDim, kparam, &stream[2*k+1]); CUERR; exchange_gpu_staple_start(param->X, &cudaStaple, k, (int)QUDA_FORWARDS, &stream[2*k+1]); CUERR; } kparam.kernel_type = LLFAT_INTERIOR_KERNEL; siteComputeGenStapleParityKernel((void*)cudaStaple.even, (void*)cudaStaple.odd, (void*)cudaSiteLink.even, (void*)cudaSiteLink.odd, (void*)cudaFatLink.even, (void*)cudaFatLink.odd, dir, nu, act_path_coeff[2], recon, prec, halfGridDim, kparam, &stream[nStream-1]); CUERR; for(int k=3; k >= 0 ;k--){ if(!commDimPartitioned(k)) continue; exchange_gpu_staple_comms(param->X, &cudaStaple, k, (int)QUDA_BACKWARDS, &stream[2*k]); CUERR; exchange_gpu_staple_comms(param->X, &cudaStaple, k, (int)QUDA_FORWARDS, &stream[2*k+1]); CUERR; } for(int k=3; k >= 0 ;k--){ if(!commDimPartitioned(k)) continue; exchange_gpu_staple_wait(param->X, &cudaStaple, k, (int)QUDA_BACKWARDS, &stream[2*k]); CUERR; exchange_gpu_staple_wait(param->X, &cudaStaple, k, (int)QUDA_FORWARDS, &stream[2*k+1]); CUERR; } for(int k=3; k >= 0 ;k--){ if(!commDimPartitioned(k)) continue; cudaStreamSynchronize(stream[2*k]); cudaStreamSynchronize(stream[2*k+1]); } //end //start of one call kparam.kernel_type = LLFAT_INTERIOR_KERNEL; computeGenStapleFieldParityKernel((void*)NULL, (void*)NULL, (void*)cudaSiteLink.even, (void*)cudaSiteLink.odd, (void*)cudaFatLink.even, (void*)cudaFatLink.odd, (void*)cudaStaple.even, (void*)cudaStaple.odd, dir, nu, 0, act_path_coeff[5], recon, prec, halfGridDim, kparam, &stream[nStream-1]); CUERR; //end for(int rho = 0; rho < 4; rho++){ if (rho != dir && rho != nu){ //start of one call for(int k=3; k >= 0 ;k--){ if(!commDimPartitioned(k)) continue; kparam.kernel_type = ktype[2*k]; computeGenStapleFieldParityKernel((void*)cudaStaple1.even, (void*)cudaStaple1.odd, (void*)cudaSiteLink.even, (void*)cudaSiteLink.odd, (void*)cudaFatLink.even, (void*)cudaFatLink.odd, (void*)cudaStaple.even, (void*)cudaStaple.odd, dir, rho, 1, act_path_coeff[3], recon, prec, halfGridDim, kparam, &stream[2*k]); CUERR; exchange_gpu_staple_start(param->X, &cudaStaple1, k, (int)QUDA_BACKWARDS, &stream[2*k]); CUERR; kparam.kernel_type = ktype[2*k+1]; computeGenStapleFieldParityKernel((void*)cudaStaple1.even, (void*)cudaStaple1.odd, (void*)cudaSiteLink.even, (void*)cudaSiteLink.odd, (void*)cudaFatLink.even, (void*)cudaFatLink.odd, (void*)cudaStaple.even, (void*)cudaStaple.odd, dir, rho, 1, act_path_coeff[3], recon, prec, halfGridDim, kparam, &stream[2*k+1]); CUERR; exchange_gpu_staple_start(param->X, &cudaStaple1, k, (int)QUDA_FORWARDS, &stream[2*k+1]); CUERR; } kparam.kernel_type = LLFAT_INTERIOR_KERNEL; computeGenStapleFieldParityKernel((void*)cudaStaple1.even, (void*)cudaStaple1.odd, (void*)cudaSiteLink.even, (void*)cudaSiteLink.odd, (void*)cudaFatLink.even, (void*)cudaFatLink.odd, (void*)cudaStaple.even, (void*)cudaStaple.odd, dir, rho, 1, act_path_coeff[3], recon, prec, halfGridDim, kparam, &stream[nStream-1]); CUERR; #ifdef MULTI_GPU for(int k=3; k >= 0 ;k--){ if(!commDimPartitioned(k)) continue; exchange_gpu_staple_comms(param->X, &cudaStaple1, k, (int)QUDA_BACKWARDS, &stream[2*k]); CUERR; exchange_gpu_staple_comms(param->X, &cudaStaple1, k, (int)QUDA_FORWARDS, &stream[2*k+1]); CUERR; } for(int k=3; k >= 0 ;k--){ if(!commDimPartitioned(k)) continue; exchange_gpu_staple_wait(param->X, &cudaStaple1, k, QUDA_BACKWARDS, &stream[2*k]); CUERR; exchange_gpu_staple_wait(param->X, &cudaStaple1, k, QUDA_FORWARDS, &stream[2*k+1]); CUERR; } for(int k=3; k >= 0 ;k--){ if(!commDimPartitioned(k)) continue; cudaStreamSynchronize(stream[2*k]); cudaStreamSynchronize(stream[2*k+1]); } #endif //end for(int sig = 0; sig < 4; sig++){ if (sig != dir && sig != nu && sig != rho){ //start of one call kparam.kernel_type = LLFAT_INTERIOR_KERNEL; computeGenStapleFieldParityKernel((void*)NULL, (void*)NULL, (void*)cudaSiteLink.even, (void*)cudaSiteLink.odd, (void*)cudaFatLink.even, (void*)cudaFatLink.odd, (void*)cudaStaple1.even, (void*)cudaStaple1.odd, dir, sig, 0, act_path_coeff[4], recon, prec, halfGridDim, kparam, &stream[nStream-1]); CUERR; //end } }//sig } }//rho } }//nu }//dir cudaThreadSynchronize(); checkCudaError(); for(int i=0;i < nStream; i++){ cudaStreamDestroy(stream[i]); } return; }