static void gauge_force_test(void) { int max_length = 6; initQuda(device); setVerbosityQuda(QUDA_VERBOSE,"",stdout); qudaGaugeParam = newQudaGaugeParam(); qudaGaugeParam.X[0] = xdim; qudaGaugeParam.X[1] = ydim; qudaGaugeParam.X[2] = zdim; qudaGaugeParam.X[3] = tdim; setDims(qudaGaugeParam.X); qudaGaugeParam.anisotropy = 1.0; qudaGaugeParam.cpu_prec = link_prec; qudaGaugeParam.cuda_prec = link_prec; qudaGaugeParam.cuda_prec_sloppy = link_prec; qudaGaugeParam.reconstruct = link_recon; qudaGaugeParam.reconstruct_sloppy = link_recon; qudaGaugeParam.type = QUDA_SU3_LINKS; // in this context, just means these are site links qudaGaugeParam.gauge_order = gauge_order; qudaGaugeParam.t_boundary = QUDA_PERIODIC_T; qudaGaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO; qudaGaugeParam.ga_pad = 0; qudaGaugeParam.mom_ga_pad = 0; size_t gSize = qudaGaugeParam.cpu_prec; void* sitelink; void* sitelink_1d; #ifdef GPU_DIRECT sitelink_1d = pinned_malloc(4*V*gaugeSiteSize*gSize); #else sitelink_1d = safe_malloc(4*V*gaugeSiteSize*gSize); #endif // this is a hack to have site link generated in 2d // then copied to 1d array in "MILC" format void* sitelink_2d[4]; #ifdef GPU_DIRECT for(int i=0;i<4;i++) sitelink_2d[i] = pinned_malloc(V*gaugeSiteSize*qudaGaugeParam.cpu_prec); #else for(int i=0;i<4;i++) sitelink_2d[i] = safe_malloc(V*gaugeSiteSize*qudaGaugeParam.cpu_prec); #endif // fills the gauge field with random numbers createSiteLinkCPU(sitelink_2d, qudaGaugeParam.cpu_prec, 0); //copy the 2d sitelink to 1d milc format for(int dir = 0; dir < 4; dir++){ for(int i=0; i < V; i++){ char* src = ((char*)sitelink_2d[dir]) + i * gaugeSiteSize* qudaGaugeParam.cpu_prec; char* dst = ((char*)sitelink_1d) + (4*i+dir)*gaugeSiteSize*qudaGaugeParam.cpu_prec ; memcpy(dst, src, gaugeSiteSize*qudaGaugeParam.cpu_prec); } } if (qudaGaugeParam.gauge_order == QUDA_MILC_GAUGE_ORDER){ sitelink = sitelink_1d; }else if (qudaGaugeParam.gauge_order == QUDA_QDP_GAUGE_ORDER) { sitelink = (void**)sitelink_2d; } else { errorQuda("Unsupported gauge order %d", qudaGaugeParam.gauge_order); } #ifdef MULTI_GPU void* sitelink_ex_2d[4]; void* sitelink_ex_1d; sitelink_ex_1d = pinned_malloc(4*V_ex*gaugeSiteSize*gSize); for(int i=0;i < 4;i++) sitelink_ex_2d[i] = pinned_malloc(V_ex*gaugeSiteSize*gSize); int X1= Z[0]; int X2= Z[1]; int X3= Z[2]; int X4= Z[3]; for(int i=0; i < V_ex; i++){ int sid = i; int oddBit=0; if(i >= Vh_ex){ sid = i - Vh_ex; oddBit = 1; } int za = sid/E1h; int x1h = sid - za*E1h; int zb = za/E2; int x2 = za - zb*E2; int x4 = zb/E3; int x3 = zb - x4*E3; int x1odd = (x2 + x3 + x4 + oddBit) & 1; int x1 = 2*x1h + x1odd; if( x1< 2 || x1 >= X1 +2 || x2< 2 || x2 >= X2 +2 || x3< 2 || x3 >= X3 +2 || x4< 2 || x4 >= X4 +2){ continue; } x1 = (x1 - 2 + X1) % X1; x2 = (x2 - 2 + X2) % X2; x3 = (x3 - 2 + X3) % X3; x4 = (x4 - 2 + X4) % X4; int idx = (x4*X3*X2*X1+x3*X2*X1+x2*X1+x1)>>1; if(oddBit){ idx += Vh; } for(int dir= 0; dir < 4; dir++){ char* src = (char*)sitelink_2d[dir]; char* dst = (char*)sitelink_ex_2d[dir]; memcpy(dst+i*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize); }//dir }//i for(int dir = 0; dir < 4; dir++){ for(int i=0; i < V_ex; i++){ char* src = ((char*)sitelink_ex_2d[dir]) + i * gaugeSiteSize* qudaGaugeParam.cpu_prec; char* dst = ((char*)sitelink_ex_1d) + (4*i+dir)*gaugeSiteSize*qudaGaugeParam.cpu_prec ; memcpy(dst, src, gaugeSiteSize*qudaGaugeParam.cpu_prec); } } #endif void* mom = safe_malloc(4*V*momSiteSize*gSize); void* refmom = safe_malloc(4*V*momSiteSize*gSize); memset(mom, 0, 4*V*momSiteSize*gSize); //initialize some data in cpuMom createMomCPU(mom, qudaGaugeParam.cpu_prec); memcpy(refmom, mom, 4*V*momSiteSize*gSize); double loop_coeff_d[sizeof(loop_coeff_f)/sizeof(float)]; for(unsigned int i=0;i < sizeof(loop_coeff_f)/sizeof(float); i++){ loop_coeff_d[i] = loop_coeff_f[i]; } void* loop_coeff; if(qudaGaugeParam.cuda_prec == QUDA_SINGLE_PRECISION){ loop_coeff = (void*)&loop_coeff_f[0]; }else{ loop_coeff = loop_coeff_d; } double eb3 = 0.3; int num_paths = sizeof(path_dir_x)/sizeof(path_dir_x[0]); int** input_path_buf[4]; for(int dir =0; dir < 4; dir++){ input_path_buf[dir] = (int**)safe_malloc(num_paths*sizeof(int*)); for(int i=0;i < num_paths;i++){ input_path_buf[dir][i] = (int*)safe_malloc(length[i]*sizeof(int)); if(dir == 0) memcpy(input_path_buf[dir][i], path_dir_x[i], length[i]*sizeof(int)); else if(dir ==1) memcpy(input_path_buf[dir][i], path_dir_y[i], length[i]*sizeof(int)); else if(dir ==2) memcpy(input_path_buf[dir][i], path_dir_z[i], length[i]*sizeof(int)); else if(dir ==3) memcpy(input_path_buf[dir][i], path_dir_t[i], length[i]*sizeof(int)); } } if (tune) { printfQuda("Tuning...\n"); setTuning(QUDA_TUNE_YES); } struct timeval t0, t1; double timeinfo[3]; /* Multiple execution to exclude warmup time in the first run*/ for (int i =0;i < attempts; i++){ gettimeofday(&t0, NULL); computeGaugeForceQuda(mom, sitelink, input_path_buf, length, loop_coeff_d, num_paths, max_length, eb3, &qudaGaugeParam, timeinfo); gettimeofday(&t1, NULL); } double total_time = t1.tv_sec - t0.tv_sec + 0.000001*(t1.tv_usec - t0.tv_usec); //The number comes from CPU implementation in MILC, gauge_force_imp.c int flops=153004; if (verify_results){ for(int i = 0;i < attempts;i++){ #ifdef MULTI_GPU //last arg=0 means no optimization for communication, i.e. exchange data in all directions //even they are not partitioned int R[4] = {2, 2, 2, 2}; exchange_cpu_sitelink_ex(qudaGaugeParam.X, R, (void**)sitelink_ex_2d, QUDA_QDP_GAUGE_ORDER, qudaGaugeParam.cpu_prec, 0, 4); gauge_force_reference(refmom, eb3, sitelink_2d, sitelink_ex_2d, qudaGaugeParam.cpu_prec, input_path_buf, length, loop_coeff, num_paths); #else gauge_force_reference(refmom, eb3, sitelink_2d, NULL, qudaGaugeParam.cpu_prec, input_path_buf, length, loop_coeff, num_paths); #endif } int res; res = compare_floats(mom, refmom, 4*V*momSiteSize, 1e-3, qudaGaugeParam.cpu_prec); strong_check_mom(mom, refmom, 4*V, qudaGaugeParam.cpu_prec); printf("Test %s\n",(1 == res) ? "PASSED" : "FAILED"); } double perf = 1.0* flops*V/(total_time*1e+9); double kernel_perf = 1.0*flops*V/(timeinfo[1]*1e+9); printf("init and cpu->gpu time: %.2f ms, kernel time: %.2f ms, gpu->cpu and cleanup time: %.2f total time =%.2f ms\n", timeinfo[0]*1e+3, timeinfo[1]*1e+3, timeinfo[2]*1e+3, total_time*1e+3); printf("kernel performance: %.2f GFLOPS, overall performance : %.2f GFLOPS\n", kernel_perf, perf); for(int dir = 0; dir < 4; dir++){ for(int i=0;i < num_paths; i++) host_free(input_path_buf[dir][i]); host_free(input_path_buf[dir]); } host_free(sitelink_1d); for(int dir=0;dir < 4;dir++) host_free(sitelink_2d[dir]); #ifdef MULTI_GPU host_free(sitelink_ex_1d); for(int dir=0; dir < 4; dir++) host_free(sitelink_ex_2d[dir]); #endif host_free(mom); host_free(refmom); endQuda(); }
void _initQuda() { if( quda_initialized ) return; if( g_debug_level > 0 ) if(g_proc_id == 0) printf("\n# QUDA: Detected QUDA version %d.%d.%d\n\n", QUDA_VERSION_MAJOR, QUDA_VERSION_MINOR, QUDA_VERSION_SUBMINOR); if( QUDA_VERSION_MAJOR == 0 && QUDA_VERSION_MINOR < 7) { fprintf(stderr, "Error: minimum QUDA version required is 0.7.0 (for support of chiral basis and removal of bug in mass normalization with preconditioning).\n"); exit(-2); } gauge_param = newQudaGaugeParam(); inv_param = newQudaInvertParam(); // *** QUDA parameters begin here (sloppy prec. will be adjusted in invert) QudaPrecision cpu_prec = QUDA_DOUBLE_PRECISION; QudaPrecision cuda_prec = QUDA_DOUBLE_PRECISION; QudaPrecision cuda_prec_sloppy = QUDA_SINGLE_PRECISION; QudaPrecision cuda_prec_precondition = QUDA_HALF_PRECISION; QudaTune tune = QUDA_TUNE_YES; // *** the remainder should not be changed for this application // local lattice size #if USE_LZ_LY_LX_T gauge_param.X[0] = LZ; gauge_param.X[1] = LY; gauge_param.X[2] = LX; gauge_param.X[3] = T; #else gauge_param.X[0] = LX; gauge_param.X[1] = LY; gauge_param.X[2] = LZ; gauge_param.X[3] = T; #endif inv_param.Ls = 1; gauge_param.anisotropy = 1.0; gauge_param.type = QUDA_WILSON_LINKS; gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER; gauge_param.cpu_prec = cpu_prec; gauge_param.cuda_prec = cuda_prec; gauge_param.reconstruct = 18; gauge_param.cuda_prec_sloppy = cuda_prec_sloppy; gauge_param.reconstruct_sloppy = 18; gauge_param.cuda_prec_precondition = cuda_prec_precondition; gauge_param.reconstruct_precondition = 18; gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO; inv_param.dagger = QUDA_DAG_NO; inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION; inv_param.solver_normalization = QUDA_DEFAULT_NORMALIZATION; inv_param.pipeline = 0; inv_param.gcrNkrylov = 10; // require both L2 relative and heavy quark residual to determine convergence // inv_param.residual_type = (QudaResidualType)(QUDA_L2_RELATIVE_RESIDUAL | QUDA_HEAVY_QUARK_RESIDUAL); inv_param.tol_hq = 1.0;//1e-3; // specify a tolerance for the residual for heavy quark residual inv_param.reliable_delta = 1e-2; // ignored by multi-shift solver // domain decomposition preconditioner parameters inv_param.inv_type_precondition = QUDA_CG_INVERTER; inv_param.schwarz_type = QUDA_ADDITIVE_SCHWARZ; inv_param.precondition_cycle = 1; inv_param.tol_precondition = 1e-1; inv_param.maxiter_precondition = 10; inv_param.verbosity_precondition = QUDA_SILENT; inv_param.cuda_prec_precondition = cuda_prec_precondition; inv_param.omega = 1.0; inv_param.cpu_prec = cpu_prec; inv_param.cuda_prec = cuda_prec; inv_param.cuda_prec_sloppy = cuda_prec_sloppy; inv_param.clover_cpu_prec = cpu_prec; inv_param.clover_cuda_prec = cuda_prec; inv_param.clover_cuda_prec_sloppy = cuda_prec_sloppy; inv_param.clover_cuda_prec_precondition = cuda_prec_precondition; inv_param.preserve_source = QUDA_PRESERVE_SOURCE_YES; inv_param.gamma_basis = QUDA_CHIRAL_GAMMA_BASIS; inv_param.dirac_order = QUDA_DIRAC_ORDER; inv_param.input_location = QUDA_CPU_FIELD_LOCATION; inv_param.output_location = QUDA_CPU_FIELD_LOCATION; inv_param.tune = tune ? QUDA_TUNE_YES : QUDA_TUNE_NO; gauge_param.ga_pad = 0; // 24*24*24/2; inv_param.sp_pad = 0; // 24*24*24/2; inv_param.cl_pad = 0; // 24*24*24/2; // For multi-GPU, ga_pad must be large enough to store a time-slice int x_face_size = gauge_param.X[1]*gauge_param.X[2]*gauge_param.X[3]/2; int y_face_size = gauge_param.X[0]*gauge_param.X[2]*gauge_param.X[3]/2; int z_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[3]/2; int t_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[2]/2; int pad_size =MAX(x_face_size, y_face_size); pad_size = MAX(pad_size, z_face_size); pad_size = MAX(pad_size, t_face_size); gauge_param.ga_pad = pad_size; // solver verbosity if( g_debug_level == 0 ) inv_param.verbosity = QUDA_SILENT; else if( g_debug_level == 1 ) inv_param.verbosity = QUDA_SUMMARIZE; else inv_param.verbosity = QUDA_VERBOSE; // general verbosity setVerbosityQuda( QUDA_SUMMARIZE, "# QUDA: ", stdout); // declare the grid mapping used for communications in a multi-GPU grid #if USE_LZ_LY_LX_T int grid[4] = {g_nproc_z, g_nproc_y, g_nproc_x, g_nproc_t}; #else int grid[4] = {g_nproc_x, g_nproc_y, g_nproc_z, g_nproc_t}; #endif initCommsGridQuda(4, grid, commsMap, NULL); // alloc gauge_quda size_t gSize = (gauge_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float); for (int dir = 0; dir < 4; dir++) { gauge_quda[dir] = (double*) malloc(VOLUME*18*gSize); if(gauge_quda[dir] == NULL) { fprintf(stderr, "_initQuda: malloc for gauge_quda[dir] failed"); exit(-2); } } // alloc space for a temp. spinor, used throughout this module tempSpinor = (double*)malloc( 2*VOLUME*24*sizeof(double) ); /* factor 2 for doublet */ if(tempSpinor == NULL) { fprintf(stderr, "_initQuda: malloc for tempSpinor failed"); exit(-2); } // initialize the QUDA library #ifdef MPI initQuda(-1); //sets device numbers automatically #else initQuda(0); //scalar build: use device 0 #endif quda_initialized = 1; }