int invert_test(void) { QudaGaugeParam gaugeParam = newQudaGaugeParam(); QudaInvertParam inv_param = newQudaInvertParam(); set_params(&gaugeParam, &inv_param, xdim, ydim, zdim, tdim, cpu_prec, prec, prec_sloppy, link_recon, link_recon_sloppy, mass, tol, 500, 1e-3, 0.8); // this must be before the FaceBuffer is created (this is because it allocates pinned memory - FIXME) initQuda(device); setDims(gaugeParam.X); setSpinorSiteSize(6); size_t gSize = (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float); for (int dir = 0; dir < 4; dir++) { qdp_fatlink[dir] = malloc(V*gaugeSiteSize*gSize); qdp_longlink[dir] = malloc(V*gaugeSiteSize*gSize); } fatlink = malloc(4*V*gaugeSiteSize*gSize); longlink = malloc(4*V*gaugeSiteSize*gSize); construct_fat_long_gauge_field(qdp_fatlink, qdp_longlink, 1, gaugeParam.cpu_prec, &gaugeParam, dslash_type); const double cos_pi_3 = 0.5; // Cos(pi/3) const double sin_pi_3 = sqrt(0.75); // Sin(pi/3) for(int dir=0; dir<4; ++dir){ for(int i=0; i<V; ++i){ for(int j=0; j<gaugeSiteSize; ++j){ if(gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION){ ((double*)qdp_fatlink[dir])[i*gaugeSiteSize + j] = 0.5*rand()/RAND_MAX; if(link_recon != QUDA_RECONSTRUCT_8 && link_recon != QUDA_RECONSTRUCT_12){ // incorporate non-trivial phase into long links if(j%2 == 0){ const double real = ((double*)qdp_longlink[dir])[i*gaugeSiteSize + j]; const double imag = ((double*)qdp_longlink[dir])[i*gaugeSiteSize + j + 1]; ((double*)qdp_longlink[dir])[i*gaugeSiteSize + j] = real*cos_pi_3 - imag*sin_pi_3; ((double*)qdp_longlink[dir])[i*gaugeSiteSize + j + 1] = real*sin_pi_3 + imag*cos_pi_3; } } ((double*)fatlink)[(i*4 + dir)*gaugeSiteSize + j] = ((double*)qdp_fatlink[dir])[i*gaugeSiteSize + j]; ((double*)longlink)[(i*4 + dir)*gaugeSiteSize + j] = ((double*)qdp_longlink[dir])[i*gaugeSiteSize + j]; }else{ ((float*)qdp_fatlink[dir])[i] = 0.5*rand()/RAND_MAX; if(link_recon != QUDA_RECONSTRUCT_8 && link_recon != QUDA_RECONSTRUCT_12){ // incorporate non-trivial phase into long links if(j%2 == 0){ const float real = ((float*)qdp_longlink[dir])[i*gaugeSiteSize + j]; const float imag = ((float*)qdp_longlink[dir])[i*gaugeSiteSize + j + 1]; ((float*)qdp_longlink[dir])[i*gaugeSiteSize + j] = real*cos_pi_3 - imag*sin_pi_3; ((float*)qdp_longlink[dir])[i*gaugeSiteSize + j + 1] = real*sin_pi_3 + imag*cos_pi_3; } } ((double*)fatlink)[(i*4 + dir)*gaugeSiteSize + j] = ((double*)qdp_fatlink[dir])[i*gaugeSiteSize + j]; ((float*)fatlink)[(i*4 + dir)*gaugeSiteSize + j] = ((float*)qdp_fatlink[dir])[i*gaugeSiteSize + j]; ((float*)longlink)[(i*4 + dir)*gaugeSiteSize + j] = ((float*)qdp_longlink[dir])[i*gaugeSiteSize + j]; } } } } ColorSpinorParam csParam; csParam.nColor=3; csParam.nSpin=1; csParam.nDim=4; for(int d = 0; d < 4; d++) { csParam.x[d] = gaugeParam.X[d]; } csParam.x[0] /= 2; csParam.precision = inv_param.cpu_prec; csParam.pad = 0; csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; csParam.gammaBasis = inv_param.gamma_basis; csParam.create = QUDA_ZERO_FIELD_CREATE; in = new cpuColorSpinorField(csParam); out = new cpuColorSpinorField(csParam); ref = new cpuColorSpinorField(csParam); tmp = new cpuColorSpinorField(csParam); if (inv_param.cpu_prec == QUDA_SINGLE_PRECISION){ constructSpinorField((float*)in->V()); }else{ constructSpinorField((double*)in->V()); } #ifdef MULTI_GPU int tmp_value = MAX(ydim*zdim*tdim/2, xdim*zdim*tdim/2); tmp_value = MAX(tmp_value, xdim*ydim*tdim/2); tmp_value = MAX(tmp_value, xdim*ydim*zdim/2); int fat_pad = tmp_value; int link_pad = 3*tmp_value; // FIXME: currently assume staggered is SU(3) gaugeParam.type = dslash_type == QUDA_STAGGERED_DSLASH ? QUDA_SU3_LINKS : QUDA_ASQTAD_FAT_LINKS; gaugeParam.reconstruct = QUDA_RECONSTRUCT_NO; GaugeFieldParam cpuFatParam(fatlink, gaugeParam); cpuFat = new cpuGaugeField(cpuFatParam); ghost_fatlink = (void**)cpuFat->Ghost(); gaugeParam.type = QUDA_ASQTAD_LONG_LINKS; GaugeFieldParam cpuLongParam(longlink, gaugeParam); cpuLong = new cpuGaugeField(cpuLongParam); ghost_longlink = (void**)cpuLong->Ghost(); gaugeParam.type = dslash_type == QUDA_STAGGERED_DSLASH ? QUDA_SU3_LINKS : QUDA_ASQTAD_FAT_LINKS; gaugeParam.ga_pad = fat_pad; gaugeParam.reconstruct= gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; gaugeParam.cuda_prec_precondition = QUDA_HALF_PRECISION; loadGaugeQuda(fatlink, &gaugeParam); if (dslash_type == QUDA_ASQTAD_DSLASH) { gaugeParam.type = QUDA_ASQTAD_LONG_LINKS; gaugeParam.ga_pad = link_pad; gaugeParam.reconstruct= link_recon; gaugeParam.reconstruct_sloppy = link_recon_sloppy; loadGaugeQuda(longlink, &gaugeParam); } #else gaugeParam.type = QUDA_ASQTAD_FAT_LINKS; gaugeParam.reconstruct = gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; gaugeParam.cuda_prec_precondition = QUDA_HALF_PRECISION; loadGaugeQuda(fatlink, &gaugeParam); if (dslash_type == QUDA_ASQTAD_DSLASH) { gaugeParam.type = QUDA_ASQTAD_LONG_LINKS; gaugeParam.reconstruct = link_recon; gaugeParam.reconstruct_sloppy = link_recon_sloppy; loadGaugeQuda(longlink, &gaugeParam); } #endif double time0 = -((double)clock()); // Start the timer double nrm2=0; double src2=0; int ret = 0; switch(test_type){ case 0: //even if(inv_type == QUDA_GCR_INVERTER){ inv_param.inv_type = QUDA_GCR_INVERTER; inv_param.gcrNkrylov = 50; }else if(inv_type == QUDA_PCG_INVERTER){ inv_param.inv_type = QUDA_PCG_INVERTER; } inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; invertQuda(out->V(), in->V(), &inv_param); time0 += clock(); time0 /= CLOCKS_PER_SEC; #ifdef MULTI_GPU matdagmat_mg4dir(ref, qdp_fatlink, qdp_longlink, ghost_fatlink, ghost_longlink, out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, QUDA_EVEN_PARITY); #else matdagmat(ref->V(), qdp_fatlink, qdp_longlink, out->V(), mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), QUDA_EVEN_PARITY); #endif mxpy(in->V(), ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); nrm2 = norm_2(ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); src2 = norm_2(in->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); break; case 1: //odd if(inv_type == QUDA_GCR_INVERTER){ inv_param.inv_type = QUDA_GCR_INVERTER; inv_param.gcrNkrylov = 50; }else if(inv_type == QUDA_PCG_INVERTER){ inv_param.inv_type = QUDA_PCG_INVERTER; } inv_param.matpc_type = QUDA_MATPC_ODD_ODD; invertQuda(out->V(), in->V(), &inv_param); time0 += clock(); // stop the timer time0 /= CLOCKS_PER_SEC; #ifdef MULTI_GPU matdagmat_mg4dir(ref, qdp_fatlink, qdp_longlink, ghost_fatlink, ghost_longlink, out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, QUDA_ODD_PARITY); #else matdagmat(ref->V(), qdp_fatlink, qdp_longlink, out->V(), mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), QUDA_ODD_PARITY); #endif mxpy(in->V(), ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); nrm2 = norm_2(ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); src2 = norm_2(in->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); break; case 2: //full spinor errorQuda("full spinor not supported\n"); break; case 3: //multi mass CG, even case 4: #define NUM_OFFSETS 12 { double masses[NUM_OFFSETS] ={0.002, 0.0021, 0.0064, 0.070, 0.077, 0.081, 0.1, 0.11, 0.12, 0.13, 0.14, 0.205}; inv_param.num_offset = NUM_OFFSETS; // these can be set independently for (int i=0; i<inv_param.num_offset; i++) { inv_param.tol_offset[i] = inv_param.tol; inv_param.tol_hq_offset[i] = inv_param.tol_hq; } void* outArray[NUM_OFFSETS]; int len; cpuColorSpinorField* spinorOutArray[NUM_OFFSETS]; spinorOutArray[0] = out; for(int i=1;i < inv_param.num_offset; i++){ spinorOutArray[i] = new cpuColorSpinorField(csParam); } for(int i=0;i < inv_param.num_offset; i++){ outArray[i] = spinorOutArray[i]->V(); inv_param.offset[i] = 4*masses[i]*masses[i]; } len=Vh; if (test_type == 3) { inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; } else { inv_param.matpc_type = QUDA_MATPC_ODD_ODD; } invertMultiShiftQuda(outArray, in->V(), &inv_param); cudaDeviceSynchronize(); time0 += clock(); // stop the timer time0 /= CLOCKS_PER_SEC; printfQuda("done: total time = %g secs, compute time = %g, %i iter / %g secs = %g gflops\n", time0, inv_param.secs, inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs); printfQuda("checking the solution\n"); QudaParity parity = QUDA_INVALID_PARITY; if (inv_param.solve_type == QUDA_NORMOP_SOLVE){ //parity = QUDA_EVENODD_PARITY; errorQuda("full parity not supported\n"); }else if (inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN){ parity = QUDA_EVEN_PARITY; }else if (inv_param.matpc_type == QUDA_MATPC_ODD_ODD){ parity = QUDA_ODD_PARITY; }else{ errorQuda("ERROR: invalid spinor parity \n"); exit(1); } for(int i=0;i < inv_param.num_offset;i++){ printfQuda("%dth solution: mass=%f, ", i, masses[i]); #ifdef MULTI_GPU matdagmat_mg4dir(ref, qdp_fatlink, qdp_longlink, ghost_fatlink, ghost_longlink, spinorOutArray[i], masses[i], 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, parity); #else matdagmat(ref->V(), qdp_fatlink, qdp_longlink, outArray[i], masses[i], 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), parity); #endif mxpy(in->V(), ref->V(), len*mySpinorSiteSize, inv_param.cpu_prec); double nrm2 = norm_2(ref->V(), len*mySpinorSiteSize, inv_param.cpu_prec); double src2 = norm_2(in->V(), len*mySpinorSiteSize, inv_param.cpu_prec); double hqr = sqrt(HeavyQuarkResidualNormCpu(*spinorOutArray[i], *ref).z); double l2r = sqrt(nrm2/src2); printfQuda("Shift %d residuals: (L2 relative) tol %g, QUDA = %g, host = %g; (heavy-quark) tol %g, QUDA = %g, host = %g\n", i, inv_param.tol_offset[i], inv_param.true_res_offset[i], l2r, inv_param.tol_hq_offset[i], inv_param.true_res_hq_offset[i], hqr); //emperical, if the cpu residue is more than 1 order the target accuracy, the it fails to converge if (sqrt(nrm2/src2) > 10*inv_param.tol_offset[i]){ ret |=1; } } for(int i=1; i < inv_param.num_offset;i++) delete spinorOutArray[i]; } break; default: errorQuda("Unsupported test type"); }//switch if (test_type <=2){ double hqr = sqrt(HeavyQuarkResidualNormCpu(*out, *ref).z); double l2r = sqrt(nrm2/src2); printfQuda("Residuals: (L2 relative) tol %g, QUDA = %g, host = %g; (heavy-quark) tol %g, QUDA = %g, host = %g\n", inv_param.tol, inv_param.true_res, l2r, inv_param.tol_hq, inv_param.true_res_hq, hqr); printfQuda("done: total time = %g secs, compute time = %g secs, %i iter / %g secs = %g gflops, \n", time0, inv_param.secs, inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs); } end(); return ret; }
void init(int argc, char **argv) { cuda_prec = prec; gauge_param = newQudaGaugeParam(); inv_param = newQudaInvertParam(); gauge_param.X[0] = xdim; gauge_param.X[1] = ydim; gauge_param.X[2] = zdim; gauge_param.X[3] = tdim; if (dslash_type == QUDA_DOMAIN_WALL_DSLASH) { dw_setDims(gauge_param.X, myLs); kernelPackT = true; } else { setDims(gauge_param.X); Ls = 1; kernelPackT = false; } setSpinorSiteSize(24); gauge_param.anisotropy = 1.0; gauge_param.type = QUDA_WILSON_LINKS; gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER; gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T; gauge_param.cpu_prec = cpu_prec; gauge_param.cuda_prec = cuda_prec; gauge_param.reconstruct = link_recon; gauge_param.reconstruct_sloppy = link_recon; gauge_param.cuda_prec_sloppy = cuda_prec; gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO; inv_param.kappa = 0.1; if (dslash_type == QUDA_TWISTED_MASS_DSLASH) { inv_param.mu = 0.01; inv_param.twist_flavor = QUDA_TWIST_MINUS; } else if (dslash_type == QUDA_DOMAIN_WALL_DSLASH) { inv_param.mass = 0.01; inv_param.m5 = -1.5; kappa5 = 0.5/(5 + inv_param.m5); } inv_param.Ls = Ls; inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; inv_param.dagger = dagger; inv_param.cpu_prec = cpu_prec; if (inv_param.cpu_prec != gauge_param.cpu_prec) { errorQuda("Gauge and spinor CPU precisions must match"); } inv_param.cuda_prec = cuda_prec; inv_param.input_location = QUDA_CPU_FIELD_LOCATION; inv_param.output_location = QUDA_CPU_FIELD_LOCATION; #ifndef MULTI_GPU // free parameter for single GPU gauge_param.ga_pad = 0; #else // must be this one c/b face for multi gpu int x_face_size = Ls*gauge_param.X[1]*gauge_param.X[2]*gauge_param.X[3]/2; int y_face_size = Ls*gauge_param.X[0]*gauge_param.X[2]*gauge_param.X[3]/2; int z_face_size = Ls*gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[3]/2; int t_face_size = Ls*gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[2]/2; int pad_size =MAX(x_face_size, y_face_size); pad_size = MAX(pad_size, z_face_size); pad_size = MAX(pad_size, t_face_size); gauge_param.ga_pad = pad_size; #endif inv_param.sp_pad = 0; inv_param.cl_pad = 0; //inv_param.sp_pad = 24*24*24; //inv_param.cl_pad = 24*24*24; inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // test code only supports DeGrand-Rossi Basis inv_param.dirac_order = QUDA_DIRAC_ORDER; switch(test_type) { case 0: case 1: inv_param.solution_type = QUDA_MATPC_SOLUTION; break; case 2: inv_param.solution_type = QUDA_MAT_SOLUTION; break; case 3: inv_param.solution_type = QUDA_MATPCDAG_MATPC_SOLUTION; break; case 4: inv_param.solution_type = QUDA_MATDAG_MAT_SOLUTION; break; default: errorQuda("Test type %d not defined\n", test_type); } inv_param.dslash_type = dslash_type; if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) { inv_param.clover_cpu_prec = cpu_prec; inv_param.clover_cuda_prec = cuda_prec; inv_param.clover_cuda_prec_sloppy = inv_param.clover_cuda_prec; inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER; //if (test_type > 0) { hostClover = malloc(V*cloverSiteSize*inv_param.clover_cpu_prec); hostCloverInv = hostClover; // fake it /*} else { hostClover = NULL; hostCloverInv = malloc(V*cloverSiteSize*inv_param.clover_cpu_prec); }*/ } else if (dslash_type == QUDA_TWISTED_MASS_DSLASH) { } //inv_param.verbosity = QUDA_VERBOSE; // construct input fields for (int dir = 0; dir < 4; dir++) hostGauge[dir] = malloc(V*gaugeSiteSize*gauge_param.cpu_prec); ColorSpinorParam csParam; csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION; csParam.nColor = 3; csParam.nSpin = 4; if (dslash_type == QUDA_TWISTED_MASS_DSLASH) { csParam.twistFlavor = inv_param.twist_flavor; } csParam.nDim = 4; for (int d=0; d<4; d++) csParam.x[d] = gauge_param.X[d]; if (dslash_type == QUDA_DOMAIN_WALL_DSLASH) { csParam.nDim = 5; csParam.x[4] = Ls; } csParam.precision = inv_param.cpu_prec; csParam.pad = 0; if (test_type < 2 || test_type ==3) { csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; csParam.x[0] /= 2; } else { csParam.siteSubset = QUDA_FULL_SITE_SUBSET; } csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; csParam.gammaBasis = inv_param.gamma_basis; csParam.create = QUDA_ZERO_FIELD_CREATE; //csParam.verbose = QUDA_DEBUG_VERBOSE; spinor = new cpuColorSpinorField(csParam); spinorOut = new cpuColorSpinorField(csParam); spinorRef = new cpuColorSpinorField(csParam); spinorTmp = new cpuColorSpinorField(csParam); csParam.siteSubset = QUDA_FULL_SITE_SUBSET; csParam.x[0] = gauge_param.X[0]; printfQuda("Randomizing fields... "); if (strcmp(latfile,"")) { // load in the command line supplied gauge field read_gauge_field(latfile, hostGauge, gauge_param.cpu_prec, gauge_param.X, argc, argv); construct_gauge_field(hostGauge, 2, gauge_param.cpu_prec, &gauge_param); } else { // else generate a random SU(3) field construct_gauge_field(hostGauge, 1, gauge_param.cpu_prec, &gauge_param); } spinor->Source(QUDA_RANDOM_SOURCE); if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) { double norm = 0.0; // clover components are random numbers in the range (-norm, norm) double diag = 1.0; // constant added to the diagonal if (test_type == 2 || test_type == 4) { construct_clover_field(hostClover, norm, diag, inv_param.clover_cpu_prec); } else { construct_clover_field(hostCloverInv, norm, diag, inv_param.clover_cpu_prec); } } printfQuda("done.\n"); fflush(stdout); initQuda(device); printfQuda("Sending gauge field to GPU\n"); loadGaugeQuda(hostGauge, &gauge_param); if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) { printfQuda("Sending clover field to GPU\n"); loadCloverQuda(hostClover, hostCloverInv, &inv_param); //clover = cudaCloverPrecise; } if (!transfer) { csParam.fieldLocation = QUDA_CUDA_FIELD_LOCATION; csParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS; csParam.pad = inv_param.sp_pad; csParam.precision = inv_param.cuda_prec; if (csParam.precision == QUDA_DOUBLE_PRECISION ) { csParam.fieldOrder = QUDA_FLOAT2_FIELD_ORDER; } else { /* Single and half */ csParam.fieldOrder = QUDA_FLOAT4_FIELD_ORDER; } if (test_type < 2 || test_type == 3) { csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; csParam.x[0] /= 2; } printfQuda("Creating cudaSpinor\n"); cudaSpinor = new cudaColorSpinorField(csParam); printfQuda("Creating cudaSpinorOut\n"); cudaSpinorOut = new cudaColorSpinorField(csParam); tmp1 = new cudaColorSpinorField(csParam); if (test_type == 2 || test_type == 4) csParam.x[0] /= 2; csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; tmp2 = new cudaColorSpinorField(csParam); printfQuda("Sending spinor field to GPU\n"); *cudaSpinor = *spinor; double cpu_norm = norm2(*spinor); double cuda_norm = norm2(*cudaSpinor); printfQuda("Source: CPU = %e, CUDA = %e\n", cpu_norm, cuda_norm); bool pc = (test_type != 2 && test_type != 4); DiracParam diracParam; setDiracParam(diracParam, &inv_param, pc); diracParam.verbose = QUDA_VERBOSE; diracParam.tmp1 = tmp1; diracParam.tmp2 = tmp2; dirac = Dirac::create(diracParam); } else { double cpu_norm = norm2(*spinor); printfQuda("Source: CPU = %e\n", cpu_norm); } }
int main(int argc, char **argv) { for (int i = 1; i < argc; i++){ if(process_command_line_option(argc, argv, &i) == 0){ continue; } printfQuda("ERROR: Invalid option:%s\n", argv[i]); usage(argv); } if (prec_sloppy == QUDA_INVALID_PRECISION){ prec_sloppy = prec; } if (link_recon_sloppy == QUDA_RECONSTRUCT_INVALID){ link_recon_sloppy = link_recon; } // initialize QMP/MPI, QUDA comms grid and RNG (test_util.cpp) initComms(argc, argv, gridsize_from_cmdline); display_test_info(); // *** QUDA parameters begin here. if (dslash_type != QUDA_WILSON_DSLASH && dslash_type != QUDA_CLOVER_WILSON_DSLASH && dslash_type != QUDA_TWISTED_MASS_DSLASH && dslash_type != QUDA_DOMAIN_WALL_4D_DSLASH && dslash_type != QUDA_MOBIUS_DWF_DSLASH && dslash_type != QUDA_TWISTED_CLOVER_DSLASH && dslash_type != QUDA_DOMAIN_WALL_DSLASH) { printfQuda("dslash_type %d not supported\n", dslash_type); exit(0); } QudaPrecision cpu_prec = QUDA_DOUBLE_PRECISION; QudaPrecision cuda_prec = prec; QudaPrecision cuda_prec_sloppy = prec_sloppy; QudaPrecision cuda_prec_precondition = QUDA_HALF_PRECISION; QudaGaugeParam gauge_param = newQudaGaugeParam(); QudaInvertParam inv_param = newQudaInvertParam(); double kappa5; gauge_param.X[0] = xdim; gauge_param.X[1] = ydim; gauge_param.X[2] = zdim; gauge_param.X[3] = tdim; inv_param.Ls = 1; gauge_param.anisotropy = 1.0; gauge_param.type = QUDA_WILSON_LINKS; gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER; gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T; gauge_param.cpu_prec = cpu_prec; gauge_param.cuda_prec = cuda_prec; gauge_param.reconstruct = link_recon; gauge_param.cuda_prec_sloppy = cuda_prec_sloppy; gauge_param.reconstruct_sloppy = link_recon_sloppy; gauge_param.cuda_prec_precondition = cuda_prec_precondition; gauge_param.reconstruct_precondition = link_recon_sloppy; gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO; inv_param.dslash_type = dslash_type; inv_param.mass = mass; inv_param.kappa = 1.0 / (2.0 * (1 + 3/gauge_param.anisotropy + mass)); if (dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { inv_param.mu = 0.12; inv_param.epsilon = 0.1385; inv_param.twist_flavor = twist_flavor; inv_param.Ls = (inv_param.twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) ? 2 : 1; } else if (dslash_type == QUDA_DOMAIN_WALL_DSLASH || dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH) { inv_param.m5 = -1.8; kappa5 = 0.5/(5 + inv_param.m5); inv_param.Ls = Lsdim; } else if (dslash_type == QUDA_MOBIUS_DWF_DSLASH) { inv_param.m5 = -1.8; kappa5 = 0.5/(5 + inv_param.m5); inv_param.Ls = Lsdim; for(int k = 0; k < Lsdim; k++) { // b5[k], c[k] values are chosen for arbitrary values, // but the difference of them are same as 1.0 inv_param.b_5[k] = 1.452; inv_param.c_5[k] = 0.452; } } // offsets used only by multi-shift solver inv_param.num_offset = 4; double offset[4] = {0.01, 0.02, 0.03, 0.04}; for (int i=0; i<inv_param.num_offset; i++) inv_param.offset[i] = offset[i]; inv_param.inv_type = inv_type; if (inv_param.dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { inv_param.solution_type = QUDA_MAT_SOLUTION; } else { inv_param.solution_type = multishift ? QUDA_MATPCDAG_MATPC_SOLUTION : QUDA_MATPC_SOLUTION; } inv_param.matpc_type = matpc_type; inv_param.dagger = QUDA_DAG_NO; inv_param.mass_normalization = normalization; inv_param.solver_normalization = QUDA_DEFAULT_NORMALIZATION; if (dslash_type == QUDA_DOMAIN_WALL_DSLASH || dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH || dslash_type == QUDA_MOBIUS_DWF_DSLASH || dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH || multishift || inv_type == QUDA_CG_INVERTER) { inv_param.solve_type = QUDA_NORMOP_PC_SOLVE; } else { inv_param.solve_type = QUDA_DIRECT_PC_SOLVE; } inv_param.pipeline = 0; inv_param.Nsteps = 2; inv_param.gcrNkrylov = 10; inv_param.tol = 1e-7; inv_param.tol_restart = 1e-3; //now theoretical background for this parameter... #if __COMPUTE_CAPABILITY__ >= 200 // require both L2 relative and heavy quark residual to determine convergence inv_param.residual_type = static_cast<QudaResidualType>(QUDA_L2_RELATIVE_RESIDUAL | QUDA_HEAVY_QUARK_RESIDUAL); inv_param.tol_hq = 1e-3; // specify a tolerance for the residual for heavy quark residual #else // Pre Fermi architecture only supports L2 relative residual norm inv_param.residual_type = QUDA_L2_RELATIVE_RESIDUAL; #endif // these can be set individually for (int i=0; i<inv_param.num_offset; i++) { inv_param.tol_offset[i] = inv_param.tol; inv_param.tol_hq_offset[i] = inv_param.tol_hq; } inv_param.maxiter = 10000; inv_param.reliable_delta = 1e-1; inv_param.use_sloppy_partial_accumulator = 0; inv_param.max_res_increase = 1; // domain decomposition preconditioner parameters inv_param.inv_type_precondition = precon_type; inv_param.schwarz_type = QUDA_ADDITIVE_SCHWARZ; inv_param.precondition_cycle = 1; inv_param.tol_precondition = 1e-1; inv_param.maxiter_precondition = 10; inv_param.verbosity_precondition = QUDA_SILENT; inv_param.cuda_prec_precondition = cuda_prec_precondition; inv_param.omega = 1.0; inv_param.cpu_prec = cpu_prec; inv_param.cuda_prec = cuda_prec; inv_param.cuda_prec_sloppy = cuda_prec_sloppy; inv_param.preserve_source = QUDA_PRESERVE_SOURCE_NO; inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; inv_param.dirac_order = QUDA_DIRAC_ORDER; inv_param.input_location = QUDA_CPU_FIELD_LOCATION; inv_param.output_location = QUDA_CPU_FIELD_LOCATION; inv_param.tune = tune ? QUDA_TUNE_YES : QUDA_TUNE_NO; gauge_param.ga_pad = 0; // 24*24*24/2; inv_param.sp_pad = 0; // 24*24*24/2; inv_param.cl_pad = 0; // 24*24*24/2; // For multi-GPU, ga_pad must be large enough to store a time-slice #ifdef MULTI_GPU int x_face_size = gauge_param.X[1]*gauge_param.X[2]*gauge_param.X[3]/2; int y_face_size = gauge_param.X[0]*gauge_param.X[2]*gauge_param.X[3]/2; int z_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[3]/2; int t_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[2]/2; int pad_size =MAX(x_face_size, y_face_size); pad_size = MAX(pad_size, z_face_size); pad_size = MAX(pad_size, t_face_size); gauge_param.ga_pad = pad_size; #endif if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { inv_param.clover_cpu_prec = cpu_prec; inv_param.clover_cuda_prec = cuda_prec; inv_param.clover_cuda_prec_sloppy = cuda_prec_sloppy; inv_param.clover_cuda_prec_precondition = cuda_prec_precondition; inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER; inv_param.clover_coeff = 1.5*inv_param.kappa; } inv_param.verbosity = QUDA_VERBOSE; // *** Everything between here and the call to initQuda() is // *** application-specific. // set parameters for the reference Dslash, and prepare fields to be loaded if (dslash_type == QUDA_DOMAIN_WALL_DSLASH || dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH || dslash_type == QUDA_MOBIUS_DWF_DSLASH) { dw_setDims(gauge_param.X, inv_param.Ls); } else { setDims(gauge_param.X); } setSpinorSiteSize(24); size_t gSize = (gauge_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float); size_t sSize = (inv_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float); void *gauge[4], *clover_inv=0, *clover=0; for (int dir = 0; dir < 4; dir++) { gauge[dir] = malloc(V*gaugeSiteSize*gSize); } if (strcmp(latfile,"")) { // load in the command line supplied gauge field read_gauge_field(latfile, gauge, gauge_param.cpu_prec, gauge_param.X, argc, argv); construct_gauge_field(gauge, 2, gauge_param.cpu_prec, &gauge_param); } else { // else generate a random SU(3) field construct_gauge_field(gauge, 1, gauge_param.cpu_prec, &gauge_param); } if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) { double norm = 0.0; // clover components are random numbers in the range (-norm, norm) double diag = 1.0; // constant added to the diagonal size_t cSize = (inv_param.clover_cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float); clover_inv = malloc(V*cloverSiteSize*cSize); construct_clover_field(clover_inv, norm, diag, inv_param.clover_cpu_prec); // The uninverted clover term is only needed when solving the unpreconditioned // system or when using "asymmetric" even/odd preconditioning. int preconditioned = (inv_param.solve_type == QUDA_DIRECT_PC_SOLVE || inv_param.solve_type == QUDA_NORMOP_PC_SOLVE); int asymmetric = preconditioned && (inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC || inv_param.matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC); if (!preconditioned) { clover = clover_inv; clover_inv = NULL; } else if (asymmetric) { // fake it by using the same random matrix clover = clover_inv; // for both clover and clover_inv } else { clover = NULL; } } void *spinorIn = malloc(V*spinorSiteSize*sSize*inv_param.Ls); void *spinorCheck = malloc(V*spinorSiteSize*sSize*inv_param.Ls); void *spinorOut = NULL, **spinorOutMulti = NULL; if (multishift) { spinorOutMulti = (void**)malloc(inv_param.num_offset*sizeof(void *)); for (int i=0; i<inv_param.num_offset; i++) { spinorOutMulti[i] = malloc(V*spinorSiteSize*sSize*inv_param.Ls); } } else { spinorOut = malloc(V*spinorSiteSize*sSize*inv_param.Ls); } memset(spinorIn, 0, inv_param.Ls*V*spinorSiteSize*sSize); memset(spinorCheck, 0, inv_param.Ls*V*spinorSiteSize*sSize); if (multishift) { for (int i=0; i<inv_param.num_offset; i++) memset(spinorOutMulti[i], 0, inv_param.Ls*V*spinorSiteSize*sSize); } else { memset(spinorOut, 0, inv_param.Ls*V*spinorSiteSize*sSize); } // create a point source at 0 (in each subvolume... FIXME) // create a point source at 0 (in each subvolume... FIXME) if (inv_param.cpu_prec == QUDA_SINGLE_PRECISION) { //((float*)spinorIn)[0] = 1.0; for (int i=0; i<inv_param.Ls*V*spinorSiteSize; i++) ((float*)spinorIn)[i] = rand() / (float)RAND_MAX; } else { //((double*)spinorIn)[0] = 1.0; for (int i=0; i<inv_param.Ls*V*spinorSiteSize; i++) ((double*)spinorIn)[i] = rand() / (double)RAND_MAX; } // start the timer double time0 = -((double)clock()); // initialize the QUDA library initQuda(device); // load the gauge field loadGaugeQuda((void*)gauge, &gauge_param); // load the clover term, if desired if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) loadCloverQuda(clover, clover_inv, &inv_param); if (dslash_type == QUDA_TWISTED_CLOVER_DSLASH) loadCloverQuda(NULL, NULL, &inv_param); // perform the inversion if (multishift) { invertMultiShiftQuda(spinorOutMulti, spinorIn, &inv_param); } else { invertQuda(spinorOut, spinorIn, &inv_param); } // stop the timer time0 += clock(); time0 /= CLOCKS_PER_SEC; printfQuda("Device memory used:\n Spinor: %f GiB\n Gauge: %f GiB\n", inv_param.spinorGiB, gauge_param.gaugeGiB); if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { printfQuda(" Clover: %f GiB\n", inv_param.cloverGiB); } printfQuda("\nDone: %i iter / %g secs = %g Gflops, total time = %g secs\n", inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs, time0); if (multishift) { if (inv_param.mass_normalization == QUDA_MASS_NORMALIZATION) { errorQuda("Mass normalization not supported for multi-shift solver in invert_test"); } void *spinorTmp = malloc(V*spinorSiteSize*sSize*inv_param.Ls); printfQuda("Host residuum checks: \n"); for(int i=0; i < inv_param.num_offset; i++) { ax(0, spinorCheck, V*spinorSiteSize, inv_param.cpu_prec); if (dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { if (inv_param.twist_flavor != QUDA_TWIST_MINUS && inv_param.twist_flavor != QUDA_TWIST_PLUS) errorQuda("Twisted mass solution type not supported"); tm_matpc(spinorTmp, gauge, spinorOutMulti[i], inv_param.kappa, inv_param.mu, inv_param.twist_flavor, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param); tm_matpc(spinorCheck, gauge, spinorTmp, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, inv_param.matpc_type, 1, inv_param.cpu_prec, gauge_param); } else if (dslash_type == QUDA_WILSON_DSLASH || dslash_type == QUDA_CLOVER_WILSON_DSLASH) { wil_matpc(spinorTmp, gauge, spinorOutMulti[i], inv_param.kappa, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param); wil_matpc(spinorCheck, gauge, spinorTmp, inv_param.kappa, inv_param.matpc_type, 1, inv_param.cpu_prec, gauge_param); } else { printfQuda("Domain wall not supported for multi-shift\n"); exit(-1); } axpy(inv_param.offset[i], spinorOutMulti[i], spinorCheck, Vh*spinorSiteSize, inv_param.cpu_prec); mxpy(spinorIn, spinorCheck, Vh*spinorSiteSize, inv_param.cpu_prec); double nrm2 = norm_2(spinorCheck, Vh*spinorSiteSize, inv_param.cpu_prec); double src2 = norm_2(spinorIn, Vh*spinorSiteSize, inv_param.cpu_prec); double l2r = sqrt(nrm2 / src2); printfQuda("Shift %d residuals: (L2 relative) tol %g, QUDA = %g, host = %g; (heavy-quark) tol %g, QUDA = %g\n", i, inv_param.tol_offset[i], inv_param.true_res_offset[i], l2r, inv_param.tol_hq_offset[i], inv_param.true_res_hq_offset[i]); } free(spinorTmp); } else { if (inv_param.solution_type == QUDA_MAT_SOLUTION) { if (dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { if(inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS) tm_mat(spinorCheck, gauge, spinorOut, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, 0, inv_param.cpu_prec, gauge_param); else { int tm_offset = V*spinorSiteSize; //12*spinorRef->Volume(); void *evenOut = spinorCheck; void *oddOut = cpu_prec == sizeof(double) ? (void*)((double*)evenOut + tm_offset): (void*)((float*)evenOut + tm_offset); void *evenIn = spinorOut; void *oddIn = cpu_prec == sizeof(double) ? (void*)((double*)evenIn + tm_offset): (void*)((float*)evenIn + tm_offset); tm_ndeg_mat(evenOut, oddOut, gauge, evenIn, oddIn, inv_param.kappa, inv_param.mu, inv_param.epsilon, 0, inv_param.cpu_prec, gauge_param); } } else if (dslash_type == QUDA_WILSON_DSLASH || dslash_type == QUDA_CLOVER_WILSON_DSLASH) { wil_mat(spinorCheck, gauge, spinorOut, inv_param.kappa, 0, inv_param.cpu_prec, gauge_param); } else if (dslash_type == QUDA_DOMAIN_WALL_DSLASH) { dw_mat(spinorCheck, gauge, spinorOut, kappa5, inv_param.dagger, inv_param.cpu_prec, gauge_param, inv_param.mass); // } else if (dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH) { // dw_4d_mat(spinorCheck, gauge, spinorOut, kappa5, inv_param.dagger, inv_param.cpu_prec, gauge_param, inv_param.mass); // } else if (dslash_type == QUDA_MOBIUS_DWF_DSLASH) { // mdw_mat(spinorCheck, gauge, spinorOut, kappa5, inv_param.dagger, inv_param.cpu_prec, gauge_param, inv_param.mass); } else { printfQuda("Unsupported dslash_type\n"); exit(-1); } if (inv_param.mass_normalization == QUDA_MASS_NORMALIZATION) { if (dslash_type == QUDA_DOMAIN_WALL_DSLASH || dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH || dslash_type == QUDA_MOBIUS_DWF_DSLASH) { ax(0.5/kappa5, spinorCheck, V*spinorSiteSize*inv_param.Ls, inv_param.cpu_prec); } else if (dslash_type == QUDA_TWISTED_MASS_DSLASH && twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) { ax(0.5/inv_param.kappa, spinorCheck, 2*V*spinorSiteSize, inv_param.cpu_prec); } else { ax(0.5/inv_param.kappa, spinorCheck, V*spinorSiteSize, inv_param.cpu_prec); } } } else if(inv_param.solution_type == QUDA_MATPC_SOLUTION) { if (dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { if (inv_param.twist_flavor != QUDA_TWIST_MINUS && inv_param.twist_flavor != QUDA_TWIST_PLUS) errorQuda("Twisted mass solution type not supported"); tm_matpc(spinorCheck, gauge, spinorOut, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param); } else if (dslash_type == QUDA_WILSON_DSLASH || dslash_type == QUDA_CLOVER_WILSON_DSLASH) { wil_matpc(spinorCheck, gauge, spinorOut, inv_param.kappa, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param); } else if (dslash_type == QUDA_DOMAIN_WALL_DSLASH) { dw_matpc(spinorCheck, gauge, spinorOut, kappa5, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param, inv_param.mass); } else if (dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH) { dw_4d_matpc(spinorCheck, gauge, spinorOut, kappa5, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param, inv_param.mass); } else if (dslash_type == QUDA_MOBIUS_DWF_DSLASH) { double *kappa_b, *kappa_c; kappa_b = (double*)malloc(Lsdim*sizeof(double)); kappa_c = (double*)malloc(Lsdim*sizeof(double)); for(int xs = 0 ; xs < Lsdim ; xs++) { kappa_b[xs] = 1.0/(2*(inv_param.b_5[xs]*(4.0 + inv_param.m5) + 1.0)); kappa_c[xs] = 1.0/(2*(inv_param.c_5[xs]*(4.0 + inv_param.m5) - 1.0)); } mdw_matpc(spinorCheck, gauge, spinorOut, kappa_b, kappa_c, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param, inv_param.mass, inv_param.b_5, inv_param.c_5); free(kappa_b); free(kappa_c); } else { printfQuda("Unsupported dslash_type\n"); exit(-1); } if (inv_param.mass_normalization == QUDA_MASS_NORMALIZATION) { if (dslash_type == QUDA_DOMAIN_WALL_DSLASH || dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH || dslash_type == QUDA_MOBIUS_DWF_DSLASH) { ax(0.25/(kappa5*kappa5), spinorCheck, V*spinorSiteSize*inv_param.Ls, inv_param.cpu_prec); } else { ax(0.25/(inv_param.kappa*inv_param.kappa), spinorCheck, Vh*spinorSiteSize, inv_param.cpu_prec); } } } else if (inv_param.solution_type == QUDA_MATPCDAG_MATPC_SOLUTION) { void *spinorTmp = malloc(V*spinorSiteSize*sSize*inv_param.Ls); ax(0, spinorCheck, V*spinorSiteSize, inv_param.cpu_prec); if (dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { if (inv_param.twist_flavor != QUDA_TWIST_MINUS && inv_param.twist_flavor != QUDA_TWIST_PLUS) errorQuda("Twisted mass solution type not supported"); tm_matpc(spinorTmp, gauge, spinorOut, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param); tm_matpc(spinorCheck, gauge, spinorTmp, inv_param.kappa, inv_param.mu, inv_param.twist_flavor, inv_param.matpc_type, 1, inv_param.cpu_prec, gauge_param); } else if (dslash_type == QUDA_WILSON_DSLASH || dslash_type == QUDA_CLOVER_WILSON_DSLASH) { wil_matpc(spinorTmp, gauge, spinorOut, inv_param.kappa, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param); wil_matpc(spinorCheck, gauge, spinorTmp, inv_param.kappa, inv_param.matpc_type, 1, inv_param.cpu_prec, gauge_param); } else { printfQuda("Unsupported dslash_type\n"); exit(-1); } if (inv_param.mass_normalization == QUDA_MASS_NORMALIZATION) { errorQuda("Mass normalization not implemented"); } free(spinorTmp); } int vol = inv_param.solution_type == QUDA_MAT_SOLUTION ? V : Vh; mxpy(spinorIn, spinorCheck, vol*spinorSiteSize*inv_param.Ls, inv_param.cpu_prec); double nrm2 = norm_2(spinorCheck, vol*spinorSiteSize*inv_param.Ls, inv_param.cpu_prec); double src2 = norm_2(spinorIn, vol*spinorSiteSize*inv_param.Ls, inv_param.cpu_prec); double l2r = sqrt(nrm2 / src2); printfQuda("Residuals: (L2 relative) tol %g, QUDA = %g, host = %g; (heavy-quark) tol %g, QUDA = %g\n", inv_param.tol, inv_param.true_res, l2r, inv_param.tol_hq, inv_param.true_res_hq); } freeGaugeQuda(); if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) freeCloverQuda(); // finalize the QUDA library endQuda(); finalizeComms(); return 0; }
void init() { initQuda(device); gaugeParam = newQudaGaugeParam(); inv_param = newQudaInvertParam(); gaugeParam.X[0] = X[0] = xdim; gaugeParam.X[1] = X[1] = ydim; gaugeParam.X[2] = X[2] = zdim; gaugeParam.X[3] = X[3] = tdim; setDims(gaugeParam.X); setSpinorSiteSize(6); gaugeParam.cpu_prec = QUDA_DOUBLE_PRECISION; gaugeParam.cuda_prec = prec; gaugeParam.reconstruct = link_recon; gaugeParam.reconstruct_sloppy = gaugeParam.reconstruct; gaugeParam.cuda_prec_sloppy = gaugeParam.cuda_prec; gaugeParam.anisotropy = 1.0; gaugeParam.tadpole_coeff = 0.8; gaugeParam.gauge_order = QUDA_QDP_GAUGE_ORDER; gaugeParam.t_boundary = QUDA_ANTI_PERIODIC_T; gaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO; gaugeParam.gaugeGiB = 0; inv_param.cpu_prec = QUDA_DOUBLE_PRECISION; inv_param.cuda_prec = prec; inv_param.dirac_order = QUDA_DIRAC_ORDER; inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; inv_param.dagger = dagger; inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; inv_param.dslash_type = QUDA_ASQTAD_DSLASH; inv_param.input_location = QUDA_CPU_FIELD_LOCATION; inv_param.output_location = QUDA_CPU_FIELD_LOCATION; int tmpint = MAX(X[1]*X[2]*X[3], X[0]*X[2]*X[3]); tmpint = MAX(tmpint, X[0]*X[1]*X[3]); tmpint = MAX(tmpint, X[0]*X[1]*X[2]); gaugeParam.ga_pad = tmpint; inv_param.sp_pad = tmpint; ColorSpinorParam csParam; csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION; csParam.nColor=3; csParam.nSpin=1; csParam.nDim=4; for(int d = 0; d < 4; d++) { csParam.x[d] = gaugeParam.X[d]; } csParam.precision = inv_param.cpu_prec; csParam.pad = 0; if (test_type < 2) { inv_param.solution_type = QUDA_MATPC_SOLUTION; csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; csParam.x[0] /= 2; } else { inv_param.solution_type = QUDA_MAT_SOLUTION; csParam.siteSubset = QUDA_FULL_SITE_SUBSET; } csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; csParam.gammaBasis = inv_param.gamma_basis; // this parameter is meaningless for staggered csParam.create = QUDA_ZERO_FIELD_CREATE; spinor = new cpuColorSpinorField(csParam); spinorOut = new cpuColorSpinorField(csParam); spinorRef = new cpuColorSpinorField(csParam); csParam.siteSubset = QUDA_FULL_SITE_SUBSET; csParam.x[0] = gaugeParam.X[0]; printfQuda("Randomizing fields ...\n"); spinor->Source(QUDA_RANDOM_SOURCE); size_t gSize = (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float); for (int dir = 0; dir < 4; dir++) { fatlink[dir] = malloc(V*gaugeSiteSize*gSize); longlink[dir] = malloc(V*gaugeSiteSize*gSize); } if (fatlink == NULL || longlink == NULL){ errorQuda("ERROR: malloc failed for fatlink/longlink"); } construct_fat_long_gauge_field(fatlink, longlink, 1, gaugeParam.cpu_prec, &gaugeParam); #ifdef MULTI_GPU gaugeParam.type = QUDA_ASQTAD_FAT_LINKS; gaugeParam.reconstruct = QUDA_RECONSTRUCT_NO; GaugeFieldParam cpuFatParam(fatlink, gaugeParam); cpuFat = new cpuGaugeField(cpuFatParam); cpuFat->exchangeGhost(); ghost_fatlink = cpuFat->Ghost(); gaugeParam.type = QUDA_ASQTAD_LONG_LINKS; GaugeFieldParam cpuLongParam(longlink, gaugeParam); cpuLong = new cpuGaugeField(cpuLongParam); cpuLong->exchangeGhost(); ghost_longlink = cpuLong->Ghost(); int x_face_size = X[1]*X[2]*X[3]/2; int y_face_size = X[0]*X[2]*X[3]/2; int z_face_size = X[0]*X[1]*X[3]/2; int t_face_size = X[0]*X[1]*X[2]/2; int pad_size =MAX(x_face_size, y_face_size); pad_size = MAX(pad_size, z_face_size); pad_size = MAX(pad_size, t_face_size); gaugeParam.ga_pad = pad_size; #endif gaugeParam.type = QUDA_ASQTAD_FAT_LINKS; gaugeParam.reconstruct = gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; printfQuda("Fat links sending..."); loadGaugeQuda(fatlink, &gaugeParam); printfQuda("Fat links sent\n"); gaugeParam.type = QUDA_ASQTAD_LONG_LINKS; #ifdef MULTI_GPU gaugeParam.ga_pad = 3*pad_size; #endif gaugeParam.reconstruct = gaugeParam.reconstruct_sloppy = link_recon; printfQuda("Long links sending..."); loadGaugeQuda(longlink, &gaugeParam); printfQuda("Long links sent...\n"); printfQuda("Sending fields to GPU..."); if (!transfer) { //csParam.verbose = QUDA_DEBUG_VERBOSE; csParam.fieldLocation = QUDA_CUDA_FIELD_LOCATION; csParam.fieldOrder = QUDA_FLOAT2_FIELD_ORDER; csParam.pad = inv_param.sp_pad; csParam.precision = inv_param.cuda_prec; if (test_type < 2){ csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; csParam.x[0] /=2; } printfQuda("Creating cudaSpinor\n"); cudaSpinor = new cudaColorSpinorField(csParam); printfQuda("Creating cudaSpinorOut\n"); cudaSpinorOut = new cudaColorSpinorField(csParam); printfQuda("Sending spinor field to GPU\n"); *cudaSpinor = *spinor; cudaDeviceSynchronize(); checkCudaError(); double spinor_norm2 = norm2(*spinor); double cuda_spinor_norm2= norm2(*cudaSpinor); printfQuda("Source CPU = %f, CUDA=%f\n", spinor_norm2, cuda_spinor_norm2); if(test_type == 2){ csParam.x[0] /=2; } csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; tmp = new cudaColorSpinorField(csParam); bool pc = (test_type != 2); DiracParam diracParam; setDiracParam(diracParam, &inv_param, pc); diracParam.verbose = QUDA_VERBOSE; diracParam.tmp1=tmp; dirac = Dirac::create(diracParam); } else { errorQuda("Error not suppported"); } return; }
int invert_test(void) { QudaGaugeParam gaugeParam = newQudaGaugeParam(); QudaInvertParam inv_param = newQudaInvertParam(); double mass = 0.1; set_params(&gaugeParam, &inv_param, xdim, ydim, zdim, tdim, cpu_prec, prec, prec_sloppy, link_recon, link_recon_sloppy, mass, tol, 500, 1e-3, 0.8); // this must be before the FaceBuffer is created (this is because it allocates pinned memory - FIXME) initQuda(device); setDims(gaugeParam.X); setSpinorSiteSize(6); size_t gSize = (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float); for (int dir = 0; dir < 4; dir++) { fatlink[dir] = malloc(V*gaugeSiteSize*gSize); longlink[dir] = malloc(V*gaugeSiteSize*gSize); } construct_fat_long_gauge_field(fatlink, longlink, 1, gaugeParam.cpu_prec, &gaugeParam); for (int dir = 0; dir < 4; dir++) { for(int i = 0; i < V*gaugeSiteSize; i++) { if (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION) { ((double*)fatlink[dir])[i] = 0.5 *rand()/RAND_MAX; } else { ((float*)fatlink[dir])[i] = 0.5* rand()/RAND_MAX; } } } ColorSpinorParam csParam; csParam.nColor=3; csParam.nSpin=1; csParam.nDim=4; for(int d = 0; d < 4; d++) { csParam.x[d] = gaugeParam.X[d]; } csParam.x[0] /= 2; csParam.precision = inv_param.cpu_prec; csParam.pad = 0; csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; csParam.gammaBasis = inv_param.gamma_basis; csParam.create = QUDA_ZERO_FIELD_CREATE; in = new cpuColorSpinorField(csParam); out = new cpuColorSpinorField(csParam); ref = new cpuColorSpinorField(csParam); tmp = new cpuColorSpinorField(csParam); if (inv_param.cpu_prec == QUDA_SINGLE_PRECISION) { constructSpinorField((float*)in->V()); } else { constructSpinorField((double*)in->V()); } int tmp_value = MAX(ydim*zdim*tdim/2, xdim*zdim*tdim/2); tmp_value = MAX(tmp_value, xdim*ydim*tdim/2); tmp_value = MAX(tmp_value, xdim*ydim*zdim/2); int fat_pad = tmp_value; int link_pad = 3*tmp_value; #ifdef MULTI_GPU gaugeParam.type = QUDA_ASQTAD_FAT_LINKS; gaugeParam.reconstruct = QUDA_RECONSTRUCT_NO; GaugeFieldParam cpuFatParam(fatlink, gaugeParam); cpuFat = new cpuGaugeField(cpuFatParam); cpuFat->exchangeGhost(); ghost_fatlink = (void**)cpuFat->Ghost(); gaugeParam.type = QUDA_ASQTAD_LONG_LINKS; GaugeFieldParam cpuLongParam(longlink, gaugeParam); cpuLong = new cpuGaugeField(cpuLongParam); cpuLong->exchangeGhost(); ghost_longlink = (void**)cpuLong->Ghost(); #endif if(test_type == 6) { record_gauge(gaugeParam.X, fatlink, fat_pad, longlink, link_pad, link_recon, link_recon_sloppy, &gaugeParam); } else { #ifdef MULTI_GPU gaugeParam.type = QUDA_ASQTAD_FAT_LINKS; gaugeParam.ga_pad = fat_pad; gaugeParam.reconstruct= gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; loadGaugeQuda(fatlink, &gaugeParam); gaugeParam.type = QUDA_ASQTAD_LONG_LINKS; gaugeParam.ga_pad = link_pad; gaugeParam.reconstruct= link_recon; gaugeParam.reconstruct_sloppy = link_recon_sloppy; loadGaugeQuda(longlink, &gaugeParam); #else gaugeParam.type = QUDA_ASQTAD_FAT_LINKS; gaugeParam.reconstruct = gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; loadGaugeQuda(fatlink, &gaugeParam); gaugeParam.type = QUDA_ASQTAD_LONG_LINKS; gaugeParam.reconstruct = link_recon; gaugeParam.reconstruct_sloppy = link_recon_sloppy; loadGaugeQuda(longlink, &gaugeParam); #endif } double time0 = -((double)clock()); // Start the timer double nrm2=0; double src2=0; int ret = 0; switch(test_type) { case 0: //even inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; invertQuda(out->V(), in->V(), &inv_param); time0 += clock(); time0 /= CLOCKS_PER_SEC; #ifdef MULTI_GPU matdagmat_mg4dir(ref, fatlink, longlink, ghost_fatlink, ghost_longlink, out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, QUDA_EVEN_PARITY); #else matdagmat(ref->V(), fatlink, longlink, out->V(), mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), QUDA_EVEN_PARITY); #endif mxpy(in->V(), ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); nrm2 = norm_2(ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); src2 = norm_2(in->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); break; case 1: //odd inv_param.matpc_type = QUDA_MATPC_ODD_ODD; invertQuda(out->V(), in->V(), &inv_param); time0 += clock(); // stop the timer time0 /= CLOCKS_PER_SEC; #ifdef MULTI_GPU matdagmat_mg4dir(ref, fatlink, longlink, ghost_fatlink, ghost_longlink, out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, QUDA_ODD_PARITY); #else matdagmat(ref->V(), fatlink, longlink, out->V(), mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), QUDA_ODD_PARITY); #endif mxpy(in->V(), ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); nrm2 = norm_2(ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); src2 = norm_2(in->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); break; case 2: //full spinor errorQuda("full spinor not supported\n"); break; case 3: //multi mass CG, even case 4: case 5: case 6: #define NUM_OFFSETS 12 double masses[NUM_OFFSETS] = {5.05, 1.23, 2.64, 2.33, 2.70, 2.77, 2.81, 3.0, 3.1, 3.2, 3.3, 3.4}; double offsets[NUM_OFFSETS]; int num_offsets =NUM_OFFSETS; void* outArray[NUM_OFFSETS]; int len; cpuColorSpinorField* spinorOutArray[NUM_OFFSETS]; spinorOutArray[0] = out; for(int i=1; i < num_offsets; i++) { spinorOutArray[i] = new cpuColorSpinorField(csParam); } for(int i=0; i < num_offsets; i++) { outArray[i] = spinorOutArray[i]->V(); } for (int i=0; i< num_offsets; i++) { offsets[i] = 4*masses[i]*masses[i]; } len=Vh; if (test_type == 3 || test_type == 6) { inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; } else if (test_type == 4) { inv_param.matpc_type = QUDA_MATPC_ODD_ODD; } else { //test_type ==5 errorQuda("test 5 not supported\n"); } double residue_sq; if (test_type == 6) { invertMultiShiftQudaMixed(outArray, in->V(), &inv_param, offsets, num_offsets, &residue_sq); } else { invertMultiShiftQuda(outArray, in->V(), &inv_param, offsets, num_offsets, &residue_sq); } cudaDeviceSynchronize(); printfQuda("Final residue squred =%g\n", residue_sq); time0 += clock(); // stop the timer time0 /= CLOCKS_PER_SEC; printfQuda("done: total time = %g secs, %i iter / %g secs = %g gflops, \n", time0, inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs); printfQuda("checking the solution\n"); QudaParity parity = QUDA_INVALID_PARITY; if (inv_param.solve_type == QUDA_NORMEQ_SOLVE) { //parity = QUDA_EVENODD_PARITY; errorQuda("full parity not supported\n"); } else if (inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN) { parity = QUDA_EVEN_PARITY; } else if (inv_param.matpc_type == QUDA_MATPC_ODD_ODD) { parity = QUDA_ODD_PARITY; } else { errorQuda("ERROR: invalid spinor parity \n"); exit(1); } for(int i=0; i < num_offsets; i++) { printfQuda("%dth solution: mass=%f, ", i, masses[i]); #ifdef MULTI_GPU matdagmat_mg4dir(ref, fatlink, longlink, ghost_fatlink, ghost_longlink, spinorOutArray[i], masses[i], 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, parity); #else matdagmat(ref->V(), fatlink, longlink, outArray[i], masses[i], 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), parity); #endif mxpy(in->V(), ref->V(), len*mySpinorSiteSize, inv_param.cpu_prec); double nrm2 = norm_2(ref->V(), len*mySpinorSiteSize, inv_param.cpu_prec); double src2 = norm_2(in->V(), len*mySpinorSiteSize, inv_param.cpu_prec); printfQuda("relative residual, requested = %g, actual = %g\n", inv_param.tol, sqrt(nrm2/src2)); //emperical, if the cpu residue is more than 1 order the target accuracy, the it fails to converge if (sqrt(nrm2/src2) > 10*inv_param.tol) { ret |=1; } } if (ret ==1) { errorQuda("Converge failed!\n"); } for(int i=1; i < num_offsets; i++) { delete spinorOutArray[i]; } }//switch if (test_type <=2) { printfQuda("Relative residual, requested = %g, actual = %g\n", inv_param.tol, sqrt(nrm2/src2)); printfQuda("done: total time = %g secs, %i iter / %g secs = %g gflops, \n", time0, inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs); //emperical, if the cpu residue is more than 2 order the target accuracy, the it fails to converge if (sqrt(nrm2/src2) > 100*inv_param.tol) { ret = 1; errorQuda("Convergence failed!\n"); } } end(); return ret; }