void init() { param.cpu_prec = QUDA_DOUBLE_PRECISION; param.cuda_prec = QUDA_SINGLE_PRECISION; param.reconstruct = QUDA_RECONSTRUCT_12; param.cuda_prec_sloppy = param.cuda_prec; param.reconstruct_sloppy = param.reconstruct; param.X[0] = 8; param.X[1] = 8; param.X[2] = 8; param.X[3] = 8; setDims(param.X); param.anisotropy = 1.0; param.t_boundary = QUDA_ANTI_PERIODIC_T; param.gauge_fix = QUDA_GAUGE_FIXED_NO; #ifdef MULTI_GPU param.ga_pad = param.X[0]*param.X[1]*param.X[2]/2; #endif // construct gauge fields for (int dir = 0; dir < 4; dir++) { gauge[dir] = malloc(V*gaugeSiteSize*param.cpu_prec); new_gauge[dir] = malloc(V*gaugeSiteSize*param.cpu_prec); } int dev = 0; initQuda(dev); }
void init() { param.cpu_prec = QUDA_SINGLE_PRECISION; param.cuda_prec = QUDA_HALF_PRECISION; param.reconstruct = QUDA_RECONSTRUCT_8; param.cuda_prec_sloppy = param.cuda_prec; param.reconstruct_sloppy = param.reconstruct; param.X[0] = 4; param.X[1] = 4; param.X[2] = 4; param.X[3] = 4; param.ga_pad = 0; setDims(param.X); param.anisotropy = 2.3; param.t_boundary = QUDA_ANTI_PERIODIC_T; param.gauge_fix = QUDA_GAUGE_FIXED_NO; // construct input fields for (int dir = 0; dir < 4; dir++) { qdpGauge[dir] = malloc(V*gaugeSiteSize*param.cpu_prec); } cpsGauge = malloc(4*V*gaugeSiteSize*param.cpu_prec); csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION; csParam.nColor = 3; csParam.nSpin = 4; csParam.nDim = 4; for (int d=0; d<4; d++) csParam.x[d] = param.X[d]; csParam.precision = QUDA_SINGLE_PRECISION; csParam.pad = 0; csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; csParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; csParam.create = QUDA_NULL_FIELD_CREATE; spinor = new cpuColorSpinorField(csParam); spinor2 = new cpuColorSpinorField(csParam); spinor->Source(QUDA_RANDOM_SOURCE); initQuda(0); csParam.fieldLocation = QUDA_CUDA_FIELD_LOCATION; csParam.fieldOrder = QUDA_FLOAT4_FIELD_ORDER; csParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS; csParam.pad = 0; csParam.precision = QUDA_HALF_PRECISION; cudaSpinor = new cudaColorSpinorField(csParam); }
void init() { param = newQudaGaugeParam(); param.cpu_prec = QUDA_DOUBLE_PRECISION; param.cuda_prec = prec; param.reconstruct = link_recon; param.cuda_prec_sloppy = prec; param.reconstruct_sloppy = link_recon; param.type = QUDA_WILSON_LINKS; param.tadpole_coeff = 0.8; param.gauge_order = QUDA_QDP_GAUGE_ORDER; param.X[0] = xdim; param.X[1] = ydim; param.X[2] = zdim; param.X[3] = tdim; setDims(param.X); param.anisotropy = 1.0; param.t_boundary = QUDA_PERIODIC_T; param.gauge_fix = QUDA_GAUGE_FIXED_NO; #ifdef MULTI_GPU int x_face_size = param.X[1]*param.X[2]*param.X[3]/2; int y_face_size = param.X[0]*param.X[2]*param.X[3]/2; int z_face_size = param.X[0]*param.X[1]*param.X[3]/2; int t_face_size = param.X[0]*param.X[1]*param.X[2]/2; int pad_size = MAX(x_face_size, y_face_size); pad_size = MAX(pad_size, z_face_size); pad_size = MAX(pad_size, t_face_size); param.ga_pad = pad_size; #else param.ga_pad = 0; #endif // construct gauge fields for (int dir = 0; dir < 4; dir++) { gauge[dir] = malloc(V*gaugeSiteSize*param.cpu_prec); new_gauge[dir] = malloc(V*gaugeSiteSize*param.cpu_prec); } initQuda(device); }
// Staggered (asqtad) inverter test driver.
//
// Builds a random fat/long link pair, uploads it to the GPU, runs the solver
// selected by the global `testtype` (0: even-even CG, 1: odd-odd CG,
// 2: full spinor [rejected], 3-6: multi-shift CG variants), then applies the
// CPU reference MdagM to the solution and compares residuals against the
// requested tolerance.  Returns 0 on success, non-zero when the CPU-measured
// residual is more than two orders of magnitude above the target.
int invert_test(void) {
  QudaGaugeParam gaugeParam = newQudaGaugeParam();
  QudaInvertParam inv_param = newQudaInvertParam();

  double mass = 0.1;
  set_params(&gaugeParam, &inv_param, xdim, ydim, zdim, tdim,
             cpu_prec, prec, prec_sloppy, link_recon, link_recon_sloppy,
             mass, tol, 500, 1e-3, 0.8);

  // this must be before the FaceBuffer is created (this is because it allocates pinned memory - FIXME)
  initQuda(device);

  setDims(gaugeParam.X);
  setDimConstants(gaugeParam.X);

  size_t gSize = (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);

  // Host buffers for the fat and long links, one per direction.
  for (int dir = 0; dir < 4; dir++) {
    fatlink[dir] = malloc(V*gaugeSiteSize*gSize);
    longlink[dir] = malloc(V*gaugeSiteSize*gSize);
  }
  construct_fat_long_gauge_field(fatlink, longlink, 1, gaugeParam.cpu_prec, &gaugeParam);

  // Overwrite the fat links with uniform random entries in [0, 0.5].
  for (int dir = 0; dir < 4; dir++) {
    for(int i = 0;i < V*gaugeSiteSize;i++){
      if (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION){
        ((double*)fatlink[dir])[i] = 0.5 *rand()/RAND_MAX;
      }else{
        ((float*)fatlink[dir])[i] = 0.5* rand()/RAND_MAX;
      }
    }
  }

  // Single-parity staggered spinors (nSpin = 1), even-odd site ordering.
  ColorSpinorParam csParam;
  csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION;
  csParam.nColor=3;
  csParam.nSpin=1;
  csParam.nDim=4;
  for(int d = 0; d < 4; d++) {
    csParam.x[d] = gaugeParam.X[d];
  }
  csParam.x[0] /= 2;  // checkerboard: halve the x extent
  csParam.precision = inv_param.cpu_prec;
  csParam.pad = 0;
  csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
  csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
  csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
  csParam.gammaBasis = inv_param.gamma_basis;
  csParam.create = QUDA_ZERO_FIELD_CREATE;

  in = new cpuColorSpinorField(csParam);
  out = new cpuColorSpinorField(csParam);
  ref = new cpuColorSpinorField(csParam);
  tmp = new cpuColorSpinorField(csParam);

  // Fill the source spinor with host-generated random data.
  if (inv_param.cpu_prec == QUDA_SINGLE_PRECISION){
    constructSpinorField((float*)in->V());
  }else{
    constructSpinorField((double*)in->V());
  }

  // Pad by the largest checkerboarded face; long links carry 3x the pad
  // (three-hop term).
  int tmp_value = MAX(ydim*zdim*tdim/2, xdim*zdim*tdim/2);
  tmp_value = MAX(tmp_value, xdim*ydim*tdim/2);
  tmp_value = MAX(tmp_value, xdim*ydim*zdim/2);
  int fat_pad = tmp_value;
  int link_pad = 3*tmp_value;

#ifdef MULTI_GPU
  // Wrap the host links in cpuGaugeField objects so ghost zones can be
  // exchanged with neighboring ranks for the CPU reference computation.
  gaugeParam.type = QUDA_ASQTAD_FAT_LINKS;
  gaugeParam.reconstruct = QUDA_RECONSTRUCT_NO;
  GaugeFieldParam cpuFatParam(fatlink, gaugeParam);
  cpuFat = new cpuGaugeField(cpuFatParam);
  cpuFat->exchangeGhost();
  ghost_fatlink = (void**)cpuFat->Ghost();

  gaugeParam.type = QUDA_ASQTAD_LONG_LINKS;
  GaugeFieldParam cpuLongParam(longlink, gaugeParam);
  cpuLong = new cpuGaugeField(cpuLongParam);
  cpuLong->exchangeGhost();
  ghost_longlink = (void**)cpuLong->Ghost();
#endif

  if(testtype == 6){
    // Test 6 defers the upload: the gauge data is recorded for later loading.
    record_gauge(gaugeParam.X, fatlink, fat_pad,
                 longlink, link_pad,
                 link_recon, link_recon_sloppy,
                 &gaugeParam);
  }else{
#ifdef MULTI_GPU
    // Fat links are never reconstructed; long links use the requested scheme.
    gaugeParam.type = QUDA_ASQTAD_FAT_LINKS;
    gaugeParam.ga_pad = fat_pad;
    gaugeParam.reconstruct= gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO;
    loadGaugeQuda(fatlink, &gaugeParam);

    gaugeParam.type = QUDA_ASQTAD_LONG_LINKS;
    gaugeParam.ga_pad = link_pad;
    gaugeParam.reconstruct= link_recon;
    gaugeParam.reconstruct_sloppy = link_recon_sloppy;
    loadGaugeQuda(longlink, &gaugeParam);
#else
    gaugeParam.type = QUDA_ASQTAD_FAT_LINKS;
    gaugeParam.reconstruct = gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO;
    loadGaugeQuda(fatlink, &gaugeParam);

    gaugeParam.type = QUDA_ASQTAD_LONG_LINKS;
    gaugeParam.reconstruct = link_recon;
    gaugeParam.reconstruct_sloppy = link_recon_sloppy;
    loadGaugeQuda(longlink, &gaugeParam);
#endif
  }

  double time0 = -((double)clock()); // Start the timer

  unsigned long volume = Vh;
  unsigned long nflops=2*1187; //from MILC's CG routine
  double nrm2=0;
  double src2=0;
  int ret = 0;

  switch(testtype){
  case 0: //even
    volume = Vh;
    inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
    invertQuda(out->V(), in->V(), &inv_param);

    time0 += clock();
    time0 /= CLOCKS_PER_SEC;

    // Apply the reference MdagM to the solution, then subtract the source
    // to form the residual vector in `ref`.
#ifdef MULTI_GPU
    matdagmat_mg4dir(ref, fatlink, longlink, ghost_fatlink, ghost_longlink,
                     out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec,
                     tmp, QUDA_EVEN_PARITY);
#else
    matdagmat(ref->V(), fatlink, longlink, out->V(), mass, 0,
              inv_param.cpu_prec, gaugeParam.cpu_prec,
              tmp->V(), QUDA_EVEN_PARITY);
#endif

    mxpy(in->V(), ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
    nrm2 = norm_2(ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
    src2 = norm_2(in->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
    // NOTE(review): `sol` and `refe` are computed but never used — dead code.
    {
      double sol = norm_2(out->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
      double refe = norm_2(ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
    }
    break;

  case 1: //odd
    volume = Vh;
    inv_param.matpc_type = QUDA_MATPC_ODD_ODD;
    invertQuda(out->V(), in->V(), &inv_param);
    time0 += clock(); // stop the timer
    time0 /= CLOCKS_PER_SEC;

#ifdef MULTI_GPU
    matdagmat_mg4dir(ref, fatlink, longlink, ghost_fatlink, ghost_longlink,
                     out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec,
                     tmp, QUDA_ODD_PARITY);
#else
    matdagmat(ref->V(), fatlink, longlink, out->V(), mass, 0,
              inv_param.cpu_prec, gaugeParam.cpu_prec,
              tmp->V(), QUDA_ODD_PARITY);
#endif

    mxpy(in->V(), ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
    nrm2 = norm_2(ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
    src2 = norm_2(in->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
    break;

  case 2: //full spinor
    errorQuda("full spinor not supported\n");
    break;

  case 3: //multi mass CG, even
  case 4:
  case 5:
  case 6:

#define NUM_OFFSETS 7

    nflops = 2*(1205 + 15* NUM_OFFSETS); //from MILC's multimass CG routine
    double masses[NUM_OFFSETS] ={5.05, 1.23, 2.64, 2.33, 2.70, 2.77, 2.81};
    double offsets[NUM_OFFSETS];
    int num_offsets =NUM_OFFSETS;
    void* outArray[NUM_OFFSETS];
    int len;

    // One output spinor per shift; slot 0 reuses the existing `out` field.
    cpuColorSpinorField* spinorOutArray[NUM_OFFSETS];
    spinorOutArray[0] = out;
    for(int i=1;i < num_offsets; i++){
      spinorOutArray[i] = new cpuColorSpinorField(csParam);
    }

    for(int i=0;i < num_offsets; i++){
      outArray[i] = spinorOutArray[i]->V();
    }

    // Shifts are 4*m^2 for each mass in the multi-shift solve.
    for (int i=0; i< num_offsets;i++){
      offsets[i] = 4*masses[i]*masses[i];
    }

    len=Vh;
    volume = Vh;

    if (testtype == 3 || testtype == 6){
      inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
    } else if (testtype == 4){
      inv_param.matpc_type = QUDA_MATPC_ODD_ODD;
    }else { //testtype ==5
      errorQuda("test 5 not supported\n");
    }

    double residue_sq;
    if (testtype == 6){
      // Mixed-precision multi-shift variant.
      invertMultiShiftQudaMixed(outArray, in->V(), &inv_param, offsets, num_offsets, &residue_sq);
    }else{
      invertMultiShiftQuda(outArray, in->V(), &inv_param, offsets, num_offsets, &residue_sq);
    }
    cudaThreadSynchronize();
    printfQuda("Final residue squred =%g\n", residue_sq);

    time0 += clock(); // stop the timer
    time0 /= CLOCKS_PER_SEC;

    printfQuda("done: total time = %g secs, %i iter / %g secs = %g gflops, \n",
               time0, inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs);

    printfQuda("checking the solution\n");
    QudaParity parity;
    if (inv_param.solve_type == QUDA_NORMEQ_SOLVE){
      //parity = QUDA_EVENODD_PARITY;
      errorQuda("full parity not supported\n");
    }else if (inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN){
      parity = QUDA_EVEN_PARITY;
    }else if (inv_param.matpc_type == QUDA_MATPC_ODD_ODD){
      parity = QUDA_ODD_PARITY;
    }else{
      errorQuda("ERROR: invalid spinor parity \n");
      exit(1);
    }

    // Check each shifted solution independently against the CPU reference.
    for(int i=0;i < num_offsets;i++){
      printfQuda("%dth solution: mass=%f, ", i, masses[i]);
#ifdef MULTI_GPU
      matdagmat_mg4dir(ref, fatlink, longlink, ghost_fatlink, ghost_longlink,
                       spinorOutArray[i], masses[i], 0, inv_param.cpu_prec,
                       gaugeParam.cpu_prec, tmp, parity);
#else
      matdagmat(ref->V(), fatlink, longlink, outArray[i], masses[i], 0,
                inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), parity);
#endif

      mxpy(in->V(), ref->V(), len*mySpinorSiteSize, inv_param.cpu_prec);
      double nrm2 = norm_2(ref->V(), len*mySpinorSiteSize, inv_param.cpu_prec);
      double src2 = norm_2(in->V(), len*mySpinorSiteSize, inv_param.cpu_prec);
      printfQuda("relative residual, requested = %g, actual = %g\n",
                 inv_param.tol, sqrt(nrm2/src2));

      //emperical, if the cpu residue is more than 2 order the target accuracy, the it fails to converge
      if (sqrt(nrm2/src2) > 100*inv_param.tol){
        ret |=1;
      }
    }

    if (ret ==1){
      errorQuda("Converge failed!\n");
    }

    for(int i=1; i < num_offsets;i++){
      delete spinorOutArray[i];
    }

  }//switch

  // Single-shift tests report their residual here (multi-shift reports above).
  if (testtype <=2){
    printfQuda("Relative residual, requested = %g, actual = %g\n",
               inv_param.tol, sqrt(nrm2/src2));
    printfQuda("done: total time = %g secs, %i iter / %g secs = %g gflops, \n",
               time0, inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs);

    //emperical, if the cpu residue is more than 2 order the target accuracy, the it fails to converge
    if (sqrt(nrm2/src2) > 100*inv_param.tol){
      ret = 1;
      errorQuda("Convergence failed!\n");
    }
  }

  end();
  return ret;
}
// Initialize gauge/invert parameters, host fields, and (when `transfer` is
// false) device-resident spinors and a Dirac operator for the dslash test.
// Behavior is driven by the global test configuration: lattice dimensions,
// precisions, dslash_type, test_type, dagger, latfile, and transfer.
void init(int argc, char **argv) {

  kernelPackT = false; // Set true for kernel T face packing

  cuda_prec= prec;

  gauge_param = newQudaGaugeParam();
  inv_param = newQudaInvertParam();

  gauge_param.X[0] = xdim;
  gauge_param.X[1] = ydim;
  gauge_param.X[2] = zdim;
  gauge_param.X[3] = tdim;
  setDims(gauge_param.X);

  gauge_param.anisotropy = 1.0;
  gauge_param.type = QUDA_WILSON_LINKS;
  gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER;
  gauge_param.t_boundary = QUDA_PERIODIC_T;

  gauge_param.cpu_prec = cpu_prec;
  gauge_param.cuda_prec = cuda_prec;
  gauge_param.reconstruct = link_recon;
  gauge_param.reconstruct_sloppy = link_recon;
  gauge_param.cuda_prec_sloppy = cuda_prec;
  gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;

  inv_param.kappa = 0.1;

  // Twisted-mass runs additionally need a twist parameter and flavor.
  if (dslash_type == QUDA_TWISTED_MASS_DSLASH) {
    inv_param.mu = 0.01;
    inv_param.twist_flavor = QUDA_TWIST_MINUS;
  }

  inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
  inv_param.dagger = dagger;

  inv_param.cpu_prec = cpu_prec;
  if (inv_param.cpu_prec != gauge_param.cpu_prec)
    errorQuda("Gauge and spinor cpu precisions must match");
  inv_param.cuda_prec = cuda_prec;

#ifndef MULTI_GPU // free parameter for single GPU
  gauge_param.ga_pad = 0;
#else // must be this one c/b face for multi gpu
  // Pad by the largest checkerboarded face so any direction may communicate.
  int x_face_size = gauge_param.X[1]*gauge_param.X[2]*gauge_param.X[3]/2;
  int y_face_size = gauge_param.X[0]*gauge_param.X[2]*gauge_param.X[3]/2;
  int z_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[3]/2;
  int t_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[2]/2;
  int pad_size =MAX(x_face_size, y_face_size);
  pad_size = MAX(pad_size, z_face_size);
  pad_size = MAX(pad_size, t_face_size);
  gauge_param.ga_pad = pad_size;
#endif
  inv_param.sp_pad = 0;
  inv_param.cl_pad = 0;
  //inv_param.sp_pad = 24*24*24;
  //inv_param.cl_pad = 24*24*24;

  inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // test code only supports DeGrand-Rossi Basis
  inv_param.dirac_order = QUDA_DIRAC_ORDER;

  // test_type 2 works on the full (unpreconditioned) operator.
  if (test_type == 2) {
    inv_param.solution_type = QUDA_MAT_SOLUTION;
  } else {
    inv_param.solution_type = QUDA_MATPC_SOLUTION;
  }

  inv_param.dslash_type = dslash_type;

  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
    inv_param.clover_cpu_prec = cpu_prec;
    inv_param.clover_cuda_prec = cuda_prec;
    inv_param.clover_cuda_prec_sloppy = inv_param.clover_cuda_prec;
    inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER;
    // NOTE(review): the precision enum is used as a byte size — presumably its
    // values equal sizeof(double)/sizeof(float); confirm against quda.h.
    //if (test_type > 0) {
    hostClover = malloc(V*cloverSiteSize*inv_param.clover_cpu_prec);
    hostCloverInv = hostClover; // fake it
    /*} else {
      hostClover = NULL;
      hostCloverInv = malloc(V*cloverSiteSize*inv_param.clover_cpu_prec);
      }*/
  } else if (dslash_type == QUDA_TWISTED_MASS_DSLASH) {

  }

  //inv_param.verbosity = QUDA_VERBOSE;

  // construct input fields
  for (int dir = 0; dir < 4; dir++) hostGauge[dir] = malloc(V*gaugeSiteSize*gauge_param.cpu_prec);

  // CPU spinor parameters; parity-subset fields halve the x extent.
  ColorSpinorParam csParam;
  csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION;
  csParam.nColor = 3;
  csParam.nSpin = 4;
  if (dslash_type == QUDA_TWISTED_MASS_DSLASH) {
    csParam.twistFlavor = inv_param.twist_flavor;
  }
  csParam.nDim = 4;
  for (int d=0; d<4; d++) csParam.x[d] = gauge_param.X[d];
  csParam.precision = inv_param.cpu_prec;
  csParam.pad = 0;
  if (test_type < 2) {
    csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
    csParam.x[0] /= 2;
  } else {
    csParam.siteSubset = QUDA_FULL_SITE_SUBSET;
  }

  csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
  csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
  csParam.gammaBasis = inv_param.gamma_basis;
  csParam.create = QUDA_ZERO_FIELD_CREATE;
  //csParam.verbose = QUDA_DEBUG_VERBOSE;

  spinor = new cpuColorSpinorField(csParam);
  spinorOut = new cpuColorSpinorField(csParam);
  spinorRef = new cpuColorSpinorField(csParam);

  // Restore the full-lattice settings before reusing csParam below.
  csParam.siteSubset = QUDA_FULL_SITE_SUBSET;
  csParam.x[0] = gauge_param.X[0];

  printfQuda("Randomizing fields... ");

  if (strcmp(latfile,"")) { // load in the command line supplied gauge field
    read_gauge_field(latfile, hostGauge, gauge_param.cpu_prec, gauge_param.X, argc, argv);
    construct_gauge_field(hostGauge, 2, gauge_param.cpu_prec, &gauge_param);
  } else { // else generate a random SU(3) field
    construct_gauge_field(hostGauge, 1, gauge_param.cpu_prec, &gauge_param);
  }

  spinor->Source(QUDA_RANDOM_SOURCE);

  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
    double norm = 0.0; // clover components are random numbers in the range (-norm, norm)
    double diag = 1.0; // constant added to the diagonal

    if (test_type == 2) {
      construct_clover_field(hostClover, norm, diag, inv_param.clover_cpu_prec);
    } else {
      construct_clover_field(hostCloverInv, norm, diag, inv_param.clover_cpu_prec);
    }
  }
  printfQuda("done.\n"); fflush(stdout);

  initQuda(device);

  printfQuda("Sending gauge field to GPU\n");
  loadGaugeQuda(hostGauge, &gauge_param);

  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
    printfQuda("Sending clover field to GPU\n");
    loadCloverQuda(hostClover, hostCloverInv, &inv_param);
    //clover = cudaCloverPrecise;
  }

  if (!transfer) {
    // Device-resident spinors: float2 layout for double precision,
    // float4 for single/half; UKQCD basis on the device.
    csParam.fieldLocation = QUDA_CUDA_FIELD_LOCATION;
    csParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;
    csParam.pad = inv_param.sp_pad;
    csParam.precision = inv_param.cuda_prec;
    if (csParam.precision == QUDA_DOUBLE_PRECISION ) {
      csParam.fieldOrder = QUDA_FLOAT2_FIELD_ORDER;
    } else {
      /* Single and half */
      csParam.fieldOrder = QUDA_FLOAT4_FIELD_ORDER;
    }

    if (test_type < 2) {
      csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
      csParam.x[0] /= 2;
    }

    printfQuda("Creating cudaSpinor\n");
    cudaSpinor = new cudaColorSpinorField(csParam);
    printfQuda("Creating cudaSpinorOut\n");
    cudaSpinorOut = new cudaColorSpinorField(csParam);

    // Temporaries are always single parity.
    if (test_type == 2) csParam.x[0] /= 2;

    csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
    tmp1 = new cudaColorSpinorField(csParam);
    if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_MASS_DSLASH) {
      tmp2 = new cudaColorSpinorField(csParam);
    }

    printfQuda("Sending spinor field to GPU\n");
    *cudaSpinor = *spinor;

    std::cout << "Source: CPU = " << norm2(*spinor) << ", CUDA = " << norm2(*cudaSpinor) << std::endl;

    // Build the Dirac operator (even-odd preconditioned unless test_type 2).
    bool pc = (test_type != 2);
    DiracParam diracParam;
    setDiracParam(diracParam, &inv_param, pc);
    diracParam.verbose = QUDA_VERBOSE;
    diracParam.tmp1 = tmp1;
    diracParam.tmp2 = tmp2;
    dirac = Dirac::create(diracParam);
  } else {
    std::cout << "Source: CPU = " << norm2(*spinor) << std::endl;
  }
}
// Gauge-force test: computes the gauge-force contribution to the momentum on
// the GPU via computeGaugeForceQuda and, if requested, checks it against the
// CPU reference implementation; reports timing and GFLOPS.
static void gauge_force_test(void)
{
  int max_length = 6;

  initQuda(device);
  setVerbosityQuda(QUDA_VERBOSE,"",stdout);

  qudaGaugeParam = newQudaGaugeParam();

  qudaGaugeParam.X[0] = xdim;
  qudaGaugeParam.X[1] = ydim;
  qudaGaugeParam.X[2] = zdim;
  qudaGaugeParam.X[3] = tdim;

  setDims(qudaGaugeParam.X);

  qudaGaugeParam.anisotropy = 1.0;
  qudaGaugeParam.cpu_prec = link_prec;
  qudaGaugeParam.cuda_prec = link_prec;
  qudaGaugeParam.cuda_prec_sloppy = link_prec;
  qudaGaugeParam.reconstruct = link_recon;
  qudaGaugeParam.reconstruct_sloppy = link_recon;
  qudaGaugeParam.type = QUDA_SU3_LINKS; // in this context, just means these are site links

  qudaGaugeParam.gauge_order = gauge_order;
  qudaGaugeParam.t_boundary = QUDA_PERIODIC_T;
  qudaGaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO;
  qudaGaugeParam.ga_pad = 0;
  qudaGaugeParam.mom_ga_pad = 0;

  // NOTE(review): the precision enum is used directly as a byte size here —
  // presumably its values equal sizeof(float)/sizeof(double); confirm.
  size_t gSize = qudaGaugeParam.cpu_prec;

  void* sitelink;
  void* sitelink_1d;
#ifdef GPU_DIRECT
  sitelink_1d = pinned_malloc(4*V*gaugeSiteSize*gSize);
#else
  sitelink_1d = safe_malloc(4*V*gaugeSiteSize*gSize);
#endif

  // this is a hack to have site link generated in 2d
  // then copied to 1d array in "MILC" format
  void* sitelink_2d[4];
#ifdef GPU_DIRECT
  for(int i=0;i<4;i++) sitelink_2d[i] = pinned_malloc(V*gaugeSiteSize*qudaGaugeParam.cpu_prec);
#else
  for(int i=0;i<4;i++) sitelink_2d[i] = safe_malloc(V*gaugeSiteSize*qudaGaugeParam.cpu_prec);
#endif

  // fills the gauge field with random numbers
  createSiteLinkCPU(sitelink_2d, qudaGaugeParam.cpu_prec, 0);

  //copy the 2d sitelink to 1d milc format
  for(int dir = 0; dir < 4; dir++){
    for(int i=0; i < V; i++){
      char* src = ((char*)sitelink_2d[dir]) + i * gaugeSiteSize* qudaGaugeParam.cpu_prec;
      char* dst = ((char*)sitelink_1d) + (4*i+dir)*gaugeSiteSize*qudaGaugeParam.cpu_prec ;
      memcpy(dst, src, gaugeSiteSize*qudaGaugeParam.cpu_prec);
    }
  }

  // Pick the layout that matches the requested gauge order.
  if (qudaGaugeParam.gauge_order == QUDA_MILC_GAUGE_ORDER){
    sitelink = sitelink_1d;
  }else if (qudaGaugeParam.gauge_order == QUDA_QDP_GAUGE_ORDER) {
    sitelink = (void**)sitelink_2d;
  } else {
    errorQuda("Unsupported gauge order %d", qudaGaugeParam.gauge_order);
  }

#ifdef MULTI_GPU
  // Build the "extended" lattice (local volume plus a 2-site border in every
  // direction) needed by the multi-GPU CPU reference computation.
  void* sitelink_ex_2d[4];
  void* sitelink_ex_1d;

  sitelink_ex_1d = pinned_malloc(4*V_ex*gaugeSiteSize*gSize);
  for(int i=0;i < 4;i++) sitelink_ex_2d[i] = pinned_malloc(V_ex*gaugeSiteSize*gSize);

  int X1= Z[0];
  int X2= Z[1];
  int X3= Z[2];
  int X4= Z[3];

  for(int i=0; i < V_ex; i++){
    int sid = i;
    int oddBit=0;
    if(i >= Vh_ex){
      sid = i - Vh_ex;
      oddBit = 1;
    }

    // Decode the even/odd lexicographic index on the extended lattice.
    int za = sid/E1h;
    int x1h = sid - za*E1h;
    int zb = za/E2;
    int x2 = za - zb*E2;
    int x4 = zb/E3;
    int x3 = zb - x4*E3;
    int x1odd = (x2 + x3 + x4 + oddBit) & 1;
    int x1 = 2*x1h + x1odd;

    // Skip border sites; only interior sites map back to the local lattice.
    if( x1< 2 || x1 >= X1 +2 ||
        x2< 2 || x2 >= X2 +2 ||
        x3< 2 || x3 >= X3 +2 ||
        x4< 2 || x4 >= X4 +2){
      continue;
    }

    // Shift by the border offset and wrap to get the interior coordinate.
    x1 = (x1 - 2 + X1) % X1;
    x2 = (x2 - 2 + X2) % X2;
    x3 = (x3 - 2 + X3) % X3;
    x4 = (x4 - 2 + X4) % X4;

    int idx = (x4*X3*X2*X1+x3*X2*X1+x2*X1+x1)>>1;
    if(oddBit){
      idx += Vh;
    }
    for(int dir= 0; dir < 4; dir++){
      char* src = (char*)sitelink_2d[dir];
      char* dst = (char*)sitelink_ex_2d[dir];
      memcpy(dst+i*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
    }//dir
  }//i

  // Repack the extended field into 1d MILC layout as well.
  for(int dir = 0; dir < 4; dir++){
    for(int i=0; i < V_ex; i++){
      char* src = ((char*)sitelink_ex_2d[dir]) + i * gaugeSiteSize* qudaGaugeParam.cpu_prec;
      char* dst = ((char*)sitelink_ex_1d) + (4*i+dir)*gaugeSiteSize*qudaGaugeParam.cpu_prec ;
      memcpy(dst, src, gaugeSiteSize*qudaGaugeParam.cpu_prec);
    }
  }
#endif

  // `mom` is updated by the GPU; `refmom` starts identical and is updated by
  // the CPU reference for comparison.
  void* mom = safe_malloc(4*V*momSiteSize*gSize);
  void* refmom = safe_malloc(4*V*momSiteSize*gSize);

  memset(mom, 0, 4*V*momSiteSize*gSize);
  //initialize some data in cpuMom
  createMomCPU(mom, qudaGaugeParam.cpu_prec);
  memcpy(refmom, mom, 4*V*momSiteSize*gSize);

  double loop_coeff_d[sizeof(loop_coeff_f)/sizeof(float)];
  for(unsigned int i=0;i < sizeof(loop_coeff_f)/sizeof(float); i++){
    loop_coeff_d[i] = loop_coeff_f[i];
  }

  // Precision-matched coefficient pointer for the CPU reference below.
  void* loop_coeff;
  if(qudaGaugeParam.cuda_prec == QUDA_SINGLE_PRECISION){
    loop_coeff = (void*)&loop_coeff_f[0];
  }else{
    loop_coeff = loop_coeff_d;
  }
  double eb3 = 0.3;
  int num_paths = sizeof(path_dir_x)/sizeof(path_dir_x[0]);

  // Per-direction copies of the path tables.
  int** input_path_buf[4];
  for(int dir =0; dir < 4; dir++){
    input_path_buf[dir] = (int**)safe_malloc(num_paths*sizeof(int*));
    for(int i=0;i < num_paths;i++){
      input_path_buf[dir][i] = (int*)safe_malloc(length[i]*sizeof(int));
      if(dir == 0) memcpy(input_path_buf[dir][i], path_dir_x[i], length[i]*sizeof(int));
      else if(dir ==1) memcpy(input_path_buf[dir][i], path_dir_y[i], length[i]*sizeof(int));
      else if(dir ==2) memcpy(input_path_buf[dir][i], path_dir_z[i], length[i]*sizeof(int));
      else if(dir ==3) memcpy(input_path_buf[dir][i], path_dir_t[i], length[i]*sizeof(int));
    }
  }

  if (tune) {
    printfQuda("Tuning...\n");
    setTuning(QUDA_TUNE_YES);
  }

  struct timeval t0, t1;
  double timeinfo[3];
  /* Multiple execution to exclude warmup time in the first run*/
  // NOTE(review): t0/t1 are overwritten each iteration, so total_time below
  // reflects only the final attempt.
  for (int i =0;i < attempts; i++){
    gettimeofday(&t0, NULL);
    computeGaugeForceQuda(mom, sitelink, input_path_buf, length,
                          loop_coeff_d, num_paths, max_length, eb3,
                          &qudaGaugeParam, timeinfo);
    gettimeofday(&t1, NULL);
  }

  double total_time = t1.tv_sec - t0.tv_sec + 0.000001*(t1.tv_usec - t0.tv_usec);
  //The number comes from CPU implementation in MILC, gauge_force_imp.c
  int flops=153004;

  if (verify_results){
    for(int i = 0;i < attempts;i++){
#ifdef MULTI_GPU
      //last arg=0 means no optimization for communication, i.e. exchange data in all directions
      //even they are not partitioned
      int R[4] = {2, 2, 2, 2};
      exchange_cpu_sitelink_ex(qudaGaugeParam.X, R, (void**)sitelink_ex_2d,
                               QUDA_QDP_GAUGE_ORDER, qudaGaugeParam.cpu_prec, 0, 4);
      gauge_force_reference(refmom, eb3, sitelink_2d, sitelink_ex_2d, qudaGaugeParam.cpu_prec,
                            input_path_buf, length, loop_coeff, num_paths);
#else
      gauge_force_reference(refmom, eb3, sitelink_2d, NULL, qudaGaugeParam.cpu_prec,
                            input_path_buf, length, loop_coeff, num_paths);
#endif
    }

    int res;
    res = compare_floats(mom, refmom, 4*V*momSiteSize, 1e-3, qudaGaugeParam.cpu_prec);

    strong_check_mom(mom, refmom, 4*V, qudaGaugeParam.cpu_prec);

    printf("Test %s\n",(1 == res) ? "PASSED" : "FAILED");
  }

  double perf = 1.0* flops*V/(total_time*1e+9);
  double kernel_perf = 1.0*flops*V/(timeinfo[1]*1e+9);
  printf("init and cpu->gpu time: %.2f ms, kernel time: %.2f ms, gpu->cpu and cleanup time: %.2f total time =%.2f ms\n",
         timeinfo[0]*1e+3, timeinfo[1]*1e+3, timeinfo[2]*1e+3, total_time*1e+3);
  printf("kernel performance: %.2f GFLOPS, overall performance : %.2f GFLOPS\n", kernel_perf, perf);

  // Release host buffers and shut down QUDA.
  for(int dir = 0; dir < 4; dir++){
    for(int i=0;i < num_paths; i++) host_free(input_path_buf[dir][i]);
    host_free(input_path_buf[dir]);
  }

  host_free(sitelink_1d);
  for(int dir=0;dir < 4;dir++) host_free(sitelink_2d[dir]);

#ifdef MULTI_GPU
  host_free(sitelink_ex_1d);
  for(int dir=0; dir < 4; dir++) host_free(sitelink_ex_2d[dir]);
#endif

  host_free(mom);
  host_free(refmom);
  endQuda();
}
static int unitarize_link_test() { QudaGaugeParam qudaGaugeParam = newQudaGaugeParam(); initQuda(0); cpu_prec = prec; gSize = cpu_prec; qudaGaugeParam.anisotropy = 1.0; qudaGaugeParam.X[0] = xdim; qudaGaugeParam.X[1] = ydim; qudaGaugeParam.X[2] = zdim; qudaGaugeParam.X[3] = tdim; setDims(qudaGaugeParam.X); QudaPrecision link_prec = QUDA_SINGLE_PRECISION; QudaReconstructType link_recon = QUDA_RECONSTRUCT_NO; qudaGaugeParam.cpu_prec = link_prec; qudaGaugeParam.cuda_prec = link_prec; qudaGaugeParam.reconstruct = link_recon; qudaGaugeParam.type = QUDA_WILSON_LINKS; hisq::fermion_force::hisqForceInitCuda(&qudaGaugeParam); qudaGaugeParam.t_boundary = QUDA_PERIODIC_T; qudaGaugeParam.anisotropy = 1.0; qudaGaugeParam.cuda_prec_sloppy = prec; qudaGaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; qudaGaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO; qudaGaugeParam.ga_pad = 0; qudaGaugeParam.packed_size = 0; qudaGaugeParam.gaugeGiB = 0; qudaGaugeParam.flag = false; qudaGaugeParam.cpu_prec = cpu_prec; qudaGaugeParam.cuda_prec = prec; qudaGaugeParam.gauge_order = gauge_order; qudaGaugeParam.type=QUDA_WILSON_LINKS; qudaGaugeParam.reconstruct = link_recon; qudaGaugeParam.flag = QUDA_FAT_PRESERVE_CPU_GAUGE | QUDA_FAT_PRESERVE_GPU_GAUGE | QUDA_FAT_PRESERVE_COMM_MEM; setFatLinkPadding(QUDA_COMPUTE_FAT_STANDARD, &qudaGaugeParam); GaugeFieldParam gParam(0, qudaGaugeParam); gParam.pad = 0; gParam.create = QUDA_REFERENCE_FIELD_CREATE; gParam.link_type = QUDA_WILSON_LINKS; gParam.order = QUDA_MILC_GAUGE_ORDER; cpuGaugeField *cpuOutLink = new cpuGaugeField(gParam); gParam.pad = 0; gParam.create = QUDA_NULL_FIELD_CREATE; gParam.link_type = QUDA_WILSON_LINKS; gParam.order = QUDA_QDP_GAUGE_ORDER; gParam.reconstruct = QUDA_RECONSTRUCT_NO; cudaGaugeField *cudaFatLink = new cudaGaugeField(gParam); cudaGaugeField *cudaULink = new cudaGaugeField(gParam); initCommonConstants(*cudaFatLink); void* fatlink = (void*)malloc(4*V*gaugeSiteSize*gSize); if(fatlink == NULL){ errorQuda("ERROR: allocating fatlink 
failed\n"); } void* sitelink[4]; for(int i=0;i < 4;i++){ cudaMallocHost((void**)&sitelink[i], V*gaugeSiteSize*gSize); if(sitelink[i] == NULL){ errorQuda("ERROR; allocate sitelink[%d] failed\n", i); } } createSiteLinkCPU(sitelink, qudaGaugeParam.cpu_prec, 1); double act_path_coeff[6]; act_path_coeff[0] = 0.625000; act_path_coeff[1] = -0.058479; act_path_coeff[2] = -0.087719; act_path_coeff[3] = 0.030778; act_path_coeff[4] = -0.007200; act_path_coeff[5] = -0.123113; //only record the last call's performance //the first one is for creating the cpu/cuda data structures if(gauge_order == QUDA_QDP_GAUGE_ORDER){ computeFatLinkQuda(fatlink, sitelink, act_path_coeff, &qudaGaugeParam, QUDA_COMPUTE_FAT_STANDARD); } // gauge order is QDP_GAUGE_ORDER cpuOutLink->setGauge((void**)fatlink); cudaFatLink->loadCPUField(*cpuOutLink, QUDA_CPU_FIELD_LOCATION); hisq::setUnitarizeLinksConstants(unitarize_eps, max_allowed_error, reunit_allow_svd, reunit_svd_only, svd_rel_error, svd_abs_error); hisq::setUnitarizeLinksPadding(0,0); int* num_failures_dev; cudaMalloc(&num_failures_dev, sizeof(int)); cudaMemset(num_failures_dev, 0, sizeof(int)); struct timeval t0, t1; gettimeofday(&t0,NULL); hisq::unitarizeLinksCuda(qudaGaugeParam,*cudaFatLink, cudaULink, num_failures_dev); cudaThreadSynchronize(); gettimeofday(&t1,NULL); int num_failures=0; cudaMemcpy(&num_failures, num_failures_dev, sizeof(int), cudaMemcpyDeviceToHost); delete cudaFatLink; delete cudaULink; for(int dir=0; dir<4; ++dir) cudaFreeHost(sitelink[dir]); cudaFree(num_failures_dev); #ifdef MULTI_GPU exchange_llfat_cleanup(); #endif endQuda(); printfQuda("Unitarization time: %g ms\n", TDIFF(t0,t1)*1000); return num_failures; }
// Staggered-fermion inversion test driver.
// Builds random fat/long links, loads them into QUDA, runs either a single
// CG solve (testtype 0/1) or a multi-shift solve (testtype 3/4/6), then
// verifies the solution against a CPU reference operator.
// Returns 0 on success, non-zero if the measured residual is more than two
// orders of magnitude above the requested tolerance.
// Uses file-scope globals: fatlink, longlink, ghost_* arrays, in/out/ref/tmp
// spinors, testtype, sdim/tdim, prec settings.
int invert_test(void) {
  QudaGaugeParam gaugeParam = newQudaGaugeParam();
  QudaInvertParam inv_param = newQudaInvertParam();

  double mass = 0.95;
  set_params(&gaugeParam, &inv_param, sdim, sdim, sdim, tdim, cpu_prec, prec,
             prec_sloppy, link_recon, link_recon_sloppy, mass, tol, 500, 1e-3, 0.8);

  // this must be before the FaceBuffer is created (this is because it allocates pinned memory - FIXME)
  initQuda(device);

  setDims(gaugeParam.X);
  setDimConstants(gaugeParam.X);

  // bytes per real in the host gauge field
  size_t gSize = (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);

  for (int dir = 0; dir < 4; dir++) {
    fatlink[dir] = malloc(V*gaugeSiteSize*gSize);
    longlink[dir] = malloc(V*gaugeSiteSize*gSize);
  }
  construct_fat_long_gauge_field(fatlink, longlink, 1, gaugeParam.cpu_prec, &gaugeParam);

  // overwrite the fat links with uniform random entries in [0, 0.5)
  for (int dir = 0; dir < 4; dir++) {
    for(int i = 0;i < V*gaugeSiteSize;i++){
      if (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION){
        ((double*)fatlink[dir])[i] = 0.5 *rand()/RAND_MAX;
      }else{
        ((float*)fatlink[dir])[i] = 0.5* rand()/RAND_MAX;
      }
    }
  }

#ifdef MULTI_GPU
  //exchange_init_dims(gaugeParam.X);
  // per-direction ghost-zone sizes in bytes (one face of each dimension)
  int ghost_link_len[4] = { Vs_x*gaugeSiteSize*gSize, Vs_y*gaugeSiteSize*gSize,
                            Vs_z*gaugeSiteSize*gSize, Vs_t*gaugeSiteSize*gSize };
  for(int i=0;i < 4;i++){
    ghost_fatlink[i] = malloc(ghost_link_len[i]);
    ghost_longlink[i] = malloc(3*ghost_link_len[i]); // long links span 3 sites
    if (ghost_fatlink[i] == NULL || ghost_longlink[i] == NULL){
      printf("ERROR: malloc failed for ghost fatlink or ghost longlink\n");
      exit(1);
    }
  }
  //exchange_cpu_links4dir(fatlink, ghost_fatlink, longlink, ghost_longlink, gaugeParam.cpu_prec);

  // pack boundary slices into send buffers and exchange them with neighbors
  void *fat_send[4], *long_send[4];
  for(int i=0;i < 4;i++){
    fat_send[i] = malloc(ghost_link_len[i]);
    long_send[i] = malloc(3*ghost_link_len[i]);
  }
  set_dim(Z);
  pack_ghost(fatlink, fat_send, 1, gaugeParam.cpu_prec);
  pack_ghost(longlink, long_send, 3, gaugeParam.cpu_prec);

  int dummyFace = 1;
  FaceBuffer faceBuf (Z, 4, 18, dummyFace, gaugeParam.cpu_prec);
  faceBuf.exchangeCpuLink((void**)ghost_fatlink, (void**)fat_send, 1);
  faceBuf.exchangeCpuLink((void**)ghost_longlink, (void**)long_send, 3);

  for (int i=0; i<4; i++) {
    free(fat_send[i]);
    free(long_send[i]);
  }
#endif

  // single-parity (even/odd) host spinors for source, solution, and reference
  ColorSpinorParam csParam;
  csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION;
  csParam.nColor=3;
  csParam.nSpin=1; // staggered fermions
  csParam.nDim=4;
  for(int d = 0; d < 4; d++) { csParam.x[d] = gaugeParam.X[d]; }
  csParam.x[0] /= 2; // parity subset: half the checkerboarded x extent
  csParam.precision = inv_param.cpu_prec;
  csParam.pad = 0;
  csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
  csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
  csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
  csParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
  csParam.create = QUDA_ZERO_FIELD_CREATE;
  in = new cpuColorSpinorField(csParam);
  out = new cpuColorSpinorField(csParam);
  ref = new cpuColorSpinorField(csParam);
  tmp = new cpuColorSpinorField(csParam);

  if (inv_param.cpu_prec == QUDA_SINGLE_PRECISION){
    constructSpinorField((float*)in->v);
  }else{
    constructSpinorField((double*)in->v);
  }

#ifdef MULTI_GPU
  if(testtype == 6){
    // testtype 6 records the gauge field for the mixed-precision multi-shift path
    record_gauge(fatlink, ghost_fatlink[3], Vsh_t,
                 longlink, ghost_longlink[3], 3*Vsh_t,
                 link_recon, link_recon_sloppy, &gaugeParam);
  }else{
    // pad by the largest face so ghost zones fit
    gaugeParam.type = QUDA_ASQTAD_FAT_LINKS;
    gaugeParam.ga_pad = MAX(sdim*sdim*sdim/2, sdim*sdim*tdim/2);
    gaugeParam.reconstruct= gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO;
    loadGaugeQuda(fatlink, &gaugeParam);

    gaugeParam.type = QUDA_ASQTAD_LONG_LINKS;
    gaugeParam.ga_pad = 3*MAX(sdim*sdim*sdim/2, sdim*sdim*tdim/2);
    gaugeParam.reconstruct= link_recon;
    gaugeParam.reconstruct_sloppy = link_recon_sloppy;
    loadGaugeQuda(longlink, &gaugeParam);
  }
#else
  gaugeParam.type = QUDA_ASQTAD_FAT_LINKS;
  gaugeParam.reconstruct = gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO;
  loadGaugeQuda(fatlink, &gaugeParam);

  gaugeParam.type = QUDA_ASQTAD_LONG_LINKS;
  gaugeParam.reconstruct = link_recon;
  gaugeParam.reconstruct_sloppy = link_recon_sloppy;
  loadGaugeQuda(longlink, &gaugeParam);
#endif

  double time0 = -((double)clock()); // Start the timer

  unsigned long volume = Vh;
  unsigned long nflops=2*1187; //from MILC's CG routine
  double nrm2=0;
  double src2=0;
  int ret = 0;

  switch(testtype){
  case 0: //even
    volume = Vh;
    inv_param.solution_type = QUDA_MATPCDAG_MATPC_SOLUTION;
    inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
    invertQuda(out->v, in->v, &inv_param);

    time0 += clock(); // stop the timer
    time0 /= CLOCKS_PER_SEC;

    // apply (M^dag M) on the CPU to the solution and compare with the source
#ifdef MULTI_GPU
    matdagmat_mg4dir(ref, fatlink, ghost_fatlink, longlink, ghost_longlink,
                     out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec,
                     tmp, QUDA_EVEN_PARITY);
#else
    matdagmat(ref->v, fatlink, longlink, out->v, mass, 0, inv_param.cpu_prec,
              gaugeParam.cpu_prec, tmp->v, QUDA_EVEN_PARITY);
#endif

    mxpy(in->v, ref->v, Vh*mySpinorSiteSize, inv_param.cpu_prec);
    nrm2 = norm_2(ref->v, Vh*mySpinorSiteSize, inv_param.cpu_prec);
    src2 = norm_2(in->v, Vh*mySpinorSiteSize, inv_param.cpu_prec);
    break;

  case 1: //odd
    volume = Vh;
    inv_param.solution_type = QUDA_MATPCDAG_MATPC_SOLUTION;
    inv_param.matpc_type = QUDA_MATPC_ODD_ODD;
    invertQuda(out->v, in->v, &inv_param);

    time0 += clock(); // stop the timer
    time0 /= CLOCKS_PER_SEC;

#ifdef MULTI_GPU
    matdagmat_mg4dir(ref, fatlink, ghost_fatlink, longlink, ghost_longlink,
                     out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec,
                     tmp, QUDA_ODD_PARITY);
#else
    matdagmat(ref->v, fatlink, longlink, out->v, mass, 0, inv_param.cpu_prec,
              gaugeParam.cpu_prec, tmp->v, QUDA_ODD_PARITY);
#endif

    mxpy(in->v, ref->v, Vh*mySpinorSiteSize, inv_param.cpu_prec);
    nrm2 = norm_2(ref->v, Vh*mySpinorSiteSize, inv_param.cpu_prec);
    src2 = norm_2(in->v, Vh*mySpinorSiteSize, inv_param.cpu_prec);
    break;

  case 2: //full spinor
    errorQuda("full spinor not supported\n");
    break;

  case 3: //multi mass CG, even
  case 4:
  case 5:
  case 6:
    // shared body for all multi-shift variants; fall-through is intentional
#define NUM_OFFSETS 4
    nflops = 2*(1205 + 15* NUM_OFFSETS); //from MILC's multimass CG routine
    double masses[NUM_OFFSETS] ={5.05, 1.23, 2.64, 2.33};
    double offsets[NUM_OFFSETS];
    int num_offsets =NUM_OFFSETS;
    void* outArray[NUM_OFFSETS];
    int len;

    cpuColorSpinorField* spinorOutArray[NUM_OFFSETS];
    spinorOutArray[0] = out;
    for(int i=1;i < num_offsets; i++){
      spinorOutArray[i] = new cpuColorSpinorField(csParam);
    }
    for(int i=0;i < num_offsets; i++){
      outArray[i] = spinorOutArray[i]->v;
    }

    // shift for mass m is 4*m^2 (staggered normal equations)
    for (int i=0; i< num_offsets;i++){
      offsets[i] = 4*masses[i]*masses[i];
    }

    len=Vh;
    volume = Vh;
    inv_param.solution_type = QUDA_MATPCDAG_MATPC_SOLUTION;

    if (testtype == 3){
      inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
    } else if (testtype == 4||testtype == 6){
      inv_param.matpc_type = QUDA_MATPC_ODD_ODD;
    }else { //testtype ==5
      errorQuda("test 5 not supported\n");
    }

    double residue_sq;
    if (testtype == 6){
      // mixed-precision path currently disabled
      //invertMultiShiftQudaMixed(spinorOutArray, in->v, &inv_param, offsets, num_offsets, &residue_sq);
    }else{
      invertMultiShiftQuda(outArray, in->v, &inv_param, offsets, num_offsets, &residue_sq);
    }
    cudaThreadSynchronize();
    printfQuda("Final residue squred =%g\n", residue_sq);

    time0 += clock(); // stop the timer
    time0 /= CLOCKS_PER_SEC;

    printfQuda("done: total time = %g secs, %i iter / %g secs = %g gflops, \n",
               time0, inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs);
    printfQuda("checking the solution\n");

    QudaParity parity;
    if (inv_param.solve_type == QUDA_NORMEQ_SOLVE){
      //parity = QUDA_EVENODD_PARITY;
      errorQuda("full parity not supported\n");
    }else if (inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN){
      parity = QUDA_EVEN_PARITY;
    }else if (inv_param.matpc_type == QUDA_MATPC_ODD_ODD){
      parity = QUDA_ODD_PARITY;
    }else{
      errorQuda("ERROR: invalid spinor parity \n");
      exit(1);
    }

    // verify each shift's solution against the CPU operator at its mass
    for(int i=0;i < num_offsets;i++){
      printfQuda("%dth solution: mass=%f, ", i, masses[i]);
#ifdef MULTI_GPU
      matdagmat_mg4dir(ref, fatlink, ghost_fatlink, longlink, ghost_longlink,
                       spinorOutArray[i], masses[i], 0, inv_param.cpu_prec,
                       gaugeParam.cpu_prec, tmp, parity);
#else
      matdagmat(ref->v, fatlink, longlink, outArray[i], masses[i], 0,
                inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->v, parity);
#endif

      mxpy(in->v, ref->v, len*mySpinorSiteSize, inv_param.cpu_prec);
      double nrm2 = norm_2(ref->v, len*mySpinorSiteSize, inv_param.cpu_prec);
      double src2 = norm_2(in->v, len*mySpinorSiteSize, inv_param.cpu_prec);
      printfQuda("relative residual, requested = %g, actual = %g\n",
                 inv_param.tol, sqrt(nrm2/src2));

      //emperical, if the cpu residue is more than 2 order the target accuracy, the it fails to converge
      if (sqrt(nrm2/src2) > 100*inv_param.tol){
        ret |=1;
        errorQuda("Converge failed!\n");
      }
    }

    for(int i=1; i < num_offsets;i++){
      delete spinorOutArray[i];
    }
  }//switch

  // single-solve cases report their residual here (multi-shift reported above)
  if (testtype <=2){
    printfQuda("Relative residual, requested = %g, actual = %g\n",
               inv_param.tol, sqrt(nrm2/src2));
    printfQuda("done: total time = %g secs, %i iter / %g secs = %g gflops, \n",
               time0, inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs);

    //emperical, if the cpu residue is more than 2 order the target accuracy, the it fails to converge
    if (sqrt(nrm2/src2) > 100*inv_param.tol){
      ret = 1;
      errorQuda("Convergence failed!\n");
    }
  }

  end();
  return ret;
}
int main(int argc, char **argv) { // set QUDA parameters int device = 0; // CUDA device number QudaPrecision cpu_prec = QUDA_DOUBLE_PRECISION; QudaPrecision cuda_prec = QUDA_SINGLE_PRECISION; QudaPrecision cuda_prec_sloppy = QUDA_HALF_PRECISION; QudaGaugeParam gauge_param = newQudaGaugeParam(); QudaInvertParam inv_param = newQudaInvertParam(); gauge_param.X[0] = 16; gauge_param.X[1] = 16; gauge_param.X[2] = 16; gauge_param.X[3] = 16; inv_param.Ls = 16; gauge_param.anisotropy = 1.0; gauge_param.type = QUDA_WILSON_LINKS; gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER; gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T; gauge_param.cpu_prec = cpu_prec; gauge_param.cuda_prec = cuda_prec; gauge_param.reconstruct = QUDA_RECONSTRUCT_12; gauge_param.cuda_prec_sloppy = cuda_prec_sloppy; gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_12; gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO; inv_param.dslash_type = QUDA_DOMAIN_WALL_DSLASH; inv_param.inv_type = QUDA_CG_INVERTER; inv_param.mass = 0.01; inv_param.m5 = -1.5; double kappa5 = 0.5/(5 + inv_param.m5); inv_param.tol = 5e-8; inv_param.maxiter = 1000; inv_param.reliable_delta = 0.1; inv_param.solution_type = QUDA_MAT_SOLUTION; inv_param.solve_type = QUDA_NORMEQ_PC_SOLVE; inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; inv_param.dagger = QUDA_DAG_NO; inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION; inv_param.cpu_prec = cpu_prec; inv_param.cuda_prec = cuda_prec; inv_param.cuda_prec_sloppy = cuda_prec_sloppy; inv_param.prec_precondition = cuda_prec_sloppy; inv_param.preserve_source = QUDA_PRESERVE_SOURCE_NO; inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; inv_param.dirac_order = QUDA_DIRAC_ORDER; inv_param.dirac_tune = QUDA_TUNE_YES; inv_param.preserve_dirac = QUDA_PRESERVE_DIRAC_YES; gauge_param.ga_pad = 0; // 24*24*24; inv_param.sp_pad = 0; // 24*24*24; inv_param.cl_pad = 0; // 24*24*24; inv_param.verbosity = QUDA_VERBOSE; // Everything between here and the call to initQuda() is application-specific. 
// set parameters for the reference Dslash, and prepare fields to be loaded setDims(gauge_param.X, inv_param.Ls); size_t gSize = (gauge_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float); size_t sSize = (inv_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float); void *gauge[4]; for (int dir = 0; dir < 4; dir++) { gauge[dir] = malloc(V*gaugeSiteSize*gSize); } construct_gauge_field(gauge, 1, gauge_param.cpu_prec, &gauge_param); void *spinorIn = malloc(V*spinorSiteSize*sSize*inv_param.Ls); void *spinorOut = malloc(V*spinorSiteSize*sSize*inv_param.Ls); void *spinorCheck = malloc(V*spinorSiteSize*sSize*inv_param.Ls); // create a point source at 0 if (inv_param.cpu_prec == QUDA_SINGLE_PRECISION) *((float*)spinorIn) = 1.0; else *((double*)spinorIn) = 1.0; // start the timer double time0 = -((double)clock()); // initialize the QUDA library initQuda(device); // load the gauge field loadGaugeQuda((void*)gauge, &gauge_param); // perform the inversion invertQuda(spinorOut, spinorIn, &inv_param); // stop the timer time0 += clock(); time0 /= CLOCKS_PER_SEC; printf("Device memory used:\n Spinor: %f GiB\n Gauge: %f GiB\n", inv_param.spinorGiB, gauge_param.gaugeGiB); printf("\nDone: %i iter / %g secs = %g Gflops, total time = %g secs\n", inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs, time0); if (inv_param.solution_type == QUDA_MAT_SOLUTION) { mat(spinorCheck, gauge, spinorOut, kappa5, 0, inv_param.cpu_prec, gauge_param.cpu_prec, inv_param.mass); if (inv_param.mass_normalization == QUDA_MASS_NORMALIZATION) ax(0.5/kappa5, spinorCheck, V*spinorSiteSize, inv_param.cpu_prec); } else if(inv_param.solution_type == QUDA_MATPC_SOLUTION) { matpc(spinorCheck, gauge, spinorOut, kappa5, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param.cpu_prec, inv_param.mass); if (inv_param.mass_normalization == QUDA_MASS_NORMALIZATION) ax(0.25/(kappa5*kappa5), spinorCheck, V*spinorSiteSize, inv_param.cpu_prec); } mxpy(spinorIn, spinorCheck, 
V*spinorSiteSize, inv_param.cpu_prec); double nrm2 = norm_2(spinorCheck, V*spinorSiteSize, inv_param.cpu_prec); double src2 = norm_2(spinorIn, V*spinorSiteSize, inv_param.cpu_prec); printf("Relative residual: requested = %g, actual = %g\n", inv_param.tol, sqrt(nrm2/src2)); // finalize the QUDA library endQuda(); return 0; }
void _initQuda() { if( quda_initialized ) return; if( g_debug_level > 0 ) if(g_proc_id == 0) printf("\n# QUDA: Detected QUDA version %d.%d.%d\n\n", QUDA_VERSION_MAJOR, QUDA_VERSION_MINOR, QUDA_VERSION_SUBMINOR); if( QUDA_VERSION_MAJOR == 0 && QUDA_VERSION_MINOR < 7) { fprintf(stderr, "Error: minimum QUDA version required is 0.7.0 (for support of chiral basis and removal of bug in mass normalization with preconditioning).\n"); exit(-2); } gauge_param = newQudaGaugeParam(); inv_param = newQudaInvertParam(); // *** QUDA parameters begin here (sloppy prec. will be adjusted in invert) QudaPrecision cpu_prec = QUDA_DOUBLE_PRECISION; QudaPrecision cuda_prec = QUDA_DOUBLE_PRECISION; QudaPrecision cuda_prec_sloppy = QUDA_SINGLE_PRECISION; QudaPrecision cuda_prec_precondition = QUDA_HALF_PRECISION; QudaTune tune = QUDA_TUNE_YES; // *** the remainder should not be changed for this application // local lattice size #if USE_LZ_LY_LX_T gauge_param.X[0] = LZ; gauge_param.X[1] = LY; gauge_param.X[2] = LX; gauge_param.X[3] = T; #else gauge_param.X[0] = LX; gauge_param.X[1] = LY; gauge_param.X[2] = LZ; gauge_param.X[3] = T; #endif inv_param.Ls = 1; gauge_param.anisotropy = 1.0; gauge_param.type = QUDA_WILSON_LINKS; gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER; gauge_param.cpu_prec = cpu_prec; gauge_param.cuda_prec = cuda_prec; gauge_param.reconstruct = 18; gauge_param.cuda_prec_sloppy = cuda_prec_sloppy; gauge_param.reconstruct_sloppy = 18; gauge_param.cuda_prec_precondition = cuda_prec_precondition; gauge_param.reconstruct_precondition = 18; gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO; inv_param.dagger = QUDA_DAG_NO; inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION; inv_param.solver_normalization = QUDA_DEFAULT_NORMALIZATION; inv_param.pipeline = 0; inv_param.gcrNkrylov = 10; // require both L2 relative and heavy quark residual to determine convergence // inv_param.residual_type = (QudaResidualType)(QUDA_L2_RELATIVE_RESIDUAL | QUDA_HEAVY_QUARK_RESIDUAL); inv_param.tol_hq = 
1.0;//1e-3; // specify a tolerance for the residual for heavy quark residual inv_param.reliable_delta = 1e-2; // ignored by multi-shift solver // domain decomposition preconditioner parameters inv_param.inv_type_precondition = QUDA_CG_INVERTER; inv_param.schwarz_type = QUDA_ADDITIVE_SCHWARZ; inv_param.precondition_cycle = 1; inv_param.tol_precondition = 1e-1; inv_param.maxiter_precondition = 10; inv_param.verbosity_precondition = QUDA_SILENT; inv_param.cuda_prec_precondition = cuda_prec_precondition; inv_param.omega = 1.0; inv_param.cpu_prec = cpu_prec; inv_param.cuda_prec = cuda_prec; inv_param.cuda_prec_sloppy = cuda_prec_sloppy; inv_param.clover_cpu_prec = cpu_prec; inv_param.clover_cuda_prec = cuda_prec; inv_param.clover_cuda_prec_sloppy = cuda_prec_sloppy; inv_param.clover_cuda_prec_precondition = cuda_prec_precondition; inv_param.preserve_source = QUDA_PRESERVE_SOURCE_YES; inv_param.gamma_basis = QUDA_CHIRAL_GAMMA_BASIS; inv_param.dirac_order = QUDA_DIRAC_ORDER; inv_param.input_location = QUDA_CPU_FIELD_LOCATION; inv_param.output_location = QUDA_CPU_FIELD_LOCATION; inv_param.tune = tune ? 
QUDA_TUNE_YES : QUDA_TUNE_NO; gauge_param.ga_pad = 0; // 24*24*24/2; inv_param.sp_pad = 0; // 24*24*24/2; inv_param.cl_pad = 0; // 24*24*24/2; // For multi-GPU, ga_pad must be large enough to store a time-slice int x_face_size = gauge_param.X[1]*gauge_param.X[2]*gauge_param.X[3]/2; int y_face_size = gauge_param.X[0]*gauge_param.X[2]*gauge_param.X[3]/2; int z_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[3]/2; int t_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[2]/2; int pad_size =MAX(x_face_size, y_face_size); pad_size = MAX(pad_size, z_face_size); pad_size = MAX(pad_size, t_face_size); gauge_param.ga_pad = pad_size; // solver verbosity if( g_debug_level == 0 ) inv_param.verbosity = QUDA_SILENT; else if( g_debug_level == 1 ) inv_param.verbosity = QUDA_SUMMARIZE; else inv_param.verbosity = QUDA_VERBOSE; // general verbosity setVerbosityQuda( QUDA_SUMMARIZE, "# QUDA: ", stdout); // declare the grid mapping used for communications in a multi-GPU grid #if USE_LZ_LY_LX_T int grid[4] = {g_nproc_z, g_nproc_y, g_nproc_x, g_nproc_t}; #else int grid[4] = {g_nproc_x, g_nproc_y, g_nproc_z, g_nproc_t}; #endif initCommsGridQuda(4, grid, commsMap, NULL); // alloc gauge_quda size_t gSize = (gauge_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float); for (int dir = 0; dir < 4; dir++) { gauge_quda[dir] = (double*) malloc(VOLUME*18*gSize); if(gauge_quda[dir] == NULL) { fprintf(stderr, "_initQuda: malloc for gauge_quda[dir] failed"); exit(-2); } } // alloc space for a temp. spinor, used throughout this module tempSpinor = (double*)malloc( 2*VOLUME*24*sizeof(double) ); /* factor 2 for doublet */ if(tempSpinor == NULL) { fprintf(stderr, "_initQuda: malloc for tempSpinor failed"); exit(-2); } // initialize the QUDA library #ifdef MPI initQuda(-1); //sets device numbers automatically #else initQuda(0); //scalar build: use device 0 #endif quda_initialized = 1; }
void init() { initQuda(device); gaugeParam = newQudaGaugeParam(); inv_param = newQudaInvertParam(); gaugeParam.X[0] = X[0] = xdim; gaugeParam.X[1] = X[1] = ydim; gaugeParam.X[2] = X[2] = zdim; gaugeParam.X[3] = X[3] = tdim; setDims(gaugeParam.X); setSpinorSiteSize(6); gaugeParam.cpu_prec = QUDA_DOUBLE_PRECISION; gaugeParam.cuda_prec = prec; gaugeParam.reconstruct = link_recon; gaugeParam.reconstruct_sloppy = gaugeParam.reconstruct; gaugeParam.cuda_prec_sloppy = gaugeParam.cuda_prec; gaugeParam.anisotropy = 1.0; gaugeParam.tadpole_coeff = 0.8; gaugeParam.gauge_order = QUDA_QDP_GAUGE_ORDER; gaugeParam.t_boundary = QUDA_ANTI_PERIODIC_T; gaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO; gaugeParam.gaugeGiB = 0; inv_param.cpu_prec = QUDA_DOUBLE_PRECISION; inv_param.cuda_prec = prec; inv_param.dirac_order = QUDA_DIRAC_ORDER; inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; inv_param.dagger = dagger; inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; inv_param.dslash_type = QUDA_ASQTAD_DSLASH; inv_param.input_location = QUDA_CPU_FIELD_LOCATION; inv_param.output_location = QUDA_CPU_FIELD_LOCATION; int tmpint = MAX(X[1]*X[2]*X[3], X[0]*X[2]*X[3]); tmpint = MAX(tmpint, X[0]*X[1]*X[3]); tmpint = MAX(tmpint, X[0]*X[1]*X[2]); gaugeParam.ga_pad = tmpint; inv_param.sp_pad = tmpint; ColorSpinorParam csParam; csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION; csParam.nColor=3; csParam.nSpin=1; csParam.nDim=4; for(int d = 0; d < 4; d++) { csParam.x[d] = gaugeParam.X[d]; } csParam.precision = inv_param.cpu_prec; csParam.pad = 0; if (test_type < 2) { inv_param.solution_type = QUDA_MATPC_SOLUTION; csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; csParam.x[0] /= 2; } else { inv_param.solution_type = QUDA_MAT_SOLUTION; csParam.siteSubset = QUDA_FULL_SITE_SUBSET; } csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; csParam.gammaBasis = inv_param.gamma_basis; // this parameter is meaningless for staggered csParam.create = QUDA_ZERO_FIELD_CREATE; 
spinor = new cpuColorSpinorField(csParam); spinorOut = new cpuColorSpinorField(csParam); spinorRef = new cpuColorSpinorField(csParam); csParam.siteSubset = QUDA_FULL_SITE_SUBSET; csParam.x[0] = gaugeParam.X[0]; printfQuda("Randomizing fields ...\n"); spinor->Source(QUDA_RANDOM_SOURCE); size_t gSize = (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float); for (int dir = 0; dir < 4; dir++) { fatlink[dir] = malloc(V*gaugeSiteSize*gSize); longlink[dir] = malloc(V*gaugeSiteSize*gSize); } if (fatlink == NULL || longlink == NULL){ errorQuda("ERROR: malloc failed for fatlink/longlink"); } construct_fat_long_gauge_field(fatlink, longlink, 1, gaugeParam.cpu_prec, &gaugeParam); #ifdef MULTI_GPU gaugeParam.type = QUDA_ASQTAD_FAT_LINKS; gaugeParam.reconstruct = QUDA_RECONSTRUCT_NO; GaugeFieldParam cpuFatParam(fatlink, gaugeParam); cpuFat = new cpuGaugeField(cpuFatParam); cpuFat->exchangeGhost(); ghost_fatlink = cpuFat->Ghost(); gaugeParam.type = QUDA_ASQTAD_LONG_LINKS; GaugeFieldParam cpuLongParam(longlink, gaugeParam); cpuLong = new cpuGaugeField(cpuLongParam); cpuLong->exchangeGhost(); ghost_longlink = cpuLong->Ghost(); int x_face_size = X[1]*X[2]*X[3]/2; int y_face_size = X[0]*X[2]*X[3]/2; int z_face_size = X[0]*X[1]*X[3]/2; int t_face_size = X[0]*X[1]*X[2]/2; int pad_size =MAX(x_face_size, y_face_size); pad_size = MAX(pad_size, z_face_size); pad_size = MAX(pad_size, t_face_size); gaugeParam.ga_pad = pad_size; #endif gaugeParam.type = QUDA_ASQTAD_FAT_LINKS; gaugeParam.reconstruct = gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; printfQuda("Fat links sending..."); loadGaugeQuda(fatlink, &gaugeParam); printfQuda("Fat links sent\n"); gaugeParam.type = QUDA_ASQTAD_LONG_LINKS; #ifdef MULTI_GPU gaugeParam.ga_pad = 3*pad_size; #endif gaugeParam.reconstruct = gaugeParam.reconstruct_sloppy = link_recon; printfQuda("Long links sending..."); loadGaugeQuda(longlink, &gaugeParam); printfQuda("Long links sent...\n"); printfQuda("Sending fields to GPU..."); 
if (!transfer) { //csParam.verbose = QUDA_DEBUG_VERBOSE; csParam.fieldLocation = QUDA_CUDA_FIELD_LOCATION; csParam.fieldOrder = QUDA_FLOAT2_FIELD_ORDER; csParam.pad = inv_param.sp_pad; csParam.precision = inv_param.cuda_prec; if (test_type < 2){ csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; csParam.x[0] /=2; } printfQuda("Creating cudaSpinor\n"); cudaSpinor = new cudaColorSpinorField(csParam); printfQuda("Creating cudaSpinorOut\n"); cudaSpinorOut = new cudaColorSpinorField(csParam); printfQuda("Sending spinor field to GPU\n"); *cudaSpinor = *spinor; cudaDeviceSynchronize(); checkCudaError(); double spinor_norm2 = norm2(*spinor); double cuda_spinor_norm2= norm2(*cudaSpinor); printfQuda("Source CPU = %f, CUDA=%f\n", spinor_norm2, cuda_spinor_norm2); if(test_type == 2){ csParam.x[0] /=2; } csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; tmp = new cudaColorSpinorField(csParam); bool pc = (test_type != 2); DiracParam diracParam; setDiracParam(diracParam, &inv_param, pc); diracParam.verbose = QUDA_VERBOSE; diracParam.tmp1=tmp; dirac = Dirac::create(diracParam); } else { errorQuda("Error not suppported"); } return; }
int main(int argc, char *argv[]) { int seed = atoi(argv[1]); int device = atoi(argv[2]); initQuda(device); Start(&argc,&argv); DoArg do_arg; setup_do_arg(do_arg, seed, NSITES_3D, NSITES_T, BETA); GJP.Initialize(do_arg); //VRB.DeactivateAll(); GwilsonFclover lat; CommonArg c_arg; //Declare args for Gaussian Smearing QPropWGaussArg g_arg; g_arg.gauss_link_smear_type=GAUSS_LS_TYPE; //Link smearing g_arg.gauss_link_smear_coeff=GAUSS_LS_COEFF; //Link smearing g_arg.gauss_link_smear_N=GAUSS_LS_N; //Link smearing hits g_arg.gauss_N = GAUSS_N; //Source/Sink smearing hits g_arg.gauss_W = sqrt(KAPPA*4*g_arg.gauss_N); //Smearing parameter. char is_qu[5]; #ifdef QUENCH GhbArg ghb_arg; ghb_arg.num_iter = 1; AlgGheatBath hb(lat, &c_arg, &ghb_arg); strcpy(is_qu,"QUEN"); #else HmdArg hmd_arg; setup_hmd_arg(hmd_arg); AlgHmcPhi hmc(lat, &c_arg, &hmd_arg); strcpy(is_qu,"UNQU"); #endif int sweep_counter = 0; int total_updates = NTHERM + NSKIP*(NDATA-1); QPropWArg arg0; arg0.t=0; arg0.x=0; arg0.y=0; arg0.z=0; arg0.cg.mass = MASS; arg0.cg.stop_rsd = STOP_RSD; arg0.cg.max_num_iter = MAX_NUM_ITER; arg0.cg.Inverter = INVERTER_TYPE; arg0.cg.bicgstab_n = BICGSTAB_N; int x2[4]; WilsonMatrix t4; Float d0_t4t4c_re_tr = 0.0; int x2_idx = 0; int vol3d = pow(NSITES_3D,3); char lattice[256]; //lattice config file char file[256]; //output file ////////////////////// // Start simulation // ////////////////////// while (sweep_counter < total_updates) { for (int n = 1; n <= NSKIP; n++) { #ifdef READ //do nothing #else #ifdef QUENCH hb.run(); #else hmc.run(); #endif #endif sweep_counter++; if (!UniqueID()) { printf("step %d complete\n",sweep_counter); fflush(stdout); } } if (sweep_counter == NTHERM) printf("thermalization complete. \n"); if (sweep_counter >= NTHERM) { // Use this code to specify a gauge configuration. 
#ifdef QUENCH sprintf(lattice, LATT_PATH"QU/lat_hb_B%.2f_%d-%d_%d.dat", BETA, NSITES_3D, NSITES_T, sweep_counter); #else sprintf(lattice, LATT_PATH"UNQ/lat_hmc_B%.2f_M%.3f_%d-%d_%d.dat", BETA, NSITES_3D, NSITES_T, sweep_counter); #endif #ifdef READ ReadLatticeParallel(lat,lattice); #else WriteLatticeParallel(lat,lattice); #endif gaugecounter = 1; // Get Point Source Propagator // This will place a unit wall source t plane set at the coordinates // specified by arg0, modulated by a phase set by P. It will then be // smeared using the parameters specified by g_arg. //Set the momentum phase. int P[3] = {P1,P2,P3}; //Smear the source using the parameters set by g_arg. QPropWMomSrcSmeared qprop0(lat, &arg0, P, &g_arg, &c_arg); // Smear the sink with the same g_arg parameters. qprop0.GaussSmearSinkProp(g_arg); //Sum over x2 for (x2[3]=0; x2[3]<GJP.TnodeSites(); x2[3]++) { //Reinitialise trace d0_t4t4c_re_tr *= 0.0; for (x2[2]=0; x2[2]<GJP.ZnodeSites(); x2[2]++) for (x2[1]=0; x2[1]<GJP.YnodeSites(); x2[1]++) for (x2[0]=0; x2[0]<GJP.XnodeSites(); x2[0]++) { x2_idx = lat.GsiteOffset(x2)/4; //Get propagator sinked at x2. t4 = qprop0[x2_idx]; //Get the real part of the trace. d0_t4t4c_re_tr += MMDag_re_tr(t4); } ////////////////////////// // Write trace to file. // ////////////////////////// //Write data file so that the data can be reproduced from the name of the file. sprintf(file, DATAPATH"MOM_%d%d%d_GPU_%d_B%.2f_M%.3f_N%d_W%.3f_n%d_xi%.2f_1pion_%s_stout_%d-%d.dat", P[0], P[1], P[2], seed, BETA, MASS, g_arg.gauss_N, g_arg.gauss_W, g_arg.gauss_link_smear_N, g_arg.gauss_link_smear_coeff, is_qu, NSITES_3D, NSITES_T); FILE *t4tr=Fopen(file,"a"); Fprintf(t4tr,"%d %d %d %.16e\n", sweep_counter, x2[3], 0, d0_t4t4c_re_tr); Fclose(t4tr); cout<<"time slice = "<<x2[3]<<" complete."<<endl; ////////////////////////////////////////// // End trace summation at time slice t. 
// ////////////////////////////////////////// } } } //////////////////// // End simulation // //////////////////// //End(); endQuda(); return 0; }
int main(int argc, char **argv) { int c, i, mu, status; int ispin, icol, isc; int n_c = 3; int n_s = 4; int count = 0; int filename_set = 0; int dims[4] = {0,0,0,0}; int grid_size[4]; int l_LX_at, l_LXstart_at; int x0, x1, x2, x3, ix, iix, iy, is, it, i3; int sl0, sl1, sl2, sl3, have_source_flag=0; int source_proc_coords[4], lsl0, lsl1, lsl2, lsl3; int check_residuum = 0; unsigned int VOL3, V5; int do_gt = 0; int full_orbit = 0; int smear_source = 0; char filename[200], source_filename[200], source_filename_write[200]; double ratime, retime; double plaq_r=0., plaq_m=0., norm, norm2; double spinor1[24]; double *gauge_qdp[4], *gauge_field_timeslice=NULL, *gauge_field_smeared=NULL; double _1_2_kappa, _2_kappa, phase; FILE *ofs; int mu_trans[4] = {3, 0, 1, 2}; int threadid, nthreads; int timeslice, source_timeslice; char rng_file_in[100], rng_file_out[100]; int *source_momentum=NULL; int source_momentum_class = -1; int source_momentum_no = 0; int source_momentum_runs = 1; int imom; int num_gpu_on_node=0, rank; int source_location_5d_iseven; int convert_sign=0; #ifdef HAVE_QUDA int rotate_gamma_basis = 1; #else int rotate_gamma_basis = 0; #endif omp_lock_t *lck = NULL, gen_lck[1]; int key = 0; /****************************************************************************/ /* for smearing parallel to inversion */ double *smearing_spinor_field[] = {NULL,NULL}; int dummy_flag = 0; /****************************************************************************/ /****************************************************************************/ #if (defined HAVE_QUDA) && (defined MULTI_GPU) int x_face_size, y_face_size, z_face_size, t_face_size, pad_size; #endif /****************************************************************************/ /************************************************/ int qlatt_nclass; int *qlatt_id=NULL, *qlatt_count=NULL, **qlatt_rep=NULL, **qlatt_map=NULL; double **qlatt_list=NULL; /************************************************/ 
/************************************************/ double boundary_condition_factor; int boundary_condition_factor_set = 0; /************************************************/ //#ifdef MPI // kernelPackT = true; //#endif /*********************************************** * QUDA parameters ***********************************************/ #ifdef HAVE_QUDA QudaPrecision cpu_prec = QUDA_DOUBLE_PRECISION; QudaPrecision cuda_prec = QUDA_DOUBLE_PRECISION; QudaPrecision cuda_prec_sloppy = QUDA_SINGLE_PRECISION; QudaGaugeParam gauge_param = newQudaGaugeParam(); QudaInvertParam inv_param = newQudaInvertParam(); #endif while ((c = getopt(argc, argv, "soch?vgf:p:b:S:R:")) != -1) { switch (c) { case 'v': g_verbose = 1; break; case 'g': do_gt = 1; break; case 'f': strcpy(filename, optarg); filename_set=1; break; case 'c': check_residuum = 1; fprintf(stdout, "# [invert_dw_quda] will check residuum again\n"); break; case 'p': n_c = atoi(optarg); fprintf(stdout, "# [invert_dw_quda] will use number of colors = %d\n", n_c); break; case 'o': full_orbit = 1; fprintf(stdout, "# [invert_dw_quda] will invert for full orbit, if source momentum set\n"); case 's': smear_source = 1; fprintf(stdout, "# [invert_dw_quda] will smear the sources if they are read from file\n"); break; case 'b': boundary_condition_factor = atof(optarg); boundary_condition_factor_set = 1; fprintf(stdout, "# [invert_dw_quda] const. 
boundary condition factor set to %e\n", boundary_condition_factor); break; case 'S': convert_sign = atoi(optarg); fprintf(stdout, "# [invert_dw_quda] using convert sign %d\n", convert_sign); break; case 'R': rotate_gamma_basis = atoi(optarg); fprintf(stdout, "# [invert_dw_quda] rotate gamma basis %d\n", rotate_gamma_basis); break; case 'h': case '?': default: usage(); break; } } // get the time stamp g_the_time = time(NULL); /************************************** * set the default values, read input **************************************/ if(filename_set==0) strcpy(filename, "cvc.input"); if(g_proc_id==0) fprintf(stdout, "# Reading input from file %s\n", filename); read_input_parser(filename); #ifdef MPI #ifdef HAVE_QUDA grid_size[0] = g_nproc_x; grid_size[1] = g_nproc_y; grid_size[2] = g_nproc_z; grid_size[3] = g_nproc_t; fprintf(stdout, "# [] g_nproc = (%d,%d,%d,%d)\n", g_nproc_x, g_nproc_y, g_nproc_z, g_nproc_t); initCommsQuda(argc, argv, grid_size, 4); #else MPI_Init(&argc, &argv); #endif #endif #if (defined PARALLELTX) || (defined PARALLELTXY) EXIT_WITH_MSG(1, "[] Error, 2-dim./3-dim. MPI-Version not yet implemented"); #endif // some checks on the input data if((T_global == 0) || (LX==0) || (LY==0) || (LZ==0)) { if(g_proc_id==0) fprintf(stderr, "[invert_dw_quda] Error, T and L's must be set\n"); usage(); } // set number of openmp threads // initialize MPI parameters mpi_init(argc, argv); // the volume of a timeslice VOL3 = LX*LY*LZ; V5 = T*LX*LY*LZ*L5; g_kappa5d = 0.5 / (5. 
+ g_m5); if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] kappa5d = %e\n", g_kappa5d); fprintf(stdout, "# [%2d] parameters:\n"\ "# [%2d] T = %3d\n"\ "# [%2d] Tstart = %3d\n"\ "# [%2d] L5 = %3d\n",\ g_cart_id, g_cart_id, T, g_cart_id, Tstart, g_cart_id, L5); #ifdef MPI if(T==0) { fprintf(stderr, "[%2d] local T is zero; exit\n", g_cart_id); MPI_Abort(MPI_COMM_WORLD, 1); MPI_Finalize(); exit(2); } #endif if(init_geometry() != 0) { fprintf(stderr, "[invert_dw_quda] Error from init_geometry\n"); EXIT(1); } geometry(); if( init_geometry_5d() != 0 ) { fprintf(stderr, "[invert_dw_quda] Error from init_geometry_5d\n"); EXIT(2); } geometry_5d(); /************************************** * initialize the QUDA library **************************************/ if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] initializing quda\n"); #ifdef HAVE_QUDA // cudaGetDeviceCount(&num_gpu_on_node); if(g_gpu_per_node<0) { if(g_cart_id==0) fprintf(stderr, "[] Error, number of GPUs per node not set\n"); EXIT(106); } else { num_gpu_on_node = g_gpu_per_node; } #ifdef MPI rank = comm_rank(); #else rank = 0; #endif g_gpu_device_number = rank % num_gpu_on_node; fprintf(stdout, "# [] process %d/%d uses device %d\n", rank, g_cart_id, g_gpu_device_number); initQuda(g_gpu_device_number); #endif /************************************** * prepare the gauge field **************************************/ // read the gauge field from file alloc_gauge_field(&g_gauge_field, VOLUMEPLUSRAND); if(strcmp( gaugefilename_prefix, "identity")==0 ) { if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] Setting up unit gauge field\n"); for(ix=0;ix<VOLUME; ix++) { for(mu=0;mu<4;mu++) { _cm_eq_id(g_gauge_field+_GGI(ix,mu)); } } } else if(strcmp( gaugefilename_prefix, "random")==0 ) { if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] Setting up random gauge field with seed = %d\n", g_seed); init_rng_state(g_seed, &g_rng_state); random_gauge_field(g_gauge_field, 1.); plaquette(&plaq_m); sprintf(filename, "%s.%.4d", 
gaugefilename_prefix, Nconf); check_error(write_lime_gauge_field(filename, plaq_m, Nconf, 64), "write_lime_gauge_field", NULL, 12); } else { if(g_gauge_file_format == 0) { // ILDG sprintf(filename, "%s.%.4d", gaugefilename_prefix, Nconf); if(g_cart_id==0) fprintf(stdout, "# Reading gauge field from file %s\n", filename); status = read_lime_gauge_field_doubleprec(filename); } else if(g_gauge_file_format == 1) { // NERSC sprintf(filename, "%s.%.5d", gaugefilename_prefix, Nconf); if(g_cart_id==0) fprintf(stdout, "# Reading gauge field from file %s\n", filename); status = read_nersc_gauge_field(g_gauge_field, filename, &plaq_r); //status = read_nersc_gauge_field_3x3(g_gauge_field, filename, &plaq_r); } if(status != 0) { fprintf(stderr, "[invert_dw_quda] Error, could not read gauge field"); EXIT(12); } } #ifdef MPI xchange_gauge(); #endif // measure the plaquette plaquette(&plaq_m); if(g_cart_id==0) fprintf(stdout, "# Measured plaquette value: %25.16e\n", plaq_m); if(g_cart_id==0) fprintf(stdout, "# Read plaquette value : %25.16e\n", plaq_r); #ifndef HAVE_QUDA if(N_Jacobi>0) { #endif // allocate the smeared / qdp ordered gauge field alloc_gauge_field(&gauge_field_smeared, VOLUMEPLUSRAND); for(i=0;i<4;i++) { gauge_qdp[i] = gauge_field_smeared + i*18*VOLUME; } #ifndef HAVE_QUDA } #endif #ifdef HAVE_QUDA // transcribe the gauge field omp_set_num_threads(g_num_threads); #pragma omp parallel for private(ix,iy,mu) for(ix=0;ix<VOLUME;ix++) { iy = g_lexic2eot[ix]; for(mu=0;mu<4;mu++) { _cm_eq_cm(gauge_qdp[mu_trans[mu]]+18*iy, g_gauge_field+_GGI(ix,mu)); } } // multiply timeslice T-1 with factor of -1 (antiperiodic boundary condition) if(g_proc_coords[0]==g_nproc_t-1) { if(!boundary_condition_factor_set) boundary_condition_factor = -1.; fprintf(stdout, "# [] process %d multiplies gauge-field timeslice T_global-1 with boundary condition factor %e\n", g_cart_id, boundary_condition_factor); omp_set_num_threads(g_num_threads); #pragma omp parallel for private(ix,iy) 
for(ix=0;ix<VOL3;ix++) { iix = (T-1)*VOL3 + ix; iy = g_lexic2eot[iix]; _cm_ti_eq_re(gauge_qdp[mu_trans[0]]+18*iy, -1.); } } // QUDA precision parameters switch(g_cpu_prec) { case 0: cpu_prec = QUDA_HALF_PRECISION; if(g_cart_id==0) fprintf(stdout, "# [] CPU prec = half\n"); break; case 1: cpu_prec = QUDA_SINGLE_PRECISION; if(g_cart_id==0) fprintf(stdout, "# [] CPU prec = single\n"); break; case 2: cpu_prec = QUDA_DOUBLE_PRECISION; if(g_cart_id==0) fprintf(stdout, "# [] CPU prec = double\n"); break; default: cpu_prec = QUDA_DOUBLE_PRECISION; break; } switch(g_gpu_prec) { case 0: cuda_prec = QUDA_HALF_PRECISION; if(g_cart_id==0) fprintf(stdout, "# [] GPU prec = half\n"); break; case 1: cuda_prec = QUDA_SINGLE_PRECISION; if(g_cart_id==0) fprintf(stdout, "# [] GPU prec = single\n"); break; case 2: cuda_prec = QUDA_DOUBLE_PRECISION; if(g_cart_id==0) fprintf(stdout, "# [] GPU prec = double\n"); break; default: cuda_prec = QUDA_DOUBLE_PRECISION; break; } switch(g_gpu_prec_sloppy) { case 0: cuda_prec_sloppy = QUDA_HALF_PRECISION; if(g_cart_id==0) fprintf(stdout, "# [] GPU sloppy prec = half\n"); break; case 1: cuda_prec_sloppy = QUDA_SINGLE_PRECISION; if(g_cart_id==0) fprintf(stdout, "# [] GPU sloppy prec = single\n"); break; case 2: cuda_prec_sloppy = QUDA_DOUBLE_PRECISION; if(g_cart_id==0) fprintf(stdout, "# [] GPU sloppy prec = double\n"); break; default: cuda_prec_sloppy = QUDA_SINGLE_PRECISION; break; } // QUDA gauge parameters gauge_param.X[0] = LX; gauge_param.X[1] = LY; gauge_param.X[2] = LZ; gauge_param.X[3] = T; inv_param.Ls = L5; gauge_param.anisotropy = 1.0; gauge_param.type = QUDA_WILSON_LINKS; gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER; gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T; gauge_param.cpu_prec = cpu_prec; gauge_param.cuda_prec = cuda_prec; gauge_param.reconstruct = QUDA_RECONSTRUCT_12; gauge_param.cuda_prec_sloppy = cuda_prec_sloppy; gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_12; gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO; 
gauge_param.ga_pad = 0; inv_param.sp_pad = 0; inv_param.cl_pad = 0; // For multi-GPU, ga_pad must be large enough to store a time-slice #ifdef MULTI_GPU x_face_size = inv_param.Ls * gauge_param.X[1]*gauge_param.X[2]*gauge_param.X[3]/2; y_face_size = inv_param.Ls * gauge_param.X[0]*gauge_param.X[2]*gauge_param.X[3]/2; z_face_size = inv_param.Ls * gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[3]/2; t_face_size = inv_param.Ls * gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[2]/2; pad_size = _MAX(x_face_size, y_face_size); pad_size = _MAX(pad_size, z_face_size); pad_size = _MAX(pad_size, t_face_size); gauge_param.ga_pad = pad_size; if(g_cart_id==0) printf("# [invert_dw_quda] pad_size = %d\n", pad_size); #endif // load the gauge field if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] loading gauge field\n"); loadGaugeQuda((void*)gauge_qdp, &gauge_param); gauge_qdp[0] = NULL; gauge_qdp[1] = NULL; gauge_qdp[2] = NULL; gauge_qdp[3] = NULL; #endif /********************************************* * APE smear the gauge field *********************************************/ if(N_Jacobi>0) { memcpy(gauge_field_smeared, g_gauge_field, 72*VOLUMEPLUSRAND*sizeof(double)); fprintf(stdout, "# [invert_dw_quda] APE smearing gauge field with paramters N_APE=%d, alpha_APE=%e\n", N_ape, alpha_ape); APE_Smearing_Step_threads(gauge_field_smeared, N_ape, alpha_ape); xchange_gauge_field(gauge_field_smeared); } // allocate memory for the spinor fields #ifdef HAVE_QUDA no_fields = 3+2; #else no_fields = 6+2; #endif g_spinor_field = (double**)calloc(no_fields, sizeof(double*)); for(i=0; i<no_fields; i++) alloc_spinor_field(&g_spinor_field[i], VOLUMEPLUSRAND*L5); smearing_spinor_field[0] = g_spinor_field[no_fields-2]; smearing_spinor_field[1] = g_spinor_field[no_fields-1]; switch(g_source_type) { case 0: case 5: // the source locaton sl0 = g_source_location / (LX_global*LY_global*LZ); sl1 = ( g_source_location % (LX_global*LY_global*LZ) ) / ( LY_global*LZ); sl2 = ( g_source_location % ( 
LY_global*LZ) ) / ( LZ); sl3 = g_source_location % LZ; if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] global sl = (%d, %d, %d, %d)\n", sl0, sl1, sl2, sl3); source_proc_coords[0] = sl0 / T; source_proc_coords[1] = sl1 / LX; source_proc_coords[2] = sl2 / LY; source_proc_coords[3] = sl3 / LZ; #ifdef MPI MPI_Cart_rank(g_cart_grid, source_proc_coords, &g_source_proc_id); #else g_source_proc_id = 0; #endif have_source_flag = g_source_proc_id == g_cart_id; lsl0 = sl0 % T; lsl1 = sl1 % LX; lsl2 = sl2 % LY; lsl3 = sl3 % LZ; if(have_source_flag) { fprintf(stdout, "# [invert_dw_quda] process %d has the source at (%d, %d, %d, %d)\n", g_cart_id, lsl0, lsl1, lsl2, lsl3); } break; case 2: case 3: case 4: // the source timeslice #ifdef MPI source_proc_coords[0] = g_source_timeslice / T; source_proc_coords[1] = 0; source_proc_coords[2] = 0; source_proc_coords[3] = 0; MPI_Cart_rank(g_cart_grid, source_proc_coords, &g_source_proc_id); have_source_flag = ( g_source_proc_id == g_cart_id ); source_timeslice = have_source_flag ? 
g_source_timeslice % T : -1; #else g_source_proc_id = 0; have_source_flag = 1; source_timeslice = g_source_timeslice; #endif break; } #ifdef HAVE_QUDA /************************************************************* * QUDA inverter parameters *************************************************************/ inv_param.dslash_type = QUDA_DOMAIN_WALL_DSLASH; if(strcmp(g_inverter_type_name, "cg") == 0) { inv_param.inv_type = QUDA_CG_INVERTER; if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] using cg inverter\n"); } else if(strcmp(g_inverter_type_name, "bicgstab") == 0) { inv_param.inv_type = QUDA_BICGSTAB_INVERTER; if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] using bicgstab inverter\n"); #ifdef MULTI_GPU } else if(strcmp(g_inverter_type_name, "gcr") == 0) { inv_param.inv_type = QUDA_GCR_INVERTER; if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] using gcr inverter\n"); #endif } else { if(g_cart_id==0) fprintf(stderr, "[invert_dw_quda] Error, unrecognized inverter type %s\n", g_inverter_type_name); EXIT(123); } if(inv_param.inv_type == QUDA_CG_INVERTER) { inv_param.solution_type = QUDA_MAT_SOLUTION; inv_param.solve_type = QUDA_NORMEQ_PC_SOLVE; } else if(inv_param.inv_type == QUDA_BICGSTAB_INVERTER) { inv_param.solution_type = QUDA_MAT_SOLUTION; inv_param.solve_type = QUDA_DIRECT_PC_SOLVE; } else { inv_param.solution_type = QUDA_MATPC_SOLUTION; inv_param.solve_type = QUDA_DIRECT_PC_SOLVE; } inv_param.m5 = g_m5; inv_param.kappa = 0.5 / (5. 
+ inv_param.m5); inv_param.mass = g_m0; inv_param.tol = solver_precision; inv_param.maxiter = niter_max; inv_param.reliable_delta = reliable_delta; #ifdef MPI // domain decomposition preconditioner parameters if(inv_param.inv_type == QUDA_GCR_INVERTER) { if(g_cart_id == 0) printf("# [] settup DD parameters\n"); inv_param.gcrNkrylov = 15; inv_param.inv_type_precondition = QUDA_MR_INVERTER; inv_param.tol_precondition = 1e-6; inv_param.maxiter_precondition = 200; inv_param.verbosity_precondition = QUDA_VERBOSE; inv_param.prec_precondition = cuda_prec_sloppy; inv_param.omega = 0.7; } #endif inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; inv_param.dagger = QUDA_DAG_NO; inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION; //;QUDA_MASS_NORMALIZATION; inv_param.cpu_prec = cpu_prec; inv_param.cuda_prec = cuda_prec; inv_param.cuda_prec_sloppy = cuda_prec_sloppy; inv_param.verbosity = QUDA_VERBOSE; inv_param.preserve_source = QUDA_PRESERVE_SOURCE_NO; inv_param.dirac_order = QUDA_DIRAC_ORDER; #ifdef MPI inv_param.preserve_dirac = QUDA_PRESERVE_DIRAC_YES; inv_param.prec_precondition = cuda_prec_sloppy; inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; inv_param.dirac_tune = QUDA_TUNE_NO; #endif #endif /******************************************* * write initial rng state to file *******************************************/ if( g_source_type==2 && g_coherent_source==2 ) { sprintf(rng_file_out, "%s.0", g_rng_filename); status = init_rng_stat_file (g_seed, rng_file_out); if( status != 0 ) { fprintf(stderr, "[invert_dw_quda] Error, could not write rng status\n"); EXIT(210); } } else if( (g_source_type==2 /*&& g_coherent_source==1*/) || g_source_type==3 || g_source_type==4) { if( init_rng_state(g_seed, &g_rng_state) != 0 ) { fprintf(stderr, "[invert_dw_quda] Error, could initialize rng state\n"); EXIT(211); } } /******************************************* * prepare locks for openmp *******************************************/ nthreads = g_num_threads - 1; lck = 
(omp_lock_t*)malloc(nthreads * sizeof(omp_lock_t)); if(lck == NULL) { EXIT_WITH_MSG(97, "[invert_dw_quda] Error, could not allocate lck\n"); } // init locks for(i=0;i<nthreads;i++) { omp_init_lock(lck+i); } omp_init_lock(gen_lck); // check the source momenta if(g_source_momentum_set) { source_momentum = (int*)malloc(3*sizeof(int)); if(g_source_momentum[0]<0) g_source_momentum[0] += LX_global; if(g_source_momentum[1]<0) g_source_momentum[1] += LY_global; if(g_source_momentum[2]<0) g_source_momentum[2] += LZ_global; fprintf(stdout, "# [invert_dw_quda] using final source momentum ( %d, %d, %d )\n", g_source_momentum[0], g_source_momentum[1], g_source_momentum[2]); if(full_orbit) { status = make_qcont_orbits_3d_parity_avg( &qlatt_id, &qlatt_count, &qlatt_list, &qlatt_nclass, &qlatt_rep, &qlatt_map); if(status != 0) { if(g_cart_id==0) fprintf(stderr, "\n[invert_dw_quda] Error while creating O_3-lists\n"); EXIT(4); } source_momentum_class = qlatt_id[g_ipt[0][g_source_momentum[0]][g_source_momentum[1]][g_source_momentum[2]]]; source_momentum_no = qlatt_count[source_momentum_class]; source_momentum_runs = source_momentum_class==0 ? 
1 : source_momentum_no + 1; if(g_cart_id==0) fprintf(stdout, "# [] source momentum belongs to class %d with %d members, which means %d runs\n", source_momentum_class, source_momentum_no, source_momentum_runs); } } if(g_source_type == 5) { if(g_seq_source_momentum_set) { if(g_seq_source_momentum[0]<0) g_seq_source_momentum[0] += LX_global; if(g_seq_source_momentum[1]<0) g_seq_source_momentum[1] += LY_global; if(g_seq_source_momentum[2]<0) g_seq_source_momentum[2] += LZ_global; } else if(g_source_momentum_set) { g_seq_source_momentum[0] = g_source_momentum[0]; g_seq_source_momentum[1] = g_source_momentum[1]; g_seq_source_momentum[2] = g_source_momentum[2]; } fprintf(stdout, "# [invert_dw_quda] using final sequential source momentum ( %d, %d, %d )\n", g_seq_source_momentum[0], g_seq_source_momentum[1], g_seq_source_momentum[2]); } /*********************************************** * loop on spin-color-index ***********************************************/ for(isc=g_source_index[0]; isc<=g_source_index[1]; isc++) // for(isc=g_source_index[0]; isc<=g_source_index[0]; isc++) { ispin = isc / n_c; icol = isc % n_c; for(imom=0; imom<source_momentum_runs; imom++) { /*********************************************** * set source momentum ***********************************************/ if(g_source_momentum_set) { if(imom == 0) { if(full_orbit) { source_momentum[0] = 0; source_momentum[1] = 0; source_momentum[2] = 0; } else { source_momentum[0] = g_source_momentum[0]; source_momentum[1] = g_source_momentum[1]; source_momentum[2] = g_source_momentum[2]; } } else { source_momentum[0] = qlatt_map[source_momentum_class][imom-1] / (LY_global*LZ_global); source_momentum[1] = ( qlatt_map[source_momentum_class][imom-1] % (LY_global*LZ_global) ) / LZ_global; source_momentum[2] = qlatt_map[source_momentum_class][imom-1] % LZ_global; } if(g_cart_id==0) fprintf(stdout, "# [] run no. 
%d, source momentum (%d, %d, %d)\n", imom, source_momentum[0], source_momentum[1], source_momentum[2]); } /*********************************************** * prepare the souce ***********************************************/ if(g_read_source == 0) { // create source switch(g_source_type) { case 0: // point source if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] Creating point source\n"); for(ix=0;ix<L5*VOLUME;ix++) { _fv_eq_zero(g_spinor_field[0]+ix); } if(have_source_flag) { if(g_source_momentum_set) { phase = 2*M_PI*( source_momentum[0]*sl1/(double)LX_global + source_momentum[1]*sl2/(double)LY_global + source_momentum[2]*sl3/(double)LZ_global ); g_spinor_field[0][_GSI(g_ipt[lsl0][lsl1][lsl2][lsl3]) + 2*(n_c*ispin+icol) ] = cos(phase); g_spinor_field[0][_GSI(g_ipt[lsl0][lsl1][lsl2][lsl3]) + 2*(n_c*ispin+icol)+1] = sin(phase); } else { g_spinor_field[0][_GSI(g_ipt[lsl0][lsl1][lsl2][lsl3]) + 2*(n_c*ispin+icol) ] = 1.; } } if(g_source_momentum_set) { sprintf(source_filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d.qx%.2dqy%.2dqz%.2d", filename_prefix, Nconf, sl0, sl1, sl2, sl3, n_c*ispin+icol, source_momentum[0], source_momentum[1], source_momentum[2]); } else { sprintf(source_filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d", filename_prefix, Nconf, sl0, sl1, sl2, sl3, n_c*ispin+icol); } #ifdef HAVE_QUDA // set matpc_tpye source_location_5d_iseven = ( (g_iseven[g_ipt[lsl0][lsl1][lsl2][lsl3]] && ispin<n_s/2) || (!g_iseven[g_ipt[lsl0][lsl1][lsl2][lsl3]] && ispin>=n_s/2) ) ? 
1 : 0; if(source_location_5d_iseven) { inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] matpc type is MATPC_EVEN_EVEN\n"); } else { inv_param.matpc_type = QUDA_MATPC_ODD_ODD; if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] matpc type is MATPC_ODD_ODD\n"); } #endif break; case 2: // timeslice source if(g_coherent_source==1) { if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] Creating coherent timeslice source\n"); status = prepare_coherent_timeslice_source(g_spinor_field[0], gauge_field_smeared, g_coherent_source_base, g_coherent_source_delta, VOLUME, g_rng_state, 1); if(status != 0) { fprintf(stderr, "[invert_dw_quda] Error from prepare source, status was %d\n", status); #ifdef MPI MPI_Abort(MPI_COMM_WORLD, 123); MPI_Finalize(); #endif exit(123); } check_error(prepare_coherent_timeslice_source(g_spinor_field[0], gauge_field_smeared, g_coherent_source_base, g_coherent_source_delta, VOLUME, g_rng_state, 1), "prepare_coherent_timeslice_source", NULL, 123); timeslice = g_coherent_source_base; } else { if(g_coherent_source==2) { timeslice = (g_coherent_source_base+isc*g_coherent_source_delta)%T_global; fprintf(stdout, "# [invert_dw_quda] Creating timeslice source\n"); check_error(prepare_timeslice_source(g_spinor_field[0], gauge_field_smeared, timeslice, VOLUME, g_rng_state, 1), "prepare_timeslice_source", NULL, 123); } else { if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] Creating timeslice source\n"); check_error(prepare_timeslice_source(g_spinor_field[0], gauge_field_smeared, g_source_timeslice, VOLUME, g_rng_state, 1), "prepare_timeslice_source", NULL, 124); timeslice = g_source_timeslice; } } if(g_source_momentum_set) { sprintf(source_filename, "%s.%.4d.%.2d.%.5d.qx%.2dqy%.2dqz%.2d", filename_prefix, Nconf, timeslice, isc, source_momentum[0], source_momentum[1], source_momentum[2]); } else { sprintf(source_filename, "%s.%.4d.%.2d.%.5d", filename_prefix, Nconf, timeslice, isc); } break; case 3: // 
timeslice sources for one-end trick (spin dilution) fprintf(stdout, "# [invert_dw_quda] Creating timeslice source for one-end-trick\n"); check_error( prepare_timeslice_source_one_end(g_spinor_field[0], gauge_field_smeared, source_timeslice, source_momentum, isc%n_s, g_rng_state, \ ( isc%n_s==(n_s-1) && imom==source_momentum_runs-1 )), "prepare_timeslice_source_one_end", NULL, 125 ); c = N_Jacobi > 0 ? isc%n_s + n_s : isc%n_s; if(g_source_momentum_set) { sprintf(source_filename, "%s.%.4d.%.2d.%.2d.qx%.2dqy%.2dqz%.2d", filename_prefix, Nconf, g_source_timeslice, c, source_momentum[0], source_momentum[1], source_momentum[2]); } else { sprintf(source_filename, "%s.%.4d.%.2d.%.2d", filename_prefix, Nconf, g_source_timeslice, c); } break; case 4: // timeslice sources for one-end trick (spin and color dilution ) fprintf(stdout, "# [invert_dw_quda] Creating timeslice source for one-end-trick\n"); check_error(prepare_timeslice_source_one_end_color(g_spinor_field[0], gauge_field_smeared, source_timeslice, source_momentum,\ isc%(n_s*n_c), g_rng_state, ( isc%(n_s*n_c)==(n_s*n_c-1) && imom==source_momentum_runs-1 )), "prepare_timeslice_source_one_end_color", NULL, 126); c = N_Jacobi > 0 ? 
isc%(n_s*n_c) + (n_s*n_c) : isc%(n_s*n_c); if(g_source_momentum_set) { sprintf(source_filename, "%s.%.4d.%.2d.%.2d.qx%.2dqy%.2dqz%.2d", filename_prefix, Nconf, g_source_timeslice, c, source_momentum[0], source_momentum[1], source_momentum[2]); } else { sprintf(source_filename, "%s.%.4d.%.2d.%.2d", filename_prefix, Nconf, g_source_timeslice, c); } break; case 5: if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] preparing sequential point source\n"); check_error( prepare_sequential_point_source (g_spinor_field[0], isc, sl0, g_seq_source_momentum, smear_source, g_spinor_field[1], gauge_field_smeared), "prepare_sequential_point_source", NULL, 33); sprintf(source_filename, "%s.%.4d.t%.2dx%.2d.y%.2d.z%.2d.%.2d.qx%.2dqy%.2dqz%.2d", filename_prefix2, Nconf, sl0, sl1, sl2, sl3, isc, g_source_momentum[0], g_source_momentum[1], g_source_momentum[2]); break; default: fprintf(stderr, "\nError, unrecognized source type\n"); exit(32); break; } } else { // read source switch(g_source_type) { case 0: // point source if(g_source_momentum_set) { sprintf(source_filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d.qx%.2dqy%.2dqz%.2d", \ filename_prefix2, Nconf, sl0, sl1, sl2, sl3, isc, source_momentum[0], source_momentum[1], source_momentum[2]); } else { sprintf(source_filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d", filename_prefix2, Nconf, sl0, sl1, sl2, sl3, isc); } fprintf(stdout, "# [invert_dw_quda] reading source from file %s\n", source_filename); check_error(read_lime_spinor(g_spinor_field[0], source_filename, 0), "read_lime_spinor", NULL, 115); break; case 2: // timeslice source if(g_source_momentum_set) { sprintf(source_filename, "%s.%.4d.%.2d.%.5d.qx%.2dqy%.2dqz%.2d", filename_prefix2, Nconf, g_source_timeslice, isc, source_momentum[0], source_momentum[1], source_momentum[2]); } else { sprintf(source_filename, "%s.%.4d.%.2d.%.5d", filename_prefix2, Nconf, g_source_timeslice, isc); } fprintf(stdout, "# [invert_dw_quda] reading source from file %s\n", source_filename); 
check_error(read_lime_spinor(g_spinor_field[0], source_filename, 0), "read_lime_spinor", NULL, 115); break; default: check_error(1, "source type", NULL, 104); break; case -1: // timeslice source sprintf(source_filename, "%s", filename_prefix2); fprintf(stdout, "# [invert_dw_quda] reading source from file %s\n", source_filename); check_error(read_lime_spinor(g_spinor_field[0], source_filename, 0), "read_lime_spinor", NULL, 115); break; } } // of if g_read_source if(g_write_source) { check_error(write_propagator(g_spinor_field[0], source_filename, 0, g_propagator_precision), "write_propagator", NULL, 27); } /*********************************************************************************************** * here threads split: ***********************************************************************************************/ if(dummy_flag==0) strcpy(source_filename_write, source_filename); memcpy((void*)(smearing_spinor_field[0]), (void*)(g_spinor_field[0]), 24*VOLUME*sizeof(double)); if(dummy_flag>0) { // copy only if smearing has been done; otherwise do not copy, do not invert if(g_cart_id==0) fprintf(stdout, "# [] copy smearing field -> g field\n"); memcpy((void*)(g_spinor_field[0]), (void*)(smearing_spinor_field[1]), 24*VOLUME*sizeof(double)); } omp_set_num_threads(g_num_threads); #pragma omp parallel private(threadid, _2_kappa, is, ix, iy, iix, ratime, retime) shared(key,g_read_source, smear_source, N_Jacobi, kappa_Jacobi, smearing_spinor_field, g_spinor_field, nthreads, convert_sign, VOLUME, VOL3, T, L5, isc, rotate_gamma_basis, g_cart_id) firstprivate(inv_param, gauge_param, ofs) { threadid = omp_get_thread_num(); if(threadid < nthreads) { fprintf(stdout, "# [] proc%.2d thread%.2d starting source preparation\n", g_cart_id, threadid); // smearing if( ( !g_read_source || (g_read_source && smear_source ) ) && N_Jacobi > 0 ) { if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] smearing source with N_Jacobi=%d, kappa_Jacobi=%e\n", N_Jacobi, kappa_Jacobi); 
Jacobi_Smearing_threaded(gauge_field_smeared, smearing_spinor_field[0], smearing_spinor_field[1], kappa_Jacobi, N_Jacobi, threadid, nthreads); } /*********************************************** * create the 5-dim. source field ***********************************************/ if(convert_sign == 0) { spinor_4d_to_5d_threaded(smearing_spinor_field[0], smearing_spinor_field[0], threadid, nthreads); } else if(convert_sign == 1 || convert_sign == -1) { spinor_4d_to_5d_sign_threaded(smearing_spinor_field[0], smearing_spinor_field[0], convert_sign, threadid, nthreads); } for(is=0; is<L5; is++) { for(it=threadid; it<T; it+=nthreads) { memcpy((void*)(g_spinor_field[0]+_GSI(g_ipt_5d[is][it][0][0][0])), (void*)(smearing_spinor_field[0]+_GSI(g_ipt_5d[is][it][0][0][0])), VOL3*24*sizeof(double)); } } // reorder, multiply with g2 for(is=0; is<L5; is++) { for(it=threadid; it<T; it+=nthreads) { for(i3=0; i3<VOL3; i3++) { ix = (is*T+it)*VOL3 + i3; _fv_eq_zero(smearing_spinor_field[1]+_GSI(ix)); }}} if(rotate_gamma_basis) { for(it=threadid; it<T; it+=nthreads) { for(i3=0; i3<VOL3; i3++) { ix = it * VOL3 + i3; iy = lexic2eot_5d(0, ix); _fv_eq_gamma_ti_fv(smearing_spinor_field[1]+_GSI(iy), 2, smearing_spinor_field[0]+_GSI(ix)); }} for(it=threadid; it<T; it+=nthreads) { for(i3=0; i3<VOL3; i3++) { ix = it * VOL3 + i3; iy = lexic2eot_5d(L5-1, ix); _fv_eq_gamma_ti_fv(smearing_spinor_field[1]+_GSI(iy), 2, smearing_spinor_field[0]+_GSI(ix+(L5-1)*VOLUME)); }} } else { for(it=threadid; it<T; it+=nthreads) { for(i3=0; i3<VOL3; i3++) { ix = it * VOL3 + i3; iy = lexic2eot_5d(0, ix); _fv_eq_fv(smearing_spinor_field[1]+_GSI(iy), smearing_spinor_field[0]+_GSI(ix)); }} for(it=threadid; it<T; it+=nthreads) { for(i3=0; i3<VOL3; i3++) { ix = it * VOL3 + i3; iy = lexic2eot_5d(L5-1, ix); _fv_eq_fv(smearing_spinor_field[1]+_GSI(iy), smearing_spinor_field[0]+_GSI(ix+(L5-1)*VOLUME)); }} } fprintf(stdout, "# [] proc%.2d thread%.2d finished source preparation\n", g_cart_id, threadid); } else if(threadid == 
g_num_threads-1 && dummy_flag > 0) { // else branch on threadid fprintf(stdout, "# [] proc%.2d thread%.2d starting inversion for dummy_flag = %d\n", g_cart_id, threadid, dummy_flag); /*********************************************** * perform the inversion ***********************************************/ if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] starting inversion\n"); xchange_field_5d(g_spinor_field[0]); memset(g_spinor_field[1], 0, (VOLUME+RAND)*L5*24*sizeof(double)); ratime = CLOCK; #ifdef MPI if(inv_param.inv_type == QUDA_BICGSTAB_INVERTER || inv_param.inv_type == QUDA_GCR_INVERTER) { if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] calling invertQuda\n"); invertQuda(g_spinor_field[1], g_spinor_field[0], &inv_param); } else if(inv_param.inv_type == QUDA_CG_INVERTER) { if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] calling testCG\n"); testCG(g_spinor_field[1], g_spinor_field[0], &inv_param); } else { if(g_cart_id==0) fprintf(stderr, "# [invert_dw_quda] unrecognized inverter\n"); } #else invertQuda(g_spinor_field[1], g_spinor_field[0], &inv_param); #endif retime = CLOCK; if(g_cart_id==0) { fprintf(stdout, "# [invert_dw_quda] QUDA time: %e seconds\n", inv_param.secs); fprintf(stdout, "# [invert_dw_quda] QUDA Gflops: %e\n", inv_param.gflops/inv_param.secs); fprintf(stdout, "# [invert_dw_quda] wall time: %e seconds\n", retime-ratime); fprintf(stdout, "# [invert_dw_quda] Device memory used:\n\tSpinor: %f GiB\n\tGauge: %f GiB\n", inv_param.spinorGiB, gauge_param.gaugeGiB); } } // of if threadid // wait till all threads are here #pragma omp barrier if(inv_param.mass_normalization == QUDA_KAPPA_NORMALIZATION) { _2_kappa = 2. 
* g_kappa5d; for(ix=threadid; ix<VOLUME*L5;ix+=g_num_threads) { _fv_ti_eq_re(g_spinor_field[1]+_GSI(ix), _2_kappa ); } } #pragma omp barrier // reorder, multiply with g2 for(is=0;is<L5;is++) { for(ix=threadid; ix<VOLUME; ix+=g_num_threads) { iy = lexic2eot_5d(is, ix); iix = is*VOLUME + ix; _fv_eq_fv(g_spinor_field[0]+_GSI(iix), g_spinor_field[1]+_GSI(iy)); }} #pragma omp barrier if(rotate_gamma_basis) { for(ix=threadid; ix<VOLUME*L5; ix+=g_num_threads) { _fv_eq_gamma_ti_fv(g_spinor_field[1]+_GSI(ix), 2, g_spinor_field[0]+_GSI(ix)); } } else { for(ix=threadid; ix<VOLUME*L5;ix+=g_num_threads) { _fv_eq_fv(g_spinor_field[1]+_GSI(ix), g_spinor_field[0]+_GSI(ix)); } } if(g_cart_id==0 && threadid==g_num_threads-1) fprintf(stdout, "# [invert_dw_quda] inversion done in %e seconds\n", retime-ratime); #pragma omp single { #ifdef MPI xchange_field_5d(g_spinor_field[1]); #endif /*********************************************** * check residuum ***********************************************/ if(check_residuum && dummy_flag>0) { // apply the Wilson Dirac operator in the gamma-basis defined in cvc_linalg, // which uses the tmLQCD conventions (same as in contractions) // without explicit boundary conditions #ifdef MPI xchange_field_5d(g_spinor_field[2]); xchange_field_5d(g_spinor_field[1]); #endif memset(g_spinor_field[0], 0, 24*(VOLUME+RAND)*L5*sizeof(double)); //sprintf(filename, "%s.inverted.ascii.%.2d", source_filename, g_cart_id); //ofs = fopen(filename, "w"); //printf_spinor_field_5d(g_spinor_field[1], ofs); //fclose(ofs); Q_DW_Wilson_phi(g_spinor_field[0], g_spinor_field[1]); for(ix=0;ix<VOLUME*L5;ix++) { _fv_mi_eq_fv(g_spinor_field[0]+_GSI(ix), g_spinor_field[2]+_GSI(ix)); } spinor_scalar_product_re(&norm2, g_spinor_field[2], g_spinor_field[2], VOLUME*L5); spinor_scalar_product_re(&norm, g_spinor_field[0], g_spinor_field[0], VOLUME*L5); if(g_cart_id==0) fprintf(stdout, "\n# [invert_dw_quda] absolut residuum squared: %e; relative residuum %e\n", norm, sqrt(norm/norm2) ); } 
if(dummy_flag>0) { /*********************************************** * create 4-dim. propagator ***********************************************/ if(convert_sign == 0) { spinor_5d_to_4d(g_spinor_field[1], g_spinor_field[1]); } else if(convert_sign == -1 || convert_sign == +1) { spinor_5d_to_4d_sign(g_spinor_field[1], g_spinor_field[1], convert_sign); } /*********************************************** * write the solution ***********************************************/ sprintf(filename, "%s.inverted", source_filename_write); if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] writing propagator to file %s\n", filename); check_error(write_propagator(g_spinor_field[1], filename, 0, g_propagator_precision), "write_propagator", NULL, 22); //sprintf(filename, "prop.ascii.4d.%.2d.%.2d.%.2d", isc, g_nproc, g_cart_id); //ofs = fopen(filename, "w"); //printf_spinor_field(g_spinor_field[1], ofs); //fclose(ofs); } if(check_residuum) memcpy(g_spinor_field[2], smearing_spinor_field[0], 24*VOLUME*L5*sizeof(double)); } // of omp single } // of omp parallel region if(dummy_flag > 0) strcpy(source_filename_write, source_filename); dummy_flag++; } // of loop on momenta } // of isc #if 0 // last inversion { memcpy(g_spinor_field[0], smearing_spinor_field[1], 24*VOLUME*L5*sizeof(double)); if(g_cart_id==0) fprintf(stdout, "# [] proc%.2d starting last inversion\n", g_cart_id); /*********************************************** * perform the inversion ***********************************************/ if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] starting inversion\n"); xchange_field_5d(g_spinor_field[0]); memset(g_spinor_field[1], 0, (VOLUME+RAND)*L5*24*sizeof(double)); ratime = CLOCK; #ifdef MPI if(inv_param.inv_type == QUDA_BICGSTAB_INVERTER || inv_param.inv_type == QUDA_GCR_INVERTER) { if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] calling invertQuda\n"); invertQuda(g_spinor_field[1], g_spinor_field[0], &inv_param); } else if(inv_param.inv_type == QUDA_CG_INVERTER) { 
if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] calling testCG\n"); testCG(g_spinor_field[1], g_spinor_field[0], &inv_param); } else { if(g_cart_id==0) fprintf(stderr, "# [invert_dw_quda] unrecognized inverter\n"); } #else invertQuda(g_spinor_field[1], g_spinor_field[0], &inv_param); #endif retime = CLOCK; if(g_cart_id==0) { fprintf(stdout, "# [invert_dw_quda] QUDA time: %e seconds\n", inv_param.secs); fprintf(stdout, "# [invert_dw_quda] QUDA Gflops: %e\n", inv_param.gflops/inv_param.secs); fprintf(stdout, "# [invert_dw_quda] wall time: %e seconds\n", retime-ratime); fprintf(stdout, "# [invert_dw_quda] Device memory used:\n\tSpinor: %f GiB\n\tGauge: %f GiB\n", inv_param.spinorGiB, gauge_param.gaugeGiB); } omp_set_num_threads(g_num_threads); #pragma omp parallel private(threadid,_2_kappa,is,ix,iy,iix) shared(VOLUME,L5,g_kappa,g_spinor_field,g_num_threads) { threadid = omp_get_thread_num(); if(inv_param.mass_normalization == QUDA_KAPPA_NORMALIZATION) { _2_kappa = 2. * g_kappa5d; for(ix=threadid; ix<VOLUME*L5;ix+=g_num_threads) { _fv_ti_eq_re(g_spinor_field[1]+_GSI(ix), _2_kappa ); } } #pragma omp barrier // reorder, multiply with g2 for(is=0;is<L5;is++) { for(ix=threadid; ix<VOLUME; ix+=g_num_threads) { iy = lexic2eot_5d(is, ix); iix = is*VOLUME + ix; _fv_eq_fv(g_spinor_field[0]+_GSI(iix), g_spinor_field[1]+_GSI(iy)); }} #pragma omp barrier if(rotate_gamma_basis) { for(ix=threadid; ix<VOLUME*L5; ix+=g_num_threads) { _fv_eq_gamma_ti_fv(g_spinor_field[1]+_GSI(ix), 2, g_spinor_field[0]+_GSI(ix)); } } else { for(ix=threadid; ix<VOLUME*L5;ix+=g_num_threads) { _fv_eq_fv(g_spinor_field[1]+_GSI(ix), g_spinor_field[0]+_GSI(ix)); } } } // end of parallel region if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] inversion done in %e seconds\n", retime-ratime); #ifdef MPI xchange_field_5d(g_spinor_field[1]); #endif /*********************************************** * check residuum ***********************************************/ if(check_residuum && dummy_flag>0) { // 
apply the Wilson Dirac operator in the gamma-basis defined in cvc_linalg, // which uses the tmLQCD conventions (same as in contractions) // without explicit boundary conditions #ifdef MPI xchange_field_5d(g_spinor_field[2]); #endif memset(g_spinor_field[0], 0, 24*(VOLUME+RAND)*L5*sizeof(double)); //sprintf(filename, "%s.inverted.ascii.%.2d", source_filename, g_cart_id); //ofs = fopen(filename, "w"); //printf_spinor_field_5d(g_spinor_field[1], ofs); //fclose(ofs); Q_DW_Wilson_phi(g_spinor_field[0], g_spinor_field[1]); for(ix=0;ix<VOLUME*L5;ix++) { _fv_mi_eq_fv(g_spinor_field[0]+_GSI(ix), g_spinor_field[2]+_GSI(ix)); } spinor_scalar_product_re(&norm, g_spinor_field[0], g_spinor_field[0], VOLUME*L5); spinor_scalar_product_re(&norm2, g_spinor_field[2], g_spinor_field[2], VOLUME*L5); if(g_cart_id==0) fprintf(stdout, "\n# [invert_dw_quda] absolut residuum squared: %e; relative residuum %e\n", norm, sqrt(norm/norm2) ); } /*********************************************** * create 4-dim. propagator ***********************************************/ if(convert_sign == 0) { spinor_5d_to_4d(g_spinor_field[1], g_spinor_field[1]); } else if(convert_sign == -1 || convert_sign == +1) { spinor_5d_to_4d_sign(g_spinor_field[1], g_spinor_field[1], convert_sign); } /*********************************************** * write the solution ***********************************************/ sprintf(filename, "%s.inverted", source_filename_write); if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] writing propagator to file %s\n", filename); check_error(write_propagator(g_spinor_field[1], filename, 0, g_propagator_precision), "write_propagator", NULL, 22); //sprintf(filename, "prop.ascii.4d.%.2d.%.2d.%.2d", isc, g_nproc, g_cart_id); //ofs = fopen(filename, "w"); //printf_spinor_field(g_spinor_field[1], ofs); //fclose(ofs); } // of last inversion #endif // of if 0 /*********************************************** * free the allocated memory, finalize ***********************************************/ 
#ifdef HAVE_QUDA // finalize the QUDA library if(g_cart_id==0) fprintf(stdout, "# [invert_dw_quda] finalizing quda\n"); #ifdef MPI freeGaugeQuda(); #endif endQuda(); #endif if(g_gauge_field != NULL) free(g_gauge_field); if(gauge_field_smeared != NULL) free(gauge_field_smeared); if(no_fields>0) { if(g_spinor_field!=NULL) { for(i=0; i<no_fields; i++) if(g_spinor_field[i]!=NULL) free(g_spinor_field[i]); free(g_spinor_field); } } free_geometry(); if(g_source_momentum_set && full_orbit) { finalize_q_orbits(&qlatt_id, &qlatt_count, &qlatt_list, &qlatt_rep); if(qlatt_map != NULL) { free(qlatt_map[0]); free(qlatt_map); } } if(source_momentum != NULL) free(source_momentum); if(lck != NULL) free(lck); #ifdef MPI #ifdef HAVE_QUDA endCommsQuda(); #else MPI_Finalize(); #endif #endif if(g_cart_id==0) { g_the_time = time(NULL); fprintf(stdout, "\n# [invert_dw_quda] %s# [invert_dw_quda] end of run\n", ctime(&g_the_time)); fprintf(stderr, "\n# [invert_dw_quda] %s# [invert_dw_quda] end of run\n", ctime(&g_the_time)); } return(0); }
int main(int argc, char **argv) { int c, i, mu, status; int ispin, icol, isc; int n_c = 3; int n_s = 4; int count = 0; int filename_set = 0; int dims[4] = {0,0,0,0}; int l_LX_at, l_LXstart_at; int x0, x1, x2, x3, ix, iix, iy; int sl0, sl1, sl2, sl3, have_source_flag=0; int source_proc_coords[4], lsl0, lsl1, lsl2, lsl3, source_proc_id; int check_residuum = 0; unsigned int VOL3; int do_gt = 0; int full_orbit = 0; char filename[200], source_filename[200]; double ratime, retime; double plaq_r=0., plaq_m=0., norm, norm2; // double spinor1[24], spinor2[24]; double *gauge_qdp[4], *gauge_field_timeslice=NULL, *gauge_field_smeared=NULL; double _1_2_kappa, _2_kappa, phase; FILE *ofs; int mu_trans[4] = {3, 0, 1, 2}; int threadid, nthreads; int timeslice; char rng_file_in[100], rng_file_out[100]; int *source_momentum=NULL; int source_momentum_class = -1; int source_momentum_no = 0; int source_momentum_runs = 1; int imom; /************************************************/ int qlatt_nclass; int *qlatt_id=NULL, *qlatt_count=NULL, **qlatt_rep=NULL, **qlatt_map=NULL; double **qlatt_list=NULL; /************************************************/ /*********************************************** * QUDA parameters ***********************************************/ QudaPrecision cpu_prec = QUDA_DOUBLE_PRECISION; QudaPrecision cuda_prec = QUDA_DOUBLE_PRECISION; QudaPrecision cuda_prec_sloppy = QUDA_DOUBLE_PRECISION; QudaGaugeParam gauge_param = newQudaGaugeParam(); QudaInvertParam inv_param = newQudaInvertParam(); #ifdef MPI MPI_Init(&argc, &argv); #endif while ((c = getopt(argc, argv, "och?vgf:p:")) != -1) { switch (c) { case 'v': g_verbose = 1; break; case 'g': do_gt = 1; break; case 'f': strcpy(filename, optarg); filename_set=1; break; case 'c': check_residuum = 1; fprintf(stdout, "# [invert_quda] will check residuum again\n"); break; case 'p': n_c = atoi(optarg); fprintf(stdout, "# [invert_quda] will use number of colors = %d\n", n_c); break; case 'o': full_orbit = 1; fprintf(stdout, "# 
[invert_quda] will invert for full orbit, if source momentum set\n"); break; case 'h': case '?': default: usage(); break; } } // get the time stamp g_the_time = time(NULL); /************************************** * set the default values, read input **************************************/ if(filename_set==0) strcpy(filename, "cvc.input"); if(g_proc_id==0) fprintf(stdout, "# Reading input from file %s\n", filename); read_input_parser(filename); /* some checks on the input data */ if((T_global == 0) || (LX==0) || (LY==0) || (LZ==0)) { if(g_proc_id==0) fprintf(stderr, "[invert_quda] Error, T and L's must be set\n"); usage(); } if(g_kappa == 0.) { if(g_proc_id==0) fprintf(stderr, "[invert_quda] Error, kappa should be > 0.n"); usage(); } // set number of openmp threads #ifdef OPENMP omp_set_num_threads(g_num_threads); #else fprintf(stdout, "[invert_quda_cg] Warning, resetting global number of threads to 1\n"); g_num_threads = 1; #endif /* initialize MPI parameters */ mpi_init(argc, argv); // the volume of a timeslice VOL3 = LX*LY*LZ; fprintf(stdout, "# [%2d] parameters:\n"\ "# [%2d] T = %3d\n"\ "# [%2d] Tstart = %3d\n",\ g_cart_id, g_cart_id, T, g_cart_id, Tstart); #ifdef MPI if(T==0) { fprintf(stderr, "[%2d] local T is zero; exit\n", g_cart_id); MPI_Abort(MPI_COMM_WORLD, 1); MPI_Finalize(); exit(2); } #endif if(init_geometry() != 0) { fprintf(stderr, "ERROR from init_geometry\n"); #ifdef MPI MPI_Abort(MPI_COMM_WORLD, 1); MPI_Finalize(); #endif exit(1); } geometry(); /************************************** * initialize the QUDA library **************************************/ fprintf(stdout, "# [invert_quda] initializing quda\n"); initQuda(g_gpu_device_number); /************************************** * prepare the gauge field **************************************/ // read the gauge field from file alloc_gauge_field(&g_gauge_field, VOLUMEPLUSRAND); if(strcmp( gaugefilename_prefix, "identity")==0 ) { if(g_cart_id==0) fprintf(stdout, "# [invert_quda] Setting up unit gauge 
field\n"); for(ix=0;ix<VOLUME; ix++) { for(mu=0;mu<4;mu++) { _cm_eq_id(g_gauge_field+_GGI(ix,mu)); } } } else { if(g_gauge_file_format == 0) { // ILDG sprintf(filename, "%s.%.4d", gaugefilename_prefix, Nconf); if(g_cart_id==0) fprintf(stdout, "# Reading gauge field from file %s\n", filename); status = read_lime_gauge_field_doubleprec(filename); } else if(g_gauge_file_format == 1) { // NERSC sprintf(filename, "%s.%.5d", gaugefilename_prefix, Nconf); if(g_cart_id==0) fprintf(stdout, "# Reading gauge field from file %s\n", filename); status = read_nersc_gauge_field(g_gauge_field, filename, &plaq_r); } if(status != 0) { fprintf(stderr, "[invert_quda] Error, could not read gauge field"); #ifdef MPI MPI_Abort(MPI_COMM_WORLD, 12); MPI_Finalize(); #endif exit(12); } } #ifdef MPI xchange_gauge(); #endif // measure the plaquette plaquette(&plaq_m); if(g_cart_id==0) fprintf(stdout, "# Measured plaquette value: %25.16e\n", plaq_m); if(g_cart_id==0) fprintf(stdout, "# Read plaquette value : %25.16e\n", plaq_r); // allocate the smeared / qdp ordered gauge field alloc_gauge_field(&gauge_field_smeared, VOLUME); for(i=0;i<4;i++) { gauge_qdp[i] = gauge_field_smeared + i*18*VOLUME; } // transcribe the gauge field #ifdef OPENMP omp_set_num_threads(g_num_threads); #pragma omp parallel for private(ix,iy,mu) #endif for(ix=0;ix<VOLUME;ix++) { iy = g_lexic2eot[ix]; for(mu=0;mu<4;mu++) { _cm_eq_cm(gauge_qdp[mu_trans[mu]]+18*iy, g_gauge_field+_GGI(ix,mu)); } } // multiply timeslice T-1 with factor of -1 (antiperiodic boundary condition) #ifdef OPENMP omp_set_num_threads(g_num_threads); #pragma omp parallel for private(ix,iy) #endif for(ix=0;ix<VOL3;ix++) { iix = (T-1)*VOL3 + ix; iy = g_lexic2eot[iix]; _cm_ti_eq_re(gauge_qdp[mu_trans[0]]+18*iy, -1.); } // QUDA gauge parameters gauge_param.X[0] = LX_global; gauge_param.X[1] = LY_global; gauge_param.X[2] = LZ_global; gauge_param.X[3] = T_global; gauge_param.anisotropy = 1.0; gauge_param.type = QUDA_WILSON_LINKS; gauge_param.gauge_order = 
QUDA_QDP_GAUGE_ORDER; gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T; gauge_param.cpu_prec = cpu_prec; gauge_param.cuda_prec = cuda_prec; gauge_param.reconstruct = QUDA_RECONSTRUCT_12; gauge_param.cuda_prec_sloppy = cuda_prec_sloppy; gauge_param.reconstruct_sloppy = QUDA_RECONSTRUCT_12; gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO; gauge_param.ga_pad = 0; // load the gauge field fprintf(stdout, "# [invert_quda] loading gauge field\n"); loadGaugeQuda((void*)gauge_qdp, &gauge_param); gauge_qdp[0] = NULL; gauge_qdp[1] = NULL; gauge_qdp[2] = NULL; gauge_qdp[3] = NULL; /********************************************* * APE smear the gauge field *********************************************/ memcpy(gauge_field_smeared, g_gauge_field, 72*VOLUME*sizeof(double)); if(N_ape>0) { fprintf(stdout, "# [invert_quda] APE smearing gauge field with paramters N_APE=%d, alpha_APE=%e\n", N_ape, alpha_ape); #ifdef OPENMP APE_Smearing_Step_threads(gauge_field_smeared, N_ape, alpha_ape); #else for(i=0; i<N_ape; i++) { APE_Smearing_Step(gauge_field_smeared, alpha_ape); } #endif } /* allocate memory for the spinor fields */ no_fields = 3; g_spinor_field = (double**)calloc(no_fields, sizeof(double*)); for(i=0; i<no_fields; i++) alloc_spinor_field(&g_spinor_field[i], VOLUMEPLUSRAND); /* the source locaton */ sl0 = g_source_location / (LX_global*LY_global*LZ); sl1 = ( g_source_location % (LX_global*LY_global*LZ) ) / ( LY_global*LZ); sl2 = ( g_source_location % ( LY_global*LZ) ) / ( LZ); sl3 = g_source_location % LZ; if(g_cart_id==0) fprintf(stdout, "# [invert_quda] global sl = (%d, %d, %d, %d)\n", sl0, sl1, sl2, sl3); source_proc_coords[0] = sl0 / T; source_proc_coords[1] = sl1 / LX; source_proc_coords[2] = sl2 / LY; source_proc_coords[3] = sl3 / LZ; #ifdef MPI MPI_Cart_rank(g_cart_grid, source_proc_coords, &source_proc_id); #else source_proc_id = 0; #endif have_source_flag = source_proc_id == g_cart_id; lsl0 = sl0 % T; lsl1 = sl1 % LX; lsl2 = sl2 % LY; lsl3 = sl3 % LZ; if(have_source_flag) { 
fprintf(stdout, "# [invert_quda] process %d has the source at (%d, %d, %d, %d)\n", g_cart_id, lsl0, lsl1, lsl2, lsl3); } // QUDA inverter parameters inv_param.dslash_type = QUDA_WILSON_DSLASH; // inv_param.inv_type = QUDA_BICGSTAB_INVERTER; inv_param.inv_type = QUDA_CG_INVERTER; inv_param.kappa = g_kappa; inv_param.tol = solver_precision; inv_param.maxiter = niter_max; inv_param.reliable_delta = reliable_delta; inv_param.solution_type = QUDA_MAT_SOLUTION; // inv_param.solve_type = QUDA_DIRECT_PC_SOLVE; inv_param.solve_type = QUDA_NORMEQ_PC_SOLVE; inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; // QUDA_MATPC_EVEN_EVEN; inv_param.dagger = QUDA_DAG_NO; inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION; //;QUDA_MASS_NORMALIZATION; inv_param.cpu_prec = cpu_prec; inv_param.cuda_prec = cuda_prec; inv_param.cuda_prec_sloppy = cuda_prec_sloppy; inv_param.preserve_source = QUDA_PRESERVE_SOURCE_NO; inv_param.dirac_order = QUDA_DIRAC_ORDER; inv_param.sp_pad = 0; inv_param.cl_pad = 0; inv_param.verbosity = QUDA_VERBOSE; // write initial rng state to file if(g_source_type==2 && g_coherent_source==2) { sprintf(rng_file_out, "%s.0", g_rng_filename); if( init_rng_stat_file (g_seed, rng_file_out) != 0 ) { fprintf(stderr, "[invert_quda] Error, could not write rng status\n"); exit(210); } } else if(g_source_type==3 || g_source_type==4) { if( init_rng_state(g_seed, &g_rng_state) != 0 ) { fprintf(stderr, "[invert_quda] Error, could initialize rng state\n"); exit(211); } } // check the source momenta if(g_source_momentum_set) { source_momentum = (int*)malloc(3*sizeof(int)); if(g_source_momentum[0]<0) g_source_momentum[0] += LX; if(g_source_momentum[1]<0) g_source_momentum[1] += LY; if(g_source_momentum[2]<0) g_source_momentum[2] += LZ; fprintf(stdout, "# [invert_quda] using final source momentum ( %d, %d, %d )\n", g_source_momentum[0], g_source_momentum[1], g_source_momentum[2]); if(full_orbit) { status = make_qcont_orbits_3d_parity_avg( &qlatt_id, &qlatt_count, &qlatt_list, 
&qlatt_nclass, &qlatt_rep, &qlatt_map); if(status != 0) { fprintf(stderr, "\n[invert_quda] Error while creating O_3-lists\n"); exit(4); } source_momentum_class = qlatt_id[g_ipt[0][g_source_momentum[0]][g_source_momentum[1]][g_source_momentum[2]]]; source_momentum_no = qlatt_count[source_momentum_class]; source_momentum_runs = source_momentum_class==0 ? 1 : source_momentum_no + 1; fprintf(stdout, "# [] source momentum belongs to class %d with %d members, which means %d runs\n", source_momentum_class, source_momentum_no, source_momentum_runs); } } /*********************************************** * loop on spin-color-index ***********************************************/ for(isc=g_source_index[0]; isc<=g_source_index[1]; isc++) { ispin = isc / n_c; icol = isc % n_c; for(imom=0; imom<source_momentum_runs; imom++) { /*********************************************** * set source momentum ***********************************************/ if(g_source_momentum_set) { if(imom == 0) { if(full_orbit) { source_momentum[0] = 0; source_momentum[1] = 0; source_momentum[2] = 0; } else { source_momentum[0] = g_source_momentum[0]; source_momentum[1] = g_source_momentum[1]; source_momentum[2] = g_source_momentum[2]; } } else { source_momentum[0] = qlatt_map[source_momentum_class][imom-1] / (LY*LZ); source_momentum[1] = ( qlatt_map[source_momentum_class][imom-1] % (LY*LZ) ) / LZ; source_momentum[2] = qlatt_map[source_momentum_class][imom-1] % LZ; } fprintf(stdout, "# [] run no. 
%d, source momentum (%d, %d, %d)\n", imom, source_momentum[0], source_momentum[1], source_momentum[2]); } /*********************************************** * prepare the souce ***********************************************/ if(g_read_source == 0) { // create source switch(g_source_type) { case 0: // point source fprintf(stdout, "# [invert_quda] Creating point source\n"); for(ix=0;ix<24*VOLUME;ix++) g_spinor_field[0][ix] = 0.; if(have_source_flag) { if(g_source_momentum_set) { phase = 2*M_PI*( source_momentum[0]*lsl1/(double)LX + source_momentum[1]*lsl2/(double)LY + source_momentum[2]*lsl3/(double)LZ ); g_spinor_field[0][_GSI(g_source_location) + 2*(n_c*ispin+icol) ] = cos(phase); g_spinor_field[0][_GSI(g_source_location) + 2*(n_c*ispin+icol)+1] = sin(phase); } else { g_spinor_field[0][_GSI(g_source_location) + 2*(n_c*ispin+icol) ] = 1.; } } if(g_source_momentum_set) { sprintf(source_filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d.qx%.2dqy%.2dqz%.2d", filename_prefix, Nconf, sl0, sl1, sl2, sl3, n_c*ispin+icol, source_momentum[0], source_momentum[1], source_momentum[2]); } else { sprintf(source_filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d", filename_prefix, Nconf, sl0, sl1, sl2, sl3, n_c*ispin+icol); } break; case 2: // timeslice source if(g_coherent_source==1) { fprintf(stdout, "# [invert_quda] Creating coherent timeslice source\n"); status = prepare_coherent_timeslice_source(g_spinor_field[0], gauge_field_smeared, g_coherent_source_base, g_coherent_source_delta, VOLUME, g_rng_filename, NULL); if(status != 0) { fprintf(stderr, "[invert_quda] Error from prepare source, status was %d\n", status); exit(123); } timeslice = g_coherent_source_base; } else { if(g_coherent_source==2) { strcpy(rng_file_in, rng_file_out); if(isc == g_source_index[1]) { strcpy(rng_file_out, g_rng_filename); } else { sprintf(rng_file_out, "%s.%d", g_rng_filename, isc+1); } timeslice = (g_coherent_source_base+isc*g_coherent_source_delta)%T_global; fprintf(stdout, "# [invert_quda] Creating timeslice 
source\n"); status = prepare_timeslice_source(g_spinor_field[0], gauge_field_smeared, timeslice, VOLUME, rng_file_in, rng_file_out); if(status != 0) { fprintf(stderr, "[invert_quda] Error from prepare source, status was %d\n", status); exit(123); } } else { fprintf(stdout, "# [invert_quda] Creating timeslice source\n"); status = prepare_timeslice_source(g_spinor_field[0], gauge_field_smeared, g_source_timeslice, VOLUME, g_rng_filename, g_rng_filename); if(status != 0) { fprintf(stderr, "[invert_quda] Error from prepare source, status was %d\n", status); exit(124); } timeslice = g_source_timeslice; } } if(g_source_momentum_set) { sprintf(source_filename, "%s.%.4d.%.2d.%.5d.qx%.2dqy%.2dqz%.2d", filename_prefix, Nconf, timeslice, isc, source_momentum[0], source_momentum[1], source_momentum[2]); } else { sprintf(source_filename, "%s.%.4d.%.2d.%.5d", filename_prefix, Nconf, timeslice, isc); } break; case 3: // timeslice sources for one-end trick (spin dilution) fprintf(stdout, "# [invert_quda] Creating timeslice source for one-end-trick\n"); status = prepare_timeslice_source_one_end(g_spinor_field[0], gauge_field_smeared, g_source_timeslice, source_momentum, isc%n_s, g_rng_state, \ ( isc%n_s==(n_s-1) && imom==source_momentum_runs-1 ) ); if(status != 0) { fprintf(stderr, "[invert_quda] Error from prepare source, status was %d\n", status); exit(125); } c = N_Jacobi > 0 ? 
isc%n_s + n_s : isc%n_s; if(g_source_momentum_set) { sprintf(source_filename, "%s.%.4d.%.2d.%.2d.qx%.2dqy%.2dqz%.2d", filename_prefix, Nconf, g_source_timeslice, c, source_momentum[0], source_momentum[1], source_momentum[2]); } else { sprintf(source_filename, "%s.%.4d.%.2d.%.2d", filename_prefix, Nconf, g_source_timeslice, c); } break; case 4: // timeslice sources for one-end trick (spin and color dilution ) fprintf(stdout, "# [invert_quda] Creating timeslice source for one-end-trick\n"); status = prepare_timeslice_source_one_end_color(g_spinor_field[0], gauge_field_smeared, g_source_timeslice, source_momentum,\ isc%(n_s*n_c), g_rng_state, ( isc%(n_s*n_c)==(n_s*n_c-1) && imom==source_momentum_runs-1 ) ); if(status != 0) { fprintf(stderr, "[invert_quda] Error from prepare source, status was %d\n", status); exit(126); } c = N_Jacobi > 0 ? isc%(n_s*n_c) + (n_s*n_c) : isc%(n_s*n_c); if(g_source_momentum_set) { sprintf(source_filename, "%s.%.4d.%.2d.%.2d.qx%.2dqy%.2dqz%.2d", filename_prefix, Nconf, g_source_timeslice, c, source_momentum[0], source_momentum[1], source_momentum[2]); } else { sprintf(source_filename, "%s.%.4d.%.2d.%.2d", filename_prefix, Nconf, g_source_timeslice, c); } break; default: fprintf(stderr, "\nError, unrecognized source type\n"); exit(32); break; } } else { // read source switch(g_source_type) { case 0: // point source if(g_source_momentum_set) { sprintf(source_filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d.qx%.2dqy%.2dqz%.2d", \ filename_prefix2, Nconf, sl0, sl1, sl2, sl3, isc, source_momentum[0], source_momentum[1], source_momentum[2]); } else { sprintf(source_filename, "%s.%.4d.t%.2dx%.2dy%.2dz%.2d.%.2d", filename_prefix2, Nconf, sl0, sl1, sl2, sl3, isc); } fprintf(stdout, "# [invert_quda] reading source from file %s\n", source_filename); status = read_lime_spinor(g_spinor_field[0], source_filename, 0); if(status != 0) { fprintf(stderr, "# [invert_quda] Errro, could not read source from file %s\n", source_filename); exit(115); } break; case 2: 
// timeslice source if(g_source_momentum_set) { sprintf(source_filename, "%s.%.4d.%.2d.%.5d.qx%.2dqy%.2dqz%.2d", filename_prefix2, Nconf, g_source_timeslice, isc, source_momentum[0], source_momentum[1], source_momentum[2]); } else { sprintf(source_filename, "%s.%.4d.%.2d.%.5d", filename_prefix2, Nconf, g_source_timeslice, isc); } fprintf(stdout, "# [invert_quda] reading source from file %s\n", source_filename); status = read_lime_spinor(g_spinor_field[0], source_filename, 0); if(status != 0) { fprintf(stderr, "# [invert_quda] Errro, could not read source from file %s\n", source_filename); exit(115); } break; default: fprintf(stderr, "[] Error, unrecognized source type for reading\n"); exit(104); break; } } // of if g_read_source //sprintf(filename, "%s.ascii", source_filename); //ofs = fopen(filename, "w"); //printf_spinor_field(g_spinor_field[0], ofs); //fclose(ofs); if(g_write_source) { status = write_propagator(g_spinor_field[0], source_filename, 0, g_propagator_precision); if(status != 0) { fprintf(stderr, "Error from write_propagator, status was %d\n", status); exit(27); } } // smearing if(N_Jacobi > 0) { #ifdef OPENMP Jacobi_Smearing_Step_one_threads(gauge_field_smeared, g_spinor_field[0], g_spinor_field[1], N_Jacobi, kappa_Jacobi); #else for(c=0; c<N_Jacobi; c++) { Jacobi_Smearing_Step_one(gauge_field_smeared, g_spinor_field[0], g_spinor_field[1], kappa_Jacobi); } #endif } // multiply with g2 for(ix=0;ix<VOLUME;ix++) { _fv_eq_gamma_ti_fv(g_spinor_field[1]+_GSI(ix), 2, g_spinor_field[0]+_GSI(ix)); } // transcribe the spinor field to even-odd ordering with coordinates (x,y,z,t) for(ix=0;ix<VOLUME;ix++) { iy = g_lexic2eot[ix]; _fv_eq_fv(g_spinor_field[2]+_GSI(iy), g_spinor_field[1]+_GSI(ix)); } /*********************************************** * perform the inversion ***********************************************/ fprintf(stdout, "# [invert_quda] starting inversion\n"); ratime = (double)clock() / CLOCKS_PER_SEC; for(ix=0;ix<VOLUME;ix++) { 
_fv_eq_zero(g_spinor_field[1]+_GSI(ix) ); } invertQuda(g_spinor_field[1], g_spinor_field[2], &inv_param); retime = (double)clock() / CLOCKS_PER_SEC; fprintf(stdout, "# [invert_quda] inversion done in %e seconds\n", retime-ratime); fprintf(stdout, "# [invert_quda] Device memory used:\n\tSpinor: %f GiB\n\tGauge: %f GiB\n", inv_param.spinorGiB, gauge_param.gaugeGiB); if(inv_param.mass_normalization == QUDA_KAPPA_NORMALIZATION) { _2_kappa = 2. * g_kappa; for(ix=0;ix<VOLUME;ix++) { _fv_ti_eq_re(g_spinor_field[1]+_GSI(ix), _2_kappa ); } } // transcribe the spinor field to lexicographical order with (t,x,y,z) for(ix=0;ix<VOLUME;ix++) { iy = g_lexic2eot[ix]; _fv_eq_fv(g_spinor_field[2]+_GSI(ix), g_spinor_field[1]+_GSI(iy)); } // multiply with g2 for(ix=0;ix<VOLUME;ix++) { _fv_eq_gamma_ti_fv(g_spinor_field[1]+_GSI(ix), 2, g_spinor_field[2]+_GSI(ix)); } /*********************************************** * check residuum ***********************************************/ if(check_residuum) { // apply the Wilson Dirac operator in the gamma-basis defined in cvc_linalg, // which uses the tmLQCD conventions (same as in contractions) // without explicit boundary conditions Q_Wilson_phi(g_spinor_field[2], g_spinor_field[1]); for(ix=0;ix<VOLUME;ix++) { _fv_mi_eq_fv(g_spinor_field[2]+_GSI(ix), g_spinor_field[0]+_GSI(ix)); } spinor_scalar_product_re(&norm, g_spinor_field[2], g_spinor_field[2], VOLUME); spinor_scalar_product_re(&norm2, g_spinor_field[0], g_spinor_field[0], VOLUME); fprintf(stdout, "\n# [invert_quda] absolut residuum squared: %e; relative residuum %e\n", norm, sqrt(norm/norm2) ); } /*********************************************** * write the solution ***********************************************/ sprintf(filename, "%s.inverted", source_filename); fprintf(stdout, "# [invert_quda] writing propagator to file %s\n", filename); status = write_propagator(g_spinor_field[1], filename, 0, g_propagator_precision); if(status != 0) { fprintf(stderr, "Error from write_propagator, 
status was %d\n", status); exit(22); } } // of loop on momenta } // of isc /*********************************************** * free the allocated memory, finalize ***********************************************/ // finalize the QUDA library fprintf(stdout, "# [invert_quda] finalizing quda\n"); endQuda(); free(g_gauge_field); free(gauge_field_smeared); for(i=0; i<no_fields; i++) free(g_spinor_field[i]); free(g_spinor_field); free_geometry(); if(g_source_momentum_set && full_orbit) { finalize_q_orbits(&qlatt_id, &qlatt_count, &qlatt_list, &qlatt_rep); if(qlatt_map != NULL) { free(qlatt_map[0]); free(qlatt_map); } } if(source_momentum != NULL) free(source_momentum); #ifdef MPI MPI_Finalize(); #endif if(g_cart_id==0) { g_the_time = time(NULL); fprintf(stdout, "\n# [invert_quda] %s# [invert_quda] end of run\n", ctime(&g_the_time)); fprintf(stderr, "\n# [invert_quda] %s# [invert_quda] end of run\n", ctime(&g_the_time)); } return(0); }
void init() { gauge_param = newQudaGaugeParam(); inv_param = newQudaInvertParam(); gauge_param.X[0] = 12; gauge_param.X[1] = 12; gauge_param.X[2] = 12; gauge_param.X[3] = 12; setDims(gauge_param.X, Ls); gauge_param.anisotropy = 2.3; gauge_param.type = QUDA_WILSON_LINKS; gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER; gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T; gauge_param.cpu_prec = cpu_prec; gauge_param.cuda_prec = cuda_prec; gauge_param.reconstruct = QUDA_RECONSTRUCT_12; gauge_param.reconstruct_sloppy = gauge_param.reconstruct; gauge_param.cuda_prec_sloppy = gauge_param.cuda_prec; gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO; gauge_param.type = QUDA_WILSON_LINKS; inv_param.inv_type = QUDA_CG_INVERTER; inv_param.mass = 0.01; inv_param.m5 = -1.5; kappa5 = 0.5/(5 + inv_param.m5); inv_param.Ls = Ls; inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; inv_param.dagger = dagger; inv_param.cpu_prec = cpu_prec; inv_param.cuda_prec = cuda_prec; gauge_param.ga_pad = 0; inv_param.sp_pad = 0; inv_param.cl_pad = 0; inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; inv_param.dirac_order = QUDA_DIRAC_ORDER; if (test_type == 2) { inv_param.solution_type = QUDA_MAT_SOLUTION; } else { inv_param.solution_type = QUDA_MATPC_SOLUTION; } inv_param.dslash_type = QUDA_DOMAIN_WALL_DSLASH; inv_param.verbosity = QUDA_VERBOSE; // construct input fields for (int dir = 0; dir < 4; dir++) hostGauge[dir] = malloc(V*gaugeSiteSize*gauge_param.cpu_prec); ColorSpinorParam csParam; csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION; csParam.nColor = 3; csParam.nSpin = 4; csParam.nDim = 5; for (int d=0; d<4; d++) csParam.x[d] = gauge_param.X[d]; csParam.x[4] = Ls; csParam.precision = inv_param.cpu_prec; csParam.pad = 0; if (test_type < 2) { csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; csParam.x[0] /= 2; } else { csParam.siteSubset = QUDA_FULL_SITE_SUBSET; } csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; csParam.gammaBasis = inv_param.gamma_basis; 
csParam.create = QUDA_ZERO_FIELD_CREATE; spinor = new cpuColorSpinorField(csParam); spinorOut = new cpuColorSpinorField(csParam); spinorRef = new cpuColorSpinorField(csParam); csParam.siteSubset = QUDA_FULL_SITE_SUBSET; csParam.x[0] = gauge_param.X[0]; printfQuda("Randomizing fields... "); construct_gauge_field(hostGauge, 1, gauge_param.cpu_prec, &gauge_param); spinor->Source(QUDA_RANDOM_SOURCE); printfQuda("done.\n"); fflush(stdout); int dev = 0; initQuda(dev); printfQuda("Sending gauge field to GPU\n"); loadGaugeQuda(hostGauge, &gauge_param); if (!transfer) { csParam.fieldLocation = QUDA_CUDA_FIELD_LOCATION; csParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS; csParam.pad = inv_param.sp_pad; csParam.precision = inv_param.cuda_prec; if (csParam.precision == QUDA_DOUBLE_PRECISION ) { csParam.fieldOrder = QUDA_FLOAT2_FIELD_ORDER; } else { /* Single and half */ csParam.fieldOrder = QUDA_FLOAT4_FIELD_ORDER; } if (test_type < 2) { csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; csParam.x[0] /= 2; } printfQuda("Creating cudaSpinor\n"); cudaSpinor = new cudaColorSpinorField(csParam); printfQuda("Creating cudaSpinorOut\n"); cudaSpinorOut = new cudaColorSpinorField(csParam); if (test_type == 2) csParam.x[0] /= 2; csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; tmp = new cudaColorSpinorField(csParam); printfQuda("Sending spinor field to GPU\n"); *cudaSpinor = *spinor; std::cout << "Source: CPU = " << norm2(*spinor) << ", CUDA = " << norm2(*cudaSpinor) << std::endl; bool pc = (test_type != 2); DiracParam diracParam; setDiracParam(diracParam, &inv_param, pc); diracParam.verbose = QUDA_DEBUG_VERBOSE; diracParam.tmp1 = tmp; diracParam.tmp2 = tmp2; dirac = Dirac::create(diracParam); } else { std::cout << "Source: CPU = " << norm2(*spinor) << std::endl; } }
static void llfat_test(int test) { QudaGaugeParam qudaGaugeParam; #ifdef MULTI_GPU void* ghost_sitelink[4]; void* ghost_sitelink_diag[16]; #endif initQuda(device); cpu_prec = prec; gSize = cpu_prec; qudaGaugeParam = newQudaGaugeParam(); qudaGaugeParam.anisotropy = 1.0; qudaGaugeParam.X[0] = xdim; qudaGaugeParam.X[1] = ydim; qudaGaugeParam.X[2] = zdim; qudaGaugeParam.X[3] = tdim; setDims(qudaGaugeParam.X); qudaGaugeParam.cpu_prec = cpu_prec; qudaGaugeParam.cuda_prec = prec; qudaGaugeParam.gauge_order = gauge_order; qudaGaugeParam.type=QUDA_WILSON_LINKS; qudaGaugeParam.reconstruct = link_recon; /* qudaGaugeParam.flag = QUDA_FAT_PRESERVE_CPU_GAUGE | QUDA_FAT_PRESERVE_GPU_GAUGE | QUDA_FAT_PRESERVE_COMM_MEM; */ qudaGaugeParam.preserve_gauge =0; void* fatlink; if (cudaMallocHost((void**)&fatlink, 4*V*gaugeSiteSize*gSize) == cudaErrorMemoryAllocation) { errorQuda("ERROR: cudaMallocHost failed for fatlink\n"); } void* longlink; if (cudaMallocHost((void**)&longlink, 4*V*gaugeSiteSize*gSize) == cudaErrorMemoryAllocation) { errorQuda("ERROR: cudaMallocHost failed for longlink\n"); } // page-locked memory void* sitelink[4]; for(int i=0;i < 4;i++){ if (cudaMallocHost((void**)&sitelink[i], V*gaugeSiteSize*gSize) == cudaErrorMemoryAllocation) { errorQuda("ERROR: cudaMallocHost failed for sitelink\n"); } } void* sitelink_ex[4]; for(int i=0;i < 4;i++){ if (cudaMallocHost((void**)&sitelink_ex[i], V_ex*gaugeSiteSize*gSize) == cudaErrorMemoryAllocation) { errorQuda("ERROR: cudaMallocHost failed for sitelink_ex\n"); } } void* milc_sitelink; milc_sitelink = (void*)malloc(4*V*gaugeSiteSize*gSize); if(milc_sitelink == NULL){ errorQuda("ERROR: allocating milc_sitelink failed\n"); } void* milc_sitelink_ex; milc_sitelink_ex = (void*)malloc(4*V_ex*gaugeSiteSize*gSize); if(milc_sitelink_ex == NULL){ errorQuda("Error: allocating milc_sitelink failed\n"); } createSiteLinkCPU(sitelink, qudaGaugeParam.cpu_prec, 1); if(gauge_order == QUDA_MILC_GAUGE_ORDER){ for(int i=0; i<V; ++i){ for(int dir=0; 
dir<4; ++dir){ char* src = (char*)sitelink[dir]; memcpy((char*)milc_sitelink + (i*4 + dir)*gaugeSiteSize*gSize, src+i*gaugeSiteSize*gSize, gaugeSiteSize*gSize); } } } int X1=Z[0]; int X2=Z[1]; int X3=Z[2]; int X4=Z[3]; for(int i=0; i < V_ex; i++){ int sid = i; int oddBit=0; if(i >= Vh_ex){ sid = i - Vh_ex; oddBit = 1; } int za = sid/E1h; int x1h = sid - za*E1h; int zb = za/E2; int x2 = za - zb*E2; int x4 = zb/E3; int x3 = zb - x4*E3; int x1odd = (x2 + x3 + x4 + oddBit) & 1; int x1 = 2*x1h + x1odd; if( x1< 2 || x1 >= X1 +2 || x2< 2 || x2 >= X2 +2 || x3< 2 || x3 >= X3 +2 || x4< 2 || x4 >= X4 +2){ #ifdef MULTI_GPU continue; #endif } x1 = (x1 - 2 + X1) % X1; x2 = (x2 - 2 + X2) % X2; x3 = (x3 - 2 + X3) % X3; x4 = (x4 - 2 + X4) % X4; int idx = (x4*X3*X2*X1+x3*X2*X1+x2*X1+x1)>>1; if(oddBit){ idx += Vh; } for(int dir= 0; dir < 4; dir++){ char* src = (char*)sitelink[dir]; char* dst = (char*)sitelink_ex[dir]; memcpy(dst+i*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize); // milc ordering memcpy((char*)milc_sitelink_ex + (i*4 + dir)*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize); }//dir }//i double act_path_coeff[6]; for(int i=0;i < 6;i++){ act_path_coeff[i]= 0.1*i; } //only record the last call's performance //the first one is for creating the cpu/cuda data structures struct timeval t0, t1; void** sitelink_ptr; QudaComputeFatMethod method = (test) ? QUDA_COMPUTE_FAT_EXTENDED_VOLUME : QUDA_COMPUTE_FAT_STANDARD; if(gauge_order == QUDA_QDP_GAUGE_ORDER){ sitelink_ptr = (test) ? (void**)sitelink_ex : (void**)sitelink; }else{ sitelink_ptr = (test) ? 
(void**)milc_sitelink_ex : (void**)milc_sitelink; } void* longlink_ptr = longlink; #ifdef MULTI_GPU if(!test) longlink_ptr = NULL; // Have to have an extended volume for the long-link calculation #endif gettimeofday(&t0, NULL); computeKSLinkQuda(fatlink, longlink_ptr, NULL, milc_sitelink, act_path_coeff, &qudaGaugeParam, method); gettimeofday(&t1, NULL); double secs = TDIFF(t0,t1); void* fat_reflink[4]; void* long_reflink[4]; for(int i=0;i < 4;i++){ fat_reflink[i] = malloc(V*gaugeSiteSize*gSize); if(fat_reflink[i] == NULL){ errorQuda("ERROR; allocate fat_reflink[%d] failed\n", i); } long_reflink[i] = malloc(V*gaugeSiteSize*gSize); if(long_reflink[i] == NULL) errorQuda("ERROR; allocate long_reflink[%d] failed\n", i); } if (verify_results){ //FIXME: we have this compplication because references takes coeff as float/double // depending on the precision while the GPU code aways take coeff as double void* coeff; double coeff_dp[6]; float coeff_sp[6]; for(int i=0;i < 6;i++){ coeff_sp[i] = coeff_dp[i] = act_path_coeff[i]; } if(prec == QUDA_DOUBLE_PRECISION){ coeff = coeff_dp; }else{ coeff = coeff_sp; } #ifdef MULTI_GPU int optflag = 0; //we need x,y,z site links in the back and forward T slice // so it is 3*2*Vs_t int Vs[4] = {Vs_x, Vs_y, Vs_z, Vs_t}; for(int i=0;i < 4; i++){ ghost_sitelink[i] = malloc(8*Vs[i]*gaugeSiteSize*gSize); if (ghost_sitelink[i] == NULL){ printf("ERROR: malloc failed for ghost_sitelink[%d] \n",i); exit(1); } } /* nu | | |_____| mu */ for(int nu=0;nu < 4;nu++){ for(int mu=0; mu < 4;mu++){ if(nu == mu){ ghost_sitelink_diag[nu*4+mu] = NULL; }else{ //the other directions int dir1, dir2; for(dir1= 0; dir1 < 4; dir1++){ if(dir1 !=nu && dir1 != mu){ break; } } for(dir2=0; dir2 < 4; dir2++){ if(dir2 != nu && dir2 != mu && dir2 != dir1){ break; } } ghost_sitelink_diag[nu*4+mu] = malloc(Z[dir1]*Z[dir2]*gaugeSiteSize*gSize); if(ghost_sitelink_diag[nu*4+mu] == NULL){ errorQuda("malloc failed for ghost_sitelink_diag\n"); } memset(ghost_sitelink_diag[nu*4+mu], 
0, Z[dir1]*Z[dir2]*gaugeSiteSize*gSize); } } } exchange_cpu_sitelink(qudaGaugeParam.X, sitelink, ghost_sitelink, ghost_sitelink_diag, qudaGaugeParam.cpu_prec, &qudaGaugeParam, optflag); llfat_reference_mg(fat_reflink, sitelink, ghost_sitelink, ghost_sitelink_diag, qudaGaugeParam.cpu_prec, coeff); { int R[4] = {2,2,2,2}; exchange_cpu_sitelink_ex(qudaGaugeParam.X, R, sitelink_ex, QUDA_QDP_GAUGE_ORDER, qudaGaugeParam.cpu_prec, 0, 4); computeLongLinkCPU(long_reflink, sitelink_ex, qudaGaugeParam.cpu_prec, coeff); } #else llfat_reference(fat_reflink, sitelink, qudaGaugeParam.cpu_prec, coeff); computeLongLinkCPU(long_reflink, sitelink, qudaGaugeParam.cpu_prec, coeff); #endif }//verify_results //format change for fatlink and longlink void* myfatlink[4]; void* mylonglink[4]; for(int i=0;i < 4;i++){ myfatlink[i] = malloc(V*gaugeSiteSize*gSize); if(myfatlink[i] == NULL){ printf("Error: malloc failed for myfatlink[%d]\n", i); exit(1); } mylonglink[i] = malloc(V*gaugeSiteSize*gSize); if(mylonglink[i] == NULL){ printf("Error: malloc failed for mylonglink[%d]\n", i); exit(1); } memset(myfatlink[i], 0, V*gaugeSiteSize*gSize); memset(mylonglink[i], 0, V*gaugeSiteSize*gSize); } for(int i=0;i < V; i++){ for(int dir=0; dir< 4; dir++){ char* src = ((char*)fatlink)+ (4*i+dir)*gaugeSiteSize*gSize; char* dst = ((char*)myfatlink[dir]) + i*gaugeSiteSize*gSize; memcpy(dst, src, gaugeSiteSize*gSize); src = ((char*)longlink)+ (4*i+dir)*gaugeSiteSize*gSize; dst = ((char*)mylonglink[dir]) + i*gaugeSiteSize*gSize; memcpy(dst, src, gaugeSiteSize*gSize); } } if (verify_results) { printfQuda("Checking fat links...\n"); int res=1; for(int dir=0; dir<4; dir++){ res &= compare_floats(fat_reflink[dir], myfatlink[dir], V*gaugeSiteSize, 1e-3, qudaGaugeParam.cpu_prec); } strong_check_link(myfatlink, "GPU results: ", fat_reflink, "CPU reference results:", V, qudaGaugeParam.cpu_prec); printfQuda("Fat-link test %s\n\n",(1 == res) ? 
"PASSED" : "FAILED"); #ifdef MULTI_GPU if(test){ #endif printfQuda("Checking long links...\n"); res = 1; for(int dir=0; dir<4; ++dir){ res &= compare_floats(long_reflink[dir], mylonglink[dir], V*gaugeSiteSize, 1e-3, qudaGaugeParam.cpu_prec); } strong_check_link(mylonglink, "GPU results: ", long_reflink, "CPU reference results:", V, qudaGaugeParam.cpu_prec); printfQuda("Long-link test %s\n\n",(1 == res) ? "PASSED" : "FAILED"); #ifdef MULTI_GPU }else{ // !test printfQuda("Extended volume is required for multi-GPU long-link construction\n"); } #endif } int volume = qudaGaugeParam.X[0]*qudaGaugeParam.X[1]*qudaGaugeParam.X[2]*qudaGaugeParam.X[3]; int flops= 61632; #ifdef MULTI_GPU if(test) flops += (252*4); // long-link contribution #else flops += (252*4); // 2*117 + 18 (two matrix-matrix multiplications and a matrix rescale) #endif double perf = 1.0* flops*volume/(secs*1024*1024*1024); printfQuda("link computation time =%.2f ms, flops= %.2f Gflops\n", secs*1000, perf); for(int i=0;i < 4;i++){ free(myfatlink[i]); } #ifdef MULTI_GPU if (verify_results){ int i; for(i=0;i < 4;i++){ free(ghost_sitelink[i]); } for(i=0;i <4; i++){ for(int j=0;j <4; j++){ if (i==j){ continue; } free(ghost_sitelink_diag[i*4+j]); } } } #endif for(int i=0;i < 4; i++){ cudaFreeHost(sitelink[i]); cudaFreeHost(sitelink_ex[i]); free(fat_reflink[i]); } cudaFreeHost(fatlink); cudaFreeHost(longlink); if(milc_sitelink) free(milc_sitelink); if(milc_sitelink_ex) free(milc_sitelink_ex); #ifdef MULTI_GPU exchange_llfat_cleanup(); #endif endQuda(); }
int invert_test(void) { QudaGaugeParam gaugeParam = newQudaGaugeParam(); QudaInvertParam inv_param = newQudaInvertParam(); set_params(&gaugeParam, &inv_param, xdim, ydim, zdim, tdim, cpu_prec, prec, prec_sloppy, link_recon, link_recon_sloppy, mass, tol, 500, 1e-3, 0.8); // this must be before the FaceBuffer is created (this is because it allocates pinned memory - FIXME) initQuda(device); setDims(gaugeParam.X); setSpinorSiteSize(6); size_t gSize = (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float); for (int dir = 0; dir < 4; dir++) { qdp_fatlink[dir] = malloc(V*gaugeSiteSize*gSize); qdp_longlink[dir] = malloc(V*gaugeSiteSize*gSize); } fatlink = malloc(4*V*gaugeSiteSize*gSize); longlink = malloc(4*V*gaugeSiteSize*gSize); construct_fat_long_gauge_field(qdp_fatlink, qdp_longlink, 1, gaugeParam.cpu_prec, &gaugeParam, dslash_type); const double cos_pi_3 = 0.5; // Cos(pi/3) const double sin_pi_3 = sqrt(0.75); // Sin(pi/3) for(int dir=0; dir<4; ++dir){ for(int i=0; i<V; ++i){ for(int j=0; j<gaugeSiteSize; ++j){ if(gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION){ ((double*)qdp_fatlink[dir])[i*gaugeSiteSize + j] = 0.5*rand()/RAND_MAX; if(link_recon != QUDA_RECONSTRUCT_8 && link_recon != QUDA_RECONSTRUCT_12){ // incorporate non-trivial phase into long links if(j%2 == 0){ const double real = ((double*)qdp_longlink[dir])[i*gaugeSiteSize + j]; const double imag = ((double*)qdp_longlink[dir])[i*gaugeSiteSize + j + 1]; ((double*)qdp_longlink[dir])[i*gaugeSiteSize + j] = real*cos_pi_3 - imag*sin_pi_3; ((double*)qdp_longlink[dir])[i*gaugeSiteSize + j + 1] = real*sin_pi_3 + imag*cos_pi_3; } } ((double*)fatlink)[(i*4 + dir)*gaugeSiteSize + j] = ((double*)qdp_fatlink[dir])[i*gaugeSiteSize + j]; ((double*)longlink)[(i*4 + dir)*gaugeSiteSize + j] = ((double*)qdp_longlink[dir])[i*gaugeSiteSize + j]; }else{ ((float*)qdp_fatlink[dir])[i] = 0.5*rand()/RAND_MAX; if(link_recon != QUDA_RECONSTRUCT_8 && link_recon != QUDA_RECONSTRUCT_12){ // incorporate non-trivial 
phase into long links if(j%2 == 0){ const float real = ((float*)qdp_longlink[dir])[i*gaugeSiteSize + j]; const float imag = ((float*)qdp_longlink[dir])[i*gaugeSiteSize + j + 1]; ((float*)qdp_longlink[dir])[i*gaugeSiteSize + j] = real*cos_pi_3 - imag*sin_pi_3; ((float*)qdp_longlink[dir])[i*gaugeSiteSize + j + 1] = real*sin_pi_3 + imag*cos_pi_3; } } ((double*)fatlink)[(i*4 + dir)*gaugeSiteSize + j] = ((double*)qdp_fatlink[dir])[i*gaugeSiteSize + j]; ((float*)fatlink)[(i*4 + dir)*gaugeSiteSize + j] = ((float*)qdp_fatlink[dir])[i*gaugeSiteSize + j]; ((float*)longlink)[(i*4 + dir)*gaugeSiteSize + j] = ((float*)qdp_longlink[dir])[i*gaugeSiteSize + j]; } } } } ColorSpinorParam csParam; csParam.nColor=3; csParam.nSpin=1; csParam.nDim=4; for(int d = 0; d < 4; d++) { csParam.x[d] = gaugeParam.X[d]; } csParam.x[0] /= 2; csParam.precision = inv_param.cpu_prec; csParam.pad = 0; csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; csParam.gammaBasis = inv_param.gamma_basis; csParam.create = QUDA_ZERO_FIELD_CREATE; in = new cpuColorSpinorField(csParam); out = new cpuColorSpinorField(csParam); ref = new cpuColorSpinorField(csParam); tmp = new cpuColorSpinorField(csParam); if (inv_param.cpu_prec == QUDA_SINGLE_PRECISION){ constructSpinorField((float*)in->V()); }else{ constructSpinorField((double*)in->V()); } #ifdef MULTI_GPU int tmp_value = MAX(ydim*zdim*tdim/2, xdim*zdim*tdim/2); tmp_value = MAX(tmp_value, xdim*ydim*tdim/2); tmp_value = MAX(tmp_value, xdim*ydim*zdim/2); int fat_pad = tmp_value; int link_pad = 3*tmp_value; // FIXME: currently assume staggered is SU(3) gaugeParam.type = dslash_type == QUDA_STAGGERED_DSLASH ? 
QUDA_SU3_LINKS : QUDA_ASQTAD_FAT_LINKS; gaugeParam.reconstruct = QUDA_RECONSTRUCT_NO; GaugeFieldParam cpuFatParam(fatlink, gaugeParam); cpuFat = new cpuGaugeField(cpuFatParam); ghost_fatlink = (void**)cpuFat->Ghost(); gaugeParam.type = QUDA_ASQTAD_LONG_LINKS; GaugeFieldParam cpuLongParam(longlink, gaugeParam); cpuLong = new cpuGaugeField(cpuLongParam); ghost_longlink = (void**)cpuLong->Ghost(); gaugeParam.type = dslash_type == QUDA_STAGGERED_DSLASH ? QUDA_SU3_LINKS : QUDA_ASQTAD_FAT_LINKS; gaugeParam.ga_pad = fat_pad; gaugeParam.reconstruct= gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; gaugeParam.cuda_prec_precondition = QUDA_HALF_PRECISION; loadGaugeQuda(fatlink, &gaugeParam); if (dslash_type == QUDA_ASQTAD_DSLASH) { gaugeParam.type = QUDA_ASQTAD_LONG_LINKS; gaugeParam.ga_pad = link_pad; gaugeParam.reconstruct= link_recon; gaugeParam.reconstruct_sloppy = link_recon_sloppy; loadGaugeQuda(longlink, &gaugeParam); } #else gaugeParam.type = QUDA_ASQTAD_FAT_LINKS; gaugeParam.reconstruct = gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; gaugeParam.cuda_prec_precondition = QUDA_HALF_PRECISION; loadGaugeQuda(fatlink, &gaugeParam); if (dslash_type == QUDA_ASQTAD_DSLASH) { gaugeParam.type = QUDA_ASQTAD_LONG_LINKS; gaugeParam.reconstruct = link_recon; gaugeParam.reconstruct_sloppy = link_recon_sloppy; loadGaugeQuda(longlink, &gaugeParam); } #endif double time0 = -((double)clock()); // Start the timer double nrm2=0; double src2=0; int ret = 0; switch(test_type){ case 0: //even if(inv_type == QUDA_GCR_INVERTER){ inv_param.inv_type = QUDA_GCR_INVERTER; inv_param.gcrNkrylov = 50; }else if(inv_type == QUDA_PCG_INVERTER){ inv_param.inv_type = QUDA_PCG_INVERTER; } inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; invertQuda(out->V(), in->V(), &inv_param); time0 += clock(); time0 /= CLOCKS_PER_SEC; #ifdef MULTI_GPU matdagmat_mg4dir(ref, qdp_fatlink, qdp_longlink, ghost_fatlink, ghost_longlink, out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, QUDA_EVEN_PARITY); 
#else matdagmat(ref->V(), qdp_fatlink, qdp_longlink, out->V(), mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), QUDA_EVEN_PARITY); #endif mxpy(in->V(), ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); nrm2 = norm_2(ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); src2 = norm_2(in->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); break; case 1: //odd if(inv_type == QUDA_GCR_INVERTER){ inv_param.inv_type = QUDA_GCR_INVERTER; inv_param.gcrNkrylov = 50; }else if(inv_type == QUDA_PCG_INVERTER){ inv_param.inv_type = QUDA_PCG_INVERTER; } inv_param.matpc_type = QUDA_MATPC_ODD_ODD; invertQuda(out->V(), in->V(), &inv_param); time0 += clock(); // stop the timer time0 /= CLOCKS_PER_SEC; #ifdef MULTI_GPU matdagmat_mg4dir(ref, qdp_fatlink, qdp_longlink, ghost_fatlink, ghost_longlink, out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, QUDA_ODD_PARITY); #else matdagmat(ref->V(), qdp_fatlink, qdp_longlink, out->V(), mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), QUDA_ODD_PARITY); #endif mxpy(in->V(), ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); nrm2 = norm_2(ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); src2 = norm_2(in->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); break; case 2: //full spinor errorQuda("full spinor not supported\n"); break; case 3: //multi mass CG, even case 4: #define NUM_OFFSETS 12 { double masses[NUM_OFFSETS] ={0.002, 0.0021, 0.0064, 0.070, 0.077, 0.081, 0.1, 0.11, 0.12, 0.13, 0.14, 0.205}; inv_param.num_offset = NUM_OFFSETS; // these can be set independently for (int i=0; i<inv_param.num_offset; i++) { inv_param.tol_offset[i] = inv_param.tol; inv_param.tol_hq_offset[i] = inv_param.tol_hq; } void* outArray[NUM_OFFSETS]; int len; cpuColorSpinorField* spinorOutArray[NUM_OFFSETS]; spinorOutArray[0] = out; for(int i=1;i < inv_param.num_offset; i++){ spinorOutArray[i] = new cpuColorSpinorField(csParam); } for(int i=0;i < inv_param.num_offset; i++){ outArray[i] = spinorOutArray[i]->V(); inv_param.offset[i] = 
4*masses[i]*masses[i]; } len=Vh; if (test_type == 3) { inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; } else { inv_param.matpc_type = QUDA_MATPC_ODD_ODD; } invertMultiShiftQuda(outArray, in->V(), &inv_param); cudaDeviceSynchronize(); time0 += clock(); // stop the timer time0 /= CLOCKS_PER_SEC; printfQuda("done: total time = %g secs, compute time = %g, %i iter / %g secs = %g gflops\n", time0, inv_param.secs, inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs); printfQuda("checking the solution\n"); QudaParity parity = QUDA_INVALID_PARITY; if (inv_param.solve_type == QUDA_NORMOP_SOLVE){ //parity = QUDA_EVENODD_PARITY; errorQuda("full parity not supported\n"); }else if (inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN){ parity = QUDA_EVEN_PARITY; }else if (inv_param.matpc_type == QUDA_MATPC_ODD_ODD){ parity = QUDA_ODD_PARITY; }else{ errorQuda("ERROR: invalid spinor parity \n"); exit(1); } for(int i=0;i < inv_param.num_offset;i++){ printfQuda("%dth solution: mass=%f, ", i, masses[i]); #ifdef MULTI_GPU matdagmat_mg4dir(ref, qdp_fatlink, qdp_longlink, ghost_fatlink, ghost_longlink, spinorOutArray[i], masses[i], 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, parity); #else matdagmat(ref->V(), qdp_fatlink, qdp_longlink, outArray[i], masses[i], 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), parity); #endif mxpy(in->V(), ref->V(), len*mySpinorSiteSize, inv_param.cpu_prec); double nrm2 = norm_2(ref->V(), len*mySpinorSiteSize, inv_param.cpu_prec); double src2 = norm_2(in->V(), len*mySpinorSiteSize, inv_param.cpu_prec); double hqr = sqrt(HeavyQuarkResidualNormCpu(*spinorOutArray[i], *ref).z); double l2r = sqrt(nrm2/src2); printfQuda("Shift %d residuals: (L2 relative) tol %g, QUDA = %g, host = %g; (heavy-quark) tol %g, QUDA = %g, host = %g\n", i, inv_param.tol_offset[i], inv_param.true_res_offset[i], l2r, inv_param.tol_hq_offset[i], inv_param.true_res_hq_offset[i], hqr); //emperical, if the cpu residue is more than 1 order the target accuracy, the it fails to 
converge if (sqrt(nrm2/src2) > 10*inv_param.tol_offset[i]){ ret |=1; } } for(int i=1; i < inv_param.num_offset;i++) delete spinorOutArray[i]; } break; default: errorQuda("Unsupported test type"); }//switch if (test_type <=2){ double hqr = sqrt(HeavyQuarkResidualNormCpu(*out, *ref).z); double l2r = sqrt(nrm2/src2); printfQuda("Residuals: (L2 relative) tol %g, QUDA = %g, host = %g; (heavy-quark) tol %g, QUDA = %g, host = %g\n", inv_param.tol, inv_param.true_res, l2r, inv_param.tol_hq, inv_param.true_res_hq, hqr); printfQuda("done: total time = %g secs, compute time = %g secs, %i iter / %g secs = %g gflops, \n", time0, inv_param.secs, inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs); } end(); return ret; }
int main(int argc, char **argv) { for (int i = 1; i < argc; i++){ if(process_command_line_option(argc, argv, &i) == 0){ continue; } printfQuda("ERROR: Invalid option:%s\n", argv[i]); usage(argv); } if (prec_sloppy == QUDA_INVALID_PRECISION){ prec_sloppy = prec; } if (link_recon_sloppy == QUDA_RECONSTRUCT_INVALID){ link_recon_sloppy = link_recon; } // initialize QMP/MPI, QUDA comms grid and RNG (test_util.cpp) initComms(argc, argv, gridsize_from_cmdline); display_test_info(); // *** QUDA parameters begin here. if (dslash_type != QUDA_WILSON_DSLASH && dslash_type != QUDA_CLOVER_WILSON_DSLASH && dslash_type != QUDA_TWISTED_MASS_DSLASH && dslash_type != QUDA_DOMAIN_WALL_4D_DSLASH && dslash_type != QUDA_MOBIUS_DWF_DSLASH && dslash_type != QUDA_TWISTED_CLOVER_DSLASH && dslash_type != QUDA_DOMAIN_WALL_DSLASH) { printfQuda("dslash_type %d not supported\n", dslash_type); exit(0); } QudaPrecision cpu_prec = QUDA_DOUBLE_PRECISION; QudaPrecision cuda_prec = prec; QudaPrecision cuda_prec_sloppy = prec_sloppy; QudaPrecision cuda_prec_precondition = QUDA_HALF_PRECISION; QudaGaugeParam gauge_param = newQudaGaugeParam(); QudaInvertParam inv_param = newQudaInvertParam(); double kappa5; gauge_param.X[0] = xdim; gauge_param.X[1] = ydim; gauge_param.X[2] = zdim; gauge_param.X[3] = tdim; inv_param.Ls = 1; gauge_param.anisotropy = 1.0; gauge_param.type = QUDA_WILSON_LINKS; gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER; gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T; gauge_param.cpu_prec = cpu_prec; gauge_param.cuda_prec = cuda_prec; gauge_param.reconstruct = link_recon; gauge_param.cuda_prec_sloppy = cuda_prec_sloppy; gauge_param.reconstruct_sloppy = link_recon_sloppy; gauge_param.cuda_prec_precondition = cuda_prec_precondition; gauge_param.reconstruct_precondition = link_recon_sloppy; gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO; inv_param.dslash_type = dslash_type; inv_param.mass = mass; inv_param.kappa = 1.0 / (2.0 * (1 + 3/gauge_param.anisotropy + mass)); if (dslash_type == 
QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { inv_param.mu = 0.12; inv_param.epsilon = 0.1385; inv_param.twist_flavor = twist_flavor; inv_param.Ls = (inv_param.twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) ? 2 : 1; } else if (dslash_type == QUDA_DOMAIN_WALL_DSLASH || dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH) { inv_param.m5 = -1.8; kappa5 = 0.5/(5 + inv_param.m5); inv_param.Ls = Lsdim; } else if (dslash_type == QUDA_MOBIUS_DWF_DSLASH) { inv_param.m5 = -1.8; kappa5 = 0.5/(5 + inv_param.m5); inv_param.Ls = Lsdim; for(int k = 0; k < Lsdim; k++) { // b5[k], c[k] values are chosen for arbitrary values, // but the difference of them are same as 1.0 inv_param.b_5[k] = 1.452; inv_param.c_5[k] = 0.452; } } // offsets used only by multi-shift solver inv_param.num_offset = 4; double offset[4] = {0.01, 0.02, 0.03, 0.04}; for (int i=0; i<inv_param.num_offset; i++) inv_param.offset[i] = offset[i]; inv_param.inv_type = inv_type; if (inv_param.dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) { inv_param.solution_type = QUDA_MAT_SOLUTION; } else { inv_param.solution_type = multishift ? QUDA_MATPCDAG_MATPC_SOLUTION : QUDA_MATPC_SOLUTION; } inv_param.matpc_type = matpc_type; inv_param.dagger = QUDA_DAG_NO; inv_param.mass_normalization = normalization; inv_param.solver_normalization = QUDA_DEFAULT_NORMALIZATION; if (dslash_type == QUDA_DOMAIN_WALL_DSLASH || dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH || dslash_type == QUDA_MOBIUS_DWF_DSLASH || dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH || multishift || inv_type == QUDA_CG_INVERTER) { inv_param.solve_type = QUDA_NORMOP_PC_SOLVE; } else { inv_param.solve_type = QUDA_DIRECT_PC_SOLVE; } inv_param.pipeline = 0; inv_param.Nsteps = 2; inv_param.gcrNkrylov = 10; inv_param.tol = 1e-7; inv_param.tol_restart = 1e-3; //now theoretical background for this parameter... 
#if __COMPUTE_CAPABILITY__ >= 200 // require both L2 relative and heavy quark residual to determine convergence inv_param.residual_type = static_cast<QudaResidualType>(QUDA_L2_RELATIVE_RESIDUAL | QUDA_HEAVY_QUARK_RESIDUAL); inv_param.tol_hq = 1e-3; // specify a tolerance for the residual for heavy quark residual #else // Pre Fermi architecture only supports L2 relative residual norm inv_param.residual_type = QUDA_L2_RELATIVE_RESIDUAL; #endif // these can be set individually for (int i=0; i<inv_param.num_offset; i++) { inv_param.tol_offset[i] = inv_param.tol; inv_param.tol_hq_offset[i] = inv_param.tol_hq; } inv_param.maxiter = 10000; inv_param.reliable_delta = 1e-1; inv_param.use_sloppy_partial_accumulator = 0; inv_param.max_res_increase = 1; // domain decomposition preconditioner parameters inv_param.inv_type_precondition = precon_type; inv_param.schwarz_type = QUDA_ADDITIVE_SCHWARZ; inv_param.precondition_cycle = 1; inv_param.tol_precondition = 1e-1; inv_param.maxiter_precondition = 10; inv_param.verbosity_precondition = QUDA_SILENT; inv_param.cuda_prec_precondition = cuda_prec_precondition; inv_param.omega = 1.0; inv_param.cpu_prec = cpu_prec; inv_param.cuda_prec = cuda_prec; inv_param.cuda_prec_sloppy = cuda_prec_sloppy; inv_param.preserve_source = QUDA_PRESERVE_SOURCE_NO; inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; inv_param.dirac_order = QUDA_DIRAC_ORDER; inv_param.input_location = QUDA_CPU_FIELD_LOCATION; inv_param.output_location = QUDA_CPU_FIELD_LOCATION; inv_param.tune = tune ? 
QUDA_TUNE_YES : QUDA_TUNE_NO;  // tail of a ternary assignment started above this window (autotuning on/off)

  // Padding: disabled for single-GPU runs (example values left in comments).
  gauge_param.ga_pad = 0; // 24*24*24/2;
  inv_param.sp_pad = 0; // 24*24*24/2;
  inv_param.cl_pad = 0; // 24*24*24/2;

  // For multi-GPU, ga_pad must be large enough to store a time-slice:
  // take the largest of the four half-checkerboard face sizes.
#ifdef MULTI_GPU
  int x_face_size = gauge_param.X[1]*gauge_param.X[2]*gauge_param.X[3]/2;
  int y_face_size = gauge_param.X[0]*gauge_param.X[2]*gauge_param.X[3]/2;
  int z_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[3]/2;
  int t_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[2]/2;
  int pad_size = MAX(x_face_size, y_face_size);
  pad_size = MAX(pad_size, z_face_size);
  pad_size = MAX(pad_size, t_face_size);
  gauge_param.ga_pad = pad_size;
#endif

  // Clover-specific precisions and layout; clover_coeff = csw * kappa with csw = 1.5.
  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
    inv_param.clover_cpu_prec = cpu_prec;
    inv_param.clover_cuda_prec = cuda_prec;
    inv_param.clover_cuda_prec_sloppy = cuda_prec_sloppy;
    inv_param.clover_cuda_prec_precondition = cuda_prec_precondition;
    inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER;
    inv_param.clover_coeff = 1.5*inv_param.kappa;
  }

  inv_param.verbosity = QUDA_VERBOSE;

  // *** Everything between here and the call to initQuda() is
  // *** application-specific.

  // Set parameters for the reference Dslash, and prepare fields to be loaded.
  // Domain-wall variants carry an extra fifth dimension of extent Ls.
  if (dslash_type == QUDA_DOMAIN_WALL_DSLASH ||
      dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH ||
      dslash_type == QUDA_MOBIUS_DWF_DSLASH) {
    dw_setDims(gauge_param.X, inv_param.Ls);
  } else {
    setDims(gauge_param.X);
  }

  setSpinorSiteSize(24);  // 4 spins x 3 colors x 2 (complex) = 24 reals per site

  // Host element sizes derived from the requested CPU precisions.
  size_t gSize = (gauge_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
  size_t sSize = (inv_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);

  // Host gauge field: one buffer per direction. clover/clover_inv only used
  // for the clover dslash below.
  void *gauge[4], *clover_inv=0, *clover=0;

  for (int dir = 0; dir < 4; dir++) {
    gauge[dir] = malloc(V*gaugeSiteSize*gSize);
  }

  if (strcmp(latfile,"")) {  // load in the command line supplied gauge field
    read_gauge_field(latfile, gauge, gauge_param.cpu_prec, gauge_param.X, argc, argv);
    construct_gauge_field(gauge, 2, gauge_param.cpu_prec, &gauge_param);
  } else { // else generate a random SU(3) field
    construct_gauge_field(gauge, 1, gauge_param.cpu_prec, &gauge_param);
  }

  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
    double norm = 0.0; // clover components are random numbers in the range (-norm, norm)
    double diag = 1.0; // constant added to the diagonal

    size_t cSize = (inv_param.clover_cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
    clover_inv = malloc(V*cloverSiteSize*cSize);
    construct_clover_field(clover_inv, norm, diag, inv_param.clover_cpu_prec);

    // The uninverted clover term is only needed when solving the unpreconditioned
    // system or when using "asymmetric" even/odd preconditioning.
    int preconditioned = (inv_param.solve_type == QUDA_DIRECT_PC_SOLVE ||
                          inv_param.solve_type == QUDA_NORMOP_PC_SOLVE);
    int asymmetric = preconditioned &&
      (inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC ||
       inv_param.matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC);
    if (!preconditioned) {
      clover = clover_inv;   // unpreconditioned solve: pass term as "clover"
      clover_inv = NULL;
    } else if (asymmetric) { // fake it by using the same random matrix
      clover = clover_inv;   // for both clover and clover_inv
    } else {
      clover = NULL;         // symmetric preconditioning: only the inverse is needed
    }
  }

  // Host spinors; all buffers are sized for the full 5-d volume (Ls = 1 for 4-d actions).
  void *spinorIn = malloc(V*spinorSiteSize*sSize*inv_param.Ls);
  void *spinorCheck = malloc(V*spinorSiteSize*sSize*inv_param.Ls);

  void *spinorOut = NULL, **spinorOutMulti = NULL;
  if (multishift) {
    // one solution vector per shift
    spinorOutMulti = (void**)malloc(inv_param.num_offset*sizeof(void *));
    for (int i=0; i<inv_param.num_offset; i++) {
      spinorOutMulti[i] = malloc(V*spinorSiteSize*sSize*inv_param.Ls);
    }
  } else {
    spinorOut = malloc(V*spinorSiteSize*sSize*inv_param.Ls);
  }

  memset(spinorIn, 0, inv_param.Ls*V*spinorSiteSize*sSize);
  memset(spinorCheck, 0, inv_param.Ls*V*spinorSiteSize*sSize);
  if (multishift) {
    for (int i=0; i<inv_param.num_offset; i++) memset(spinorOutMulti[i], 0, inv_param.Ls*V*spinorSiteSize*sSize);
  } else {
    memset(spinorOut, 0, inv_param.Ls*V*spinorSiteSize*sSize);
  }

  // create a point source at 0 (in each subvolume... FIXME)
  // NOTE(review): despite the comment above, the code below actually fills the
  // source with uniform random numbers; the point-source lines are commented out.
  if (inv_param.cpu_prec == QUDA_SINGLE_PRECISION) {
    //((float*)spinorIn)[0] = 1.0;
    for (int i=0; i<inv_param.Ls*V*spinorSiteSize; i++) ((float*)spinorIn)[i] = rand() / (float)RAND_MAX;
  } else {
    //((double*)spinorIn)[0] = 1.0;
    for (int i=0; i<inv_param.Ls*V*spinorSiteSize; i++) ((double*)spinorIn)[i] = rand() / (double)RAND_MAX;
  }

  // start the timer (wall time measured as -start + end via clock())
  double time0 = -((double)clock());

  // initialize the QUDA library
  initQuda(device);

  // load the gauge field
  loadGaugeQuda((void*)gauge, &gauge_param);

  // load the clover term, if desired; for twisted-clover the terms are
  // computed internally, hence the NULL pointers
  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) loadCloverQuda(clover, clover_inv, &inv_param);
  if (dslash_type == QUDA_TWISTED_CLOVER_DSLASH) loadCloverQuda(NULL, NULL, &inv_param);

  // perform the inversion
  if (multishift) {
    invertMultiShiftQuda(spinorOutMulti, spinorIn, &inv_param);
  } else {
    invertQuda(spinorOut, spinorIn, &inv_param);
  }

  // stop the timer
  time0 += clock();
  time0 /= CLOCKS_PER_SEC;

  printfQuda("Device memory used:\n   Spinor: %f GiB\n    Gauge: %f GiB\n",
	     inv_param.spinorGiB, gauge_param.gaugeGiB);
  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
    printfQuda("   Clover: %f GiB\n", inv_param.cloverGiB);
  }
  printfQuda("\nDone: %i iter / %g secs = %g Gflops, total time = %g secs\n",
	     inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs, time0);

  if (multishift) {
    // Host-side verification for the multi-shift solve: apply the shifted
    // normal operator M^dag M + offset[i] to each solution and compare to the source.
    if (inv_param.mass_normalization == QUDA_MASS_NORMALIZATION) {
      errorQuda("Mass normalization not supported for multi-shift solver in invert_test");
    }

    void *spinorTmp = malloc(V*spinorSiteSize*sSize*inv_param.Ls);

    printfQuda("Host residuum checks: \n");
    for(int i=0; i < inv_param.num_offset; i++) {
      // NOTE(review): spinorCheck is zeroed over V sites here, but the
      // axpy/mxpy/norm_2 calls below operate on Vh (the preconditioned
      // half-volume) — the extra zeroing looks harmless but asymmetric; verify.
      ax(0, spinorCheck, V*spinorSiteSize, inv_param.cpu_prec);

      if (dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
        if (inv_param.twist_flavor != QUDA_TWIST_MINUS && inv_param.twist_flavor != QUDA_TWIST_PLUS)
          errorQuda("Twisted mass solution type not supported");
        // apply matpc then matpc^dag (dagger flag 0 then 1) to form M^dag M x
        tm_matpc(spinorTmp, gauge, spinorOutMulti[i], inv_param.kappa, inv_param.mu,
                 inv_param.twist_flavor, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param);
        tm_matpc(spinorCheck, gauge, spinorTmp, inv_param.kappa, inv_param.mu,
                 inv_param.twist_flavor, inv_param.matpc_type, 1, inv_param.cpu_prec, gauge_param);
      } else if (dslash_type == QUDA_WILSON_DSLASH || dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
        wil_matpc(spinorTmp, gauge, spinorOutMulti[i], inv_param.kappa,
                  inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param);
        wil_matpc(spinorCheck, gauge, spinorTmp, inv_param.kappa,
                  inv_param.matpc_type, 1, inv_param.cpu_prec, gauge_param);
      } else {
        printfQuda("Domain wall not supported for multi-shift\n");
        exit(-1);
      }

      // spinorCheck = (M^dag M + offset[i]) x_i - b, then relative L2 residual
      axpy(inv_param.offset[i], spinorOutMulti[i], spinorCheck, Vh*spinorSiteSize, inv_param.cpu_prec);
      mxpy(spinorIn, spinorCheck, Vh*spinorSiteSize, inv_param.cpu_prec);
      double nrm2 = norm_2(spinorCheck, Vh*spinorSiteSize, inv_param.cpu_prec);
      double src2 = norm_2(spinorIn, Vh*spinorSiteSize, inv_param.cpu_prec);
      double l2r = sqrt(nrm2 / src2);

      printfQuda("Shift %d residuals: (L2 relative) tol %g, QUDA = %g, host = %g; (heavy-quark) tol %g, QUDA = %g\n",
		 i, inv_param.tol_offset[i], inv_param.true_res_offset[i], l2r,
		 inv_param.tol_hq_offset[i], inv_param.true_res_hq_offset[i]);
    }
    free(spinorTmp);

  } else {
    // Single-shift verification: re-apply the operator that was solved
    // (full matrix, even/odd-preconditioned matrix, or its normal form)
    // to the solution and compare against the original source.
    if (inv_param.solution_type == QUDA_MAT_SOLUTION) {

      if (dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
        if(inv_param.twist_flavor == QUDA_TWIST_PLUS || inv_param.twist_flavor == QUDA_TWIST_MINUS)
          tm_mat(spinorCheck, gauge, spinorOut, inv_param.kappa, inv_param.mu,
                 inv_param.twist_flavor, 0, inv_param.cpu_prec, gauge_param);
        else {
          // Non-degenerate doublet: the two flavors are stored back-to-back,
          // second flavor offset by one full spinor volume.
          int tm_offset = V*spinorSiteSize; //12*spinorRef->Volume();
          void *evenOut = spinorCheck;
          // NOTE(review): this compares the precision enum against sizeof(double);
          // it relies on the enum's numeric value equaling the byte size — confirm
          // against the QudaPrecision definition.
          void *oddOut = cpu_prec == sizeof(double) ?
            (void*)((double*)evenOut + tm_offset): (void*)((float*)evenOut + tm_offset);
          void *evenIn = spinorOut;
          void *oddIn = cpu_prec == sizeof(double) ?
            (void*)((double*)evenIn + tm_offset): (void*)((float*)evenIn + tm_offset);
          tm_ndeg_mat(evenOut, oddOut, gauge, evenIn, oddIn, inv_param.kappa, inv_param.mu,
                      inv_param.epsilon, 0, inv_param.cpu_prec, gauge_param);
        }
      } else if (dslash_type == QUDA_WILSON_DSLASH || dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
        wil_mat(spinorCheck, gauge, spinorOut, inv_param.kappa, 0, inv_param.cpu_prec, gauge_param);
      } else if (dslash_type == QUDA_DOMAIN_WALL_DSLASH) {
        dw_mat(spinorCheck, gauge, spinorOut, kappa5, inv_param.dagger, inv_param.cpu_prec, gauge_param, inv_param.mass);
      // } else if (dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH) {
      //   dw_4d_mat(spinorCheck, gauge, spinorOut, kappa5, inv_param.dagger, inv_param.cpu_prec, gauge_param, inv_param.mass);
      // } else if (dslash_type == QUDA_MOBIUS_DWF_DSLASH) {
      //   mdw_mat(spinorCheck, gauge, spinorOut, kappa5, inv_param.dagger, inv_param.cpu_prec, gauge_param, inv_param.mass);
      } else {
        printfQuda("Unsupported dslash_type\n");
        exit(-1);
      }

      // Undo the solver's mass normalization so the check matches the source.
      if (inv_param.mass_normalization == QUDA_MASS_NORMALIZATION) {
        if (dslash_type == QUDA_DOMAIN_WALL_DSLASH ||
            dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH ||
            dslash_type == QUDA_MOBIUS_DWF_DSLASH) {
          ax(0.5/kappa5, spinorCheck, V*spinorSiteSize*inv_param.Ls, inv_param.cpu_prec);
        } else if (dslash_type == QUDA_TWISTED_MASS_DSLASH && twist_flavor == QUDA_TWIST_NONDEG_DOUBLET) {
          ax(0.5/inv_param.kappa, spinorCheck, 2*V*spinorSiteSize, inv_param.cpu_prec);
        } else {
          ax(0.5/inv_param.kappa, spinorCheck, V*spinorSiteSize, inv_param.cpu_prec);
        }
      }

    } else if(inv_param.solution_type == QUDA_MATPC_SOLUTION) {

      if (dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
        if (inv_param.twist_flavor != QUDA_TWIST_MINUS && inv_param.twist_flavor != QUDA_TWIST_PLUS)
          errorQuda("Twisted mass solution type not supported");
        tm_matpc(spinorCheck, gauge, spinorOut, inv_param.kappa, inv_param.mu,
                 inv_param.twist_flavor, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param);
      } else if (dslash_type == QUDA_WILSON_DSLASH || dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
        wil_matpc(spinorCheck, gauge, spinorOut, inv_param.kappa,
                  inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param);
      } else if (dslash_type == QUDA_DOMAIN_WALL_DSLASH) {
        dw_matpc(spinorCheck, gauge, spinorOut, kappa5, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param, inv_param.mass);
      } else if (dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH) {
        dw_4d_matpc(spinorCheck, gauge, spinorOut, kappa5, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param, inv_param.mass);
      } else if (dslash_type == QUDA_MOBIUS_DWF_DSLASH) {
        // Mobius: per-slice kappa_b/kappa_c derived from the b_5/c_5 coefficients
        double *kappa_b, *kappa_c;
        kappa_b = (double*)malloc(Lsdim*sizeof(double));
        kappa_c = (double*)malloc(Lsdim*sizeof(double));
        for(int xs = 0 ; xs < Lsdim ; xs++)
        {
          kappa_b[xs] = 1.0/(2*(inv_param.b_5[xs]*(4.0 + inv_param.m5) + 1.0));
          kappa_c[xs] = 1.0/(2*(inv_param.c_5[xs]*(4.0 + inv_param.m5) - 1.0));
        }
        mdw_matpc(spinorCheck, gauge, spinorOut, kappa_b, kappa_c, inv_param.matpc_type, 0,
                  inv_param.cpu_prec, gauge_param, inv_param.mass, inv_param.b_5, inv_param.c_5);
        free(kappa_b);
        free(kappa_c);
      } else {
        printfQuda("Unsupported dslash_type\n");
        exit(-1);
      }

      // Preconditioned operator contains kappa^2, hence the 0.25/kappa^2 rescale.
      if (inv_param.mass_normalization == QUDA_MASS_NORMALIZATION) {
        if (dslash_type == QUDA_DOMAIN_WALL_DSLASH ||
            dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH ||
            dslash_type == QUDA_MOBIUS_DWF_DSLASH) {
          ax(0.25/(kappa5*kappa5), spinorCheck, V*spinorSiteSize*inv_param.Ls, inv_param.cpu_prec);
        } else {
          ax(0.25/(inv_param.kappa*inv_param.kappa), spinorCheck, Vh*spinorSiteSize, inv_param.cpu_prec);
        }
      }

    } else if (inv_param.solution_type == QUDA_MATPCDAG_MATPC_SOLUTION) {

      // Normal-equations solution: verify by applying matpc then matpc^dag.
      void *spinorTmp = malloc(V*spinorSiteSize*sSize*inv_param.Ls);
      ax(0, spinorCheck, V*spinorSiteSize, inv_param.cpu_prec);

      if (dslash_type == QUDA_TWISTED_MASS_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) {
        if (inv_param.twist_flavor != QUDA_TWIST_MINUS && inv_param.twist_flavor != QUDA_TWIST_PLUS)
          errorQuda("Twisted mass solution type not supported");
        tm_matpc(spinorTmp, gauge, spinorOut, inv_param.kappa, inv_param.mu,
                 inv_param.twist_flavor, inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param);
        tm_matpc(spinorCheck, gauge, spinorTmp, inv_param.kappa, inv_param.mu,
                 inv_param.twist_flavor, inv_param.matpc_type, 1, inv_param.cpu_prec, gauge_param);
      } else if (dslash_type == QUDA_WILSON_DSLASH || dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
        wil_matpc(spinorTmp, gauge, spinorOut, inv_param.kappa,
                  inv_param.matpc_type, 0, inv_param.cpu_prec, gauge_param);
        wil_matpc(spinorCheck, gauge, spinorTmp, inv_param.kappa,
                  inv_param.matpc_type, 1, inv_param.cpu_prec, gauge_param);
      } else {
        printfQuda("Unsupported dslash_type\n");
        exit(-1);
      }

      if (inv_param.mass_normalization == QUDA_MASS_NORMALIZATION) {
        errorQuda("Mass normalization not implemented");
      }

      free(spinorTmp);
    }

    // Relative L2 residual over the volume the operator acted on:
    // full lattice (V) for MAT, half lattice (Vh) for preconditioned solutions.
    int vol = inv_param.solution_type == QUDA_MAT_SOLUTION ? V : Vh;
    mxpy(spinorIn, spinorCheck, vol*spinorSiteSize*inv_param.Ls, inv_param.cpu_prec);
    double nrm2 = norm_2(spinorCheck, vol*spinorSiteSize*inv_param.Ls, inv_param.cpu_prec);
    double src2 = norm_2(spinorIn, vol*spinorSiteSize*inv_param.Ls, inv_param.cpu_prec);
    double l2r = sqrt(nrm2 / src2);

    printfQuda("Residuals: (L2 relative) tol %g, QUDA = %g, host = %g; (heavy-quark) tol %g, QUDA = %g\n",
	       inv_param.tol, inv_param.true_res, l2r, inv_param.tol_hq, inv_param.true_res_hq);
  }

  // Release device-side resources.
  // NOTE(review): the host buffers (gauge[], spinorIn, spinorCheck, spinorOut /
  // spinorOutMulti, clover/clover_inv) are never freed — acceptable at program
  // exit, but worth noting.
  freeGaugeQuda();
  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_CLOVER_DSLASH) freeCloverQuda();

  // finalize the QUDA library
  endQuda();

  finalizeComms();

  return 0;
}