int invert_test(void) { QudaGaugeParam gaugeParam = newQudaGaugeParam(); QudaInvertParam inv_param = newQudaInvertParam(); double mass = 0.1; set_params(&gaugeParam, &inv_param, xdim, ydim, zdim, tdim, cpu_prec, prec, prec_sloppy, link_recon, link_recon_sloppy, mass, tol, 500, 1e-3, 0.8); // this must be before the FaceBuffer is created (this is because it allocates pinned memory - FIXME) initQuda(device); setDims(gaugeParam.X); setDimConstants(gaugeParam.X); size_t gSize = (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float); for (int dir = 0; dir < 4; dir++) { fatlink[dir] = malloc(V*gaugeSiteSize*gSize); longlink[dir] = malloc(V*gaugeSiteSize*gSize); } construct_fat_long_gauge_field(fatlink, longlink, 1, gaugeParam.cpu_prec, &gaugeParam); for (int dir = 0; dir < 4; dir++) { for(int i = 0;i < V*gaugeSiteSize;i++){ if (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION){ ((double*)fatlink[dir])[i] = 0.5 *rand()/RAND_MAX; }else{ ((float*)fatlink[dir])[i] = 0.5* rand()/RAND_MAX; } } } ColorSpinorParam csParam; csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION; csParam.nColor=3; csParam.nSpin=1; csParam.nDim=4; for(int d = 0; d < 4; d++) { csParam.x[d] = gaugeParam.X[d]; } csParam.x[0] /= 2; csParam.precision = inv_param.cpu_prec; csParam.pad = 0; csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; csParam.gammaBasis = inv_param.gamma_basis; csParam.create = QUDA_ZERO_FIELD_CREATE; in = new cpuColorSpinorField(csParam); out = new cpuColorSpinorField(csParam); ref = new cpuColorSpinorField(csParam); tmp = new cpuColorSpinorField(csParam); if (inv_param.cpu_prec == QUDA_SINGLE_PRECISION){ constructSpinorField((float*)in->V()); }else{ constructSpinorField((double*)in->V()); } int tmp_value = MAX(ydim*zdim*tdim/2, xdim*zdim*tdim/2); tmp_value = MAX(tmp_value, xdim*ydim*tdim/2); tmp_value = MAX(tmp_value, xdim*ydim*zdim/2); int fat_pad = tmp_value; int link_pad = 3*tmp_value; #ifdef MULTI_GPU gaugeParam.type = QUDA_ASQTAD_FAT_LINKS; gaugeParam.reconstruct = QUDA_RECONSTRUCT_NO; GaugeFieldParam cpuFatParam(fatlink, gaugeParam); cpuFat = new cpuGaugeField(cpuFatParam); cpuFat->exchangeGhost(); ghost_fatlink = (void**)cpuFat->Ghost(); gaugeParam.type = QUDA_ASQTAD_LONG_LINKS; GaugeFieldParam cpuLongParam(longlink, gaugeParam); cpuLong = new cpuGaugeField(cpuLongParam); cpuLong->exchangeGhost(); ghost_longlink = (void**)cpuLong->Ghost(); #endif if(testtype == 6){ record_gauge(gaugeParam.X, fatlink, fat_pad, longlink, link_pad, link_recon, link_recon_sloppy, &gaugeParam); }else{ #ifdef MULTI_GPU gaugeParam.type = QUDA_ASQTAD_FAT_LINKS; gaugeParam.ga_pad = fat_pad; gaugeParam.reconstruct= gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; loadGaugeQuda(fatlink, &gaugeParam); gaugeParam.type = QUDA_ASQTAD_LONG_LINKS; gaugeParam.ga_pad = link_pad; gaugeParam.reconstruct= link_recon; gaugeParam.reconstruct_sloppy = link_recon_sloppy; loadGaugeQuda(longlink, &gaugeParam); #else gaugeParam.type = QUDA_ASQTAD_FAT_LINKS; gaugeParam.reconstruct = gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; loadGaugeQuda(fatlink, &gaugeParam); gaugeParam.type = QUDA_ASQTAD_LONG_LINKS; gaugeParam.reconstruct = link_recon; gaugeParam.reconstruct_sloppy = link_recon_sloppy; loadGaugeQuda(longlink, &gaugeParam); #endif } double time0 = -((double)clock()); // Start the timer unsigned long volume = Vh; unsigned long nflops=2*1187; //from MILC's CG routine double nrm2=0; double src2=0; int ret = 0; switch(testtype){ case 0: //even volume = Vh; inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; invertQuda(out->V(), in->V(), &inv_param); time0 += clock(); time0 /= CLOCKS_PER_SEC; #ifdef MULTI_GPU matdagmat_mg4dir(ref, fatlink, longlink, ghost_fatlink, ghost_longlink, out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, QUDA_EVEN_PARITY); #else matdagmat(ref->V(), fatlink, longlink, out->V(), mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), QUDA_EVEN_PARITY); #endif mxpy(in->V(), ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); nrm2 = norm_2(ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); src2 = norm_2(in->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); { double sol = norm_2(out->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); double refe = norm_2(ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); } break; case 1: //odd volume = Vh; inv_param.matpc_type = QUDA_MATPC_ODD_ODD; invertQuda(out->V(), in->V(), &inv_param); time0 += clock(); // stop the timer time0 /= CLOCKS_PER_SEC; #ifdef MULTI_GPU matdagmat_mg4dir(ref, fatlink, longlink, ghost_fatlink, ghost_longlink, out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, QUDA_ODD_PARITY); #else matdagmat(ref->V(), fatlink, longlink, out->V(), mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), QUDA_ODD_PARITY); #endif mxpy(in->V(), ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); nrm2 = norm_2(ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); src2 = norm_2(in->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); break; case 2: //full spinor errorQuda("full spinor not supported\n"); break; case 3: //multi mass CG, even case 4: case 5: case 6: #define NUM_OFFSETS 7 nflops = 2*(1205 + 15* NUM_OFFSETS); //from MILC's multimass CG routine double masses[NUM_OFFSETS] ={5.05, 1.23, 2.64, 2.33, 2.70, 2.77, 2.81}; double offsets[NUM_OFFSETS]; int num_offsets =NUM_OFFSETS; void* outArray[NUM_OFFSETS]; int len; cpuColorSpinorField* spinorOutArray[NUM_OFFSETS]; spinorOutArray[0] = out; for(int i=1;i < num_offsets; i++){ spinorOutArray[i] = new cpuColorSpinorField(csParam); } for(int i=0;i < num_offsets; i++){ outArray[i] = spinorOutArray[i]->V(); } for (int i=0; i< num_offsets;i++){ offsets[i] = 4*masses[i]*masses[i]; } len=Vh; volume = Vh; if (testtype == 3 || testtype == 6){ inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; } else if (testtype == 4){ inv_param.matpc_type = QUDA_MATPC_ODD_ODD; }else { //testtype ==5 errorQuda("test 5 not supported\n"); } double residue_sq; if (testtype == 6){ invertMultiShiftQudaMixed(outArray, in->V(), &inv_param, offsets, num_offsets, &residue_sq); }else{ invertMultiShiftQuda(outArray, in->V(), &inv_param, offsets, num_offsets, &residue_sq); } cudaThreadSynchronize(); printfQuda("Final residue squred =%g\n", residue_sq); time0 += clock(); // stop the timer time0 /= CLOCKS_PER_SEC; printfQuda("done: total time = %g secs, %i iter / %g secs = %g gflops, \n", time0, inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs); printfQuda("checking the solution\n"); QudaParity parity; if (inv_param.solve_type == QUDA_NORMEQ_SOLVE){ //parity = QUDA_EVENODD_PARITY; errorQuda("full parity not supported\n"); }else if (inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN){ parity = QUDA_EVEN_PARITY; }else if (inv_param.matpc_type == QUDA_MATPC_ODD_ODD){ parity = QUDA_ODD_PARITY; }else{ errorQuda("ERROR: invalid spinor parity \n"); exit(1); } for(int i=0;i < num_offsets;i++){ printfQuda("%dth solution: mass=%f, ", i, masses[i]); #ifdef MULTI_GPU matdagmat_mg4dir(ref, fatlink, longlink, ghost_fatlink, ghost_longlink, spinorOutArray[i], masses[i], 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, parity); #else matdagmat(ref->V(), fatlink, longlink, outArray[i], masses[i], 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), parity); #endif mxpy(in->V(), ref->V(), len*mySpinorSiteSize, inv_param.cpu_prec); double nrm2 = norm_2(ref->V(), len*mySpinorSiteSize, inv_param.cpu_prec); double src2 = norm_2(in->V(), len*mySpinorSiteSize, inv_param.cpu_prec); printfQuda("relative residual, requested = %g, actual = %g\n", inv_param.tol, sqrt(nrm2/src2)); //emperical, if the cpu residue is more than 2 order the target accuracy, the it fails to converge if (sqrt(nrm2/src2) > 100*inv_param.tol){ ret |=1; } } if (ret ==1){ errorQuda("Converge failed!\n"); } for(int i=1; i < num_offsets;i++){ delete spinorOutArray[i]; } }//switch if (testtype <=2){ printfQuda("Relative residual, requested = %g, actual = %g\n", inv_param.tol, sqrt(nrm2/src2)); printfQuda("done: total time = %g secs, %i iter / %g secs = %g gflops, \n", time0, inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs); //emperical, if the cpu residue is more than 2 order the target accuracy, the it fails to converge if (sqrt(nrm2/src2) > 100*inv_param.tol){ ret = 1; errorQuda("Convergence failed!\n"); } } end(); return ret; }
int invert_test(void) { QudaGaugeParam gaugeParam = newQudaGaugeParam(); QudaInvertParam inv_param = newQudaInvertParam(); set_params(&gaugeParam, &inv_param, xdim, ydim, zdim, tdim, cpu_prec, prec, prec_sloppy, link_recon, link_recon_sloppy, mass, tol, 500, 1e-3, 0.8); // this must be before the FaceBuffer is created (this is because it allocates pinned memory - FIXME) initQuda(device); setDims(gaugeParam.X); setSpinorSiteSize(6); size_t gSize = (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float); for (int dir = 0; dir < 4; dir++) { qdp_fatlink[dir] = malloc(V*gaugeSiteSize*gSize); qdp_longlink[dir] = malloc(V*gaugeSiteSize*gSize); } fatlink = malloc(4*V*gaugeSiteSize*gSize); longlink = malloc(4*V*gaugeSiteSize*gSize); construct_fat_long_gauge_field(qdp_fatlink, qdp_longlink, 1, gaugeParam.cpu_prec, &gaugeParam, dslash_type); const double cos_pi_3 = 0.5; // Cos(pi/3) const double sin_pi_3 = sqrt(0.75); // Sin(pi/3) for(int dir=0; dir<4; ++dir){ for(int i=0; i<V; ++i){ for(int j=0; j<gaugeSiteSize; ++j){ if(gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION){ ((double*)qdp_fatlink[dir])[i*gaugeSiteSize + j] = 0.5*rand()/RAND_MAX; if(link_recon != QUDA_RECONSTRUCT_8 && link_recon != QUDA_RECONSTRUCT_12){ // incorporate non-trivial phase into long links if(j%2 == 0){ const double real = ((double*)qdp_longlink[dir])[i*gaugeSiteSize + j]; const double imag = ((double*)qdp_longlink[dir])[i*gaugeSiteSize + j + 1]; ((double*)qdp_longlink[dir])[i*gaugeSiteSize + j] = real*cos_pi_3 - imag*sin_pi_3; ((double*)qdp_longlink[dir])[i*gaugeSiteSize + j + 1] = real*sin_pi_3 + imag*cos_pi_3; } } ((double*)fatlink)[(i*4 + dir)*gaugeSiteSize + j] = ((double*)qdp_fatlink[dir])[i*gaugeSiteSize + j]; ((double*)longlink)[(i*4 + dir)*gaugeSiteSize + j] = ((double*)qdp_longlink[dir])[i*gaugeSiteSize + j]; }else{ ((float*)qdp_fatlink[dir])[i] = 0.5*rand()/RAND_MAX; if(link_recon != QUDA_RECONSTRUCT_8 && link_recon != QUDA_RECONSTRUCT_12){ // incorporate non-trivial phase into long links if(j%2 == 0){ const float real = ((float*)qdp_longlink[dir])[i*gaugeSiteSize + j]; const float imag = ((float*)qdp_longlink[dir])[i*gaugeSiteSize + j + 1]; ((float*)qdp_longlink[dir])[i*gaugeSiteSize + j] = real*cos_pi_3 - imag*sin_pi_3; ((float*)qdp_longlink[dir])[i*gaugeSiteSize + j + 1] = real*sin_pi_3 + imag*cos_pi_3; } } ((double*)fatlink)[(i*4 + dir)*gaugeSiteSize + j] = ((double*)qdp_fatlink[dir])[i*gaugeSiteSize + j]; ((float*)fatlink)[(i*4 + dir)*gaugeSiteSize + j] = ((float*)qdp_fatlink[dir])[i*gaugeSiteSize + j]; ((float*)longlink)[(i*4 + dir)*gaugeSiteSize + j] = ((float*)qdp_longlink[dir])[i*gaugeSiteSize + j]; } } } } ColorSpinorParam csParam; csParam.nColor=3; csParam.nSpin=1; csParam.nDim=4; for(int d = 0; d < 4; d++) { csParam.x[d] = gaugeParam.X[d]; } csParam.x[0] /= 2; csParam.precision = inv_param.cpu_prec; csParam.pad = 0; csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; csParam.gammaBasis = inv_param.gamma_basis; csParam.create = QUDA_ZERO_FIELD_CREATE; in = new cpuColorSpinorField(csParam); out = new cpuColorSpinorField(csParam); ref = new cpuColorSpinorField(csParam); tmp = new cpuColorSpinorField(csParam); if (inv_param.cpu_prec == QUDA_SINGLE_PRECISION){ constructSpinorField((float*)in->V()); }else{ constructSpinorField((double*)in->V()); } #ifdef MULTI_GPU int tmp_value = MAX(ydim*zdim*tdim/2, xdim*zdim*tdim/2); tmp_value = MAX(tmp_value, xdim*ydim*tdim/2); tmp_value = MAX(tmp_value, xdim*ydim*zdim/2); int fat_pad = tmp_value; int link_pad = 3*tmp_value; // FIXME: currently assume staggered is SU(3) gaugeParam.type = dslash_type == QUDA_STAGGERED_DSLASH ? QUDA_SU3_LINKS : QUDA_ASQTAD_FAT_LINKS; gaugeParam.reconstruct = QUDA_RECONSTRUCT_NO; GaugeFieldParam cpuFatParam(fatlink, gaugeParam); cpuFat = new cpuGaugeField(cpuFatParam); ghost_fatlink = (void**)cpuFat->Ghost(); gaugeParam.type = QUDA_ASQTAD_LONG_LINKS; GaugeFieldParam cpuLongParam(longlink, gaugeParam); cpuLong = new cpuGaugeField(cpuLongParam); ghost_longlink = (void**)cpuLong->Ghost(); gaugeParam.type = dslash_type == QUDA_STAGGERED_DSLASH ? QUDA_SU3_LINKS : QUDA_ASQTAD_FAT_LINKS; gaugeParam.ga_pad = fat_pad; gaugeParam.reconstruct= gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; gaugeParam.cuda_prec_precondition = QUDA_HALF_PRECISION; loadGaugeQuda(fatlink, &gaugeParam); if (dslash_type == QUDA_ASQTAD_DSLASH) { gaugeParam.type = QUDA_ASQTAD_LONG_LINKS; gaugeParam.ga_pad = link_pad; gaugeParam.reconstruct= link_recon; gaugeParam.reconstruct_sloppy = link_recon_sloppy; loadGaugeQuda(longlink, &gaugeParam); } #else gaugeParam.type = QUDA_ASQTAD_FAT_LINKS; gaugeParam.reconstruct = gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; gaugeParam.cuda_prec_precondition = QUDA_HALF_PRECISION; loadGaugeQuda(fatlink, &gaugeParam); if (dslash_type == QUDA_ASQTAD_DSLASH) { gaugeParam.type = QUDA_ASQTAD_LONG_LINKS; gaugeParam.reconstruct = link_recon; gaugeParam.reconstruct_sloppy = link_recon_sloppy; loadGaugeQuda(longlink, &gaugeParam); } #endif double time0 = -((double)clock()); // Start the timer double nrm2=0; double src2=0; int ret = 0; switch(test_type){ case 0: //even if(inv_type == QUDA_GCR_INVERTER){ inv_param.inv_type = QUDA_GCR_INVERTER; inv_param.gcrNkrylov = 50; }else if(inv_type == QUDA_PCG_INVERTER){ inv_param.inv_type = QUDA_PCG_INVERTER; } inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; invertQuda(out->V(), in->V(), &inv_param); time0 += clock(); time0 /= CLOCKS_PER_SEC; #ifdef MULTI_GPU matdagmat_mg4dir(ref, qdp_fatlink, qdp_longlink, ghost_fatlink, ghost_longlink, out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, QUDA_EVEN_PARITY); #else matdagmat(ref->V(), qdp_fatlink, qdp_longlink, out->V(), mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), QUDA_EVEN_PARITY); #endif mxpy(in->V(), ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); nrm2 = norm_2(ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); src2 = norm_2(in->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); break; case 1: //odd if(inv_type == QUDA_GCR_INVERTER){ inv_param.inv_type = QUDA_GCR_INVERTER; inv_param.gcrNkrylov = 50; }else if(inv_type == QUDA_PCG_INVERTER){ inv_param.inv_type = QUDA_PCG_INVERTER; } inv_param.matpc_type = QUDA_MATPC_ODD_ODD; invertQuda(out->V(), in->V(), &inv_param); time0 += clock(); // stop the timer time0 /= CLOCKS_PER_SEC; #ifdef MULTI_GPU matdagmat_mg4dir(ref, qdp_fatlink, qdp_longlink, ghost_fatlink, ghost_longlink, out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, QUDA_ODD_PARITY); #else matdagmat(ref->V(), qdp_fatlink, qdp_longlink, out->V(), mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), QUDA_ODD_PARITY); #endif mxpy(in->V(), ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); nrm2 = norm_2(ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); src2 = norm_2(in->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); break; case 2: //full spinor errorQuda("full spinor not supported\n"); break; case 3: //multi mass CG, even case 4: #define NUM_OFFSETS 12 { double masses[NUM_OFFSETS] ={0.002, 0.0021, 0.0064, 0.070, 0.077, 0.081, 0.1, 0.11, 0.12, 0.13, 0.14, 0.205}; inv_param.num_offset = NUM_OFFSETS; // these can be set independently for (int i=0; i<inv_param.num_offset; i++) { inv_param.tol_offset[i] = inv_param.tol; inv_param.tol_hq_offset[i] = inv_param.tol_hq; } void* outArray[NUM_OFFSETS]; int len; cpuColorSpinorField* spinorOutArray[NUM_OFFSETS]; spinorOutArray[0] = out; for(int i=1;i < inv_param.num_offset; i++){ spinorOutArray[i] = new cpuColorSpinorField(csParam); } for(int i=0;i < inv_param.num_offset; i++){ outArray[i] = spinorOutArray[i]->V(); inv_param.offset[i] = 4*masses[i]*masses[i]; } len=Vh; if (test_type == 3) { inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; } else { inv_param.matpc_type = QUDA_MATPC_ODD_ODD; } invertMultiShiftQuda(outArray, in->V(), &inv_param); cudaDeviceSynchronize(); time0 += clock(); // stop the timer time0 /= CLOCKS_PER_SEC; printfQuda("done: total time = %g secs, compute time = %g, %i iter / %g secs = %g gflops\n", time0, inv_param.secs, inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs); printfQuda("checking the solution\n"); QudaParity parity = QUDA_INVALID_PARITY; if (inv_param.solve_type == QUDA_NORMOP_SOLVE){ //parity = QUDA_EVENODD_PARITY; errorQuda("full parity not supported\n"); }else if (inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN){ parity = QUDA_EVEN_PARITY; }else if (inv_param.matpc_type == QUDA_MATPC_ODD_ODD){ parity = QUDA_ODD_PARITY; }else{ errorQuda("ERROR: invalid spinor parity \n"); exit(1); } for(int i=0;i < inv_param.num_offset;i++){ printfQuda("%dth solution: mass=%f, ", i, masses[i]); #ifdef MULTI_GPU matdagmat_mg4dir(ref, qdp_fatlink, qdp_longlink, ghost_fatlink, ghost_longlink, spinorOutArray[i], masses[i], 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, parity); #else matdagmat(ref->V(), qdp_fatlink, qdp_longlink, outArray[i], masses[i], 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), parity); #endif mxpy(in->V(), ref->V(), len*mySpinorSiteSize, inv_param.cpu_prec); double nrm2 = norm_2(ref->V(), len*mySpinorSiteSize, inv_param.cpu_prec); double src2 = norm_2(in->V(), len*mySpinorSiteSize, inv_param.cpu_prec); double hqr = sqrt(HeavyQuarkResidualNormCpu(*spinorOutArray[i], *ref).z); double l2r = sqrt(nrm2/src2); printfQuda("Shift %d residuals: (L2 relative) tol %g, QUDA = %g, host = %g; (heavy-quark) tol %g, QUDA = %g, host = %g\n", i, inv_param.tol_offset[i], inv_param.true_res_offset[i], l2r, inv_param.tol_hq_offset[i], inv_param.true_res_hq_offset[i], hqr); //emperical, if the cpu residue is more than 1 order the target accuracy, the it fails to converge if (sqrt(nrm2/src2) > 10*inv_param.tol_offset[i]){ ret |=1; } } for(int i=1; i < inv_param.num_offset;i++) delete spinorOutArray[i]; } break; default: errorQuda("Unsupported test type"); }//switch if (test_type <=2){ double hqr = sqrt(HeavyQuarkResidualNormCpu(*out, *ref).z); double l2r = sqrt(nrm2/src2); printfQuda("Residuals: (L2 relative) tol %g, QUDA = %g, host = %g; (heavy-quark) tol %g, QUDA = %g, host = %g\n", inv_param.tol, inv_param.true_res, l2r, inv_param.tol_hq, inv_param.true_res_hq, hqr); printfQuda("done: total time = %g secs, compute time = %g secs, %i iter / %g secs = %g gflops, \n", time0, inv_param.secs, inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs); } end(); return ret; }
int invert_test(void) { QudaGaugeParam gaugeParam = newQudaGaugeParam(); QudaInvertParam inv_param = newQudaInvertParam(); double mass = 0.95; set_params(&gaugeParam, &inv_param, sdim, sdim, sdim, tdim, cpu_prec, prec, prec_sloppy, link_recon, link_recon_sloppy, mass, tol, 500, 1e-3, 0.8); // this must be before the FaceBuffer is created (this is because it allocates pinned memory - FIXME) initQuda(device); setDims(gaugeParam.X); setDimConstants(gaugeParam.X); size_t gSize = (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float); for (int dir = 0; dir < 4; dir++) { fatlink[dir] = malloc(V*gaugeSiteSize*gSize); longlink[dir] = malloc(V*gaugeSiteSize*gSize); } construct_fat_long_gauge_field(fatlink, longlink, 1, gaugeParam.cpu_prec, &gaugeParam); for (int dir = 0; dir < 4; dir++) { for(int i = 0;i < V*gaugeSiteSize;i++){ if (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION){ ((double*)fatlink[dir])[i] = 0.5 *rand()/RAND_MAX; }else{ ((float*)fatlink[dir])[i] = 0.5* rand()/RAND_MAX; } } } #ifdef MULTI_GPU //exchange_init_dims(gaugeParam.X); int ghost_link_len[4] = { Vs_x*gaugeSiteSize*gSize, Vs_y*gaugeSiteSize*gSize, Vs_z*gaugeSiteSize*gSize, Vs_t*gaugeSiteSize*gSize }; for(int i=0;i < 4;i++){ ghost_fatlink[i] = malloc(ghost_link_len[i]); ghost_longlink[i] = malloc(3*ghost_link_len[i]); if (ghost_fatlink[i] == NULL || ghost_longlink[i] == NULL){ printf("ERROR: malloc failed for ghost fatlink or ghost longlink\n"); exit(1); } } //exchange_cpu_links4dir(fatlink, ghost_fatlink, longlink, ghost_longlink, gaugeParam.cpu_prec); void *fat_send[4], *long_send[4]; for(int i=0;i < 4;i++){ fat_send[i] = malloc(ghost_link_len[i]); long_send[i] = malloc(3*ghost_link_len[i]); } set_dim(Z); pack_ghost(fatlink, fat_send, 1, gaugeParam.cpu_prec); pack_ghost(longlink, long_send, 3, gaugeParam.cpu_prec); int dummyFace = 1; FaceBuffer faceBuf (Z, 4, 18, dummyFace, gaugeParam.cpu_prec); faceBuf.exchangeCpuLink((void**)ghost_fatlink, (void**)fat_send, 1); faceBuf.exchangeCpuLink((void**)ghost_longlink, (void**)long_send, 3); for (int i=0; i<4; i++) { free(fat_send[i]); free(long_send[i]); } #endif ColorSpinorParam csParam; csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION; csParam.nColor=3; csParam.nSpin=1; csParam.nDim=4; for(int d = 0; d < 4; d++) { csParam.x[d] = gaugeParam.X[d]; } csParam.x[0] /= 2; csParam.precision = inv_param.cpu_prec; csParam.pad = 0; csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; csParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; csParam.create = QUDA_ZERO_FIELD_CREATE; in = new cpuColorSpinorField(csParam); out = new cpuColorSpinorField(csParam); ref = new cpuColorSpinorField(csParam); tmp = new cpuColorSpinorField(csParam); if (inv_param.cpu_prec == QUDA_SINGLE_PRECISION){ constructSpinorField((float*)in->v); }else{ constructSpinorField((double*)in->v); } #ifdef MULTI_GPU if(testtype == 6){ record_gauge(fatlink, ghost_fatlink[3], Vsh_t, longlink, ghost_longlink[3], 3*Vsh_t, link_recon, link_recon_sloppy, &gaugeParam); }else{ gaugeParam.type = QUDA_ASQTAD_FAT_LINKS; gaugeParam.ga_pad = MAX(sdim*sdim*sdim/2, sdim*sdim*tdim/2); gaugeParam.reconstruct= gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; loadGaugeQuda(fatlink, &gaugeParam); gaugeParam.type = QUDA_ASQTAD_LONG_LINKS; gaugeParam.ga_pad = 3*MAX(sdim*sdim*sdim/2, sdim*sdim*tdim/2); gaugeParam.reconstruct= link_recon; gaugeParam.reconstruct_sloppy = link_recon_sloppy; loadGaugeQuda(longlink, &gaugeParam); } #else gaugeParam.type = QUDA_ASQTAD_FAT_LINKS; gaugeParam.reconstruct = gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; loadGaugeQuda(fatlink, &gaugeParam); gaugeParam.type = QUDA_ASQTAD_LONG_LINKS; gaugeParam.reconstruct = link_recon; gaugeParam.reconstruct_sloppy = link_recon_sloppy; loadGaugeQuda(longlink, &gaugeParam); #endif double time0 = -((double)clock()); // Start the timer unsigned long volume = Vh; unsigned long nflops=2*1187; //from MILC's CG routine double nrm2=0; double src2=0; int ret = 0; switch(testtype){ case 0: //even volume = Vh; inv_param.solution_type = QUDA_MATPCDAG_MATPC_SOLUTION; inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; invertQuda(out->v, in->v, &inv_param); time0 += clock(); time0 /= CLOCKS_PER_SEC; #ifdef MULTI_GPU matdagmat_mg4dir(ref, fatlink, ghost_fatlink, longlink, ghost_longlink, out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, QUDA_EVEN_PARITY); #else matdagmat(ref->v, fatlink, longlink, out->v, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->v, QUDA_EVEN_PARITY); #endif mxpy(in->v, ref->v, Vh*mySpinorSiteSize, inv_param.cpu_prec); nrm2 = norm_2(ref->v, Vh*mySpinorSiteSize, inv_param.cpu_prec); src2 = norm_2(in->v, Vh*mySpinorSiteSize, inv_param.cpu_prec); break; case 1: //odd volume = Vh; inv_param.solution_type = QUDA_MATPCDAG_MATPC_SOLUTION; inv_param.matpc_type = QUDA_MATPC_ODD_ODD; invertQuda(out->v, in->v, &inv_param); time0 += clock(); // stop the timer time0 /= CLOCKS_PER_SEC; #ifdef MULTI_GPU matdagmat_mg4dir(ref, fatlink, ghost_fatlink, longlink, ghost_longlink, out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, QUDA_ODD_PARITY); #else matdagmat(ref->v, fatlink, longlink, out->v, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->v, QUDA_ODD_PARITY); #endif mxpy(in->v, ref->v, Vh*mySpinorSiteSize, inv_param.cpu_prec); nrm2 = norm_2(ref->v, Vh*mySpinorSiteSize, inv_param.cpu_prec); src2 = norm_2(in->v, Vh*mySpinorSiteSize, inv_param.cpu_prec); break; case 2: //full spinor errorQuda("full spinor not supported\n"); break; case 3: //multi mass CG, even case 4: case 5: case 6: #define NUM_OFFSETS 4 nflops = 2*(1205 + 15* NUM_OFFSETS); //from MILC's multimass CG routine double masses[NUM_OFFSETS] ={5.05, 1.23, 2.64, 2.33}; double offsets[NUM_OFFSETS]; int num_offsets =NUM_OFFSETS; void* outArray[NUM_OFFSETS]; int len; cpuColorSpinorField* spinorOutArray[NUM_OFFSETS]; spinorOutArray[0] = out; for(int i=1;i < num_offsets; i++){ spinorOutArray[i] = new cpuColorSpinorField(csParam); } for(int i=0;i < num_offsets; i++){ outArray[i] = spinorOutArray[i]->v; } for (int i=0; i< num_offsets;i++){ offsets[i] = 4*masses[i]*masses[i]; } len=Vh; volume = Vh; inv_param.solution_type = QUDA_MATPCDAG_MATPC_SOLUTION; if (testtype == 3){ inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; } else if (testtype == 4||testtype == 6){ inv_param.matpc_type = QUDA_MATPC_ODD_ODD; }else { //testtype ==5 errorQuda("test 5 not supported\n"); } double residue_sq; if (testtype == 6){ //invertMultiShiftQudaMixed(spinorOutArray, in->v, &inv_param, offsets, num_offsets, &residue_sq); }else{ invertMultiShiftQuda(outArray, in->v, &inv_param, offsets, num_offsets, &residue_sq); } cudaThreadSynchronize(); printfQuda("Final residue squred =%g\n", residue_sq); time0 += clock(); // stop the timer time0 /= CLOCKS_PER_SEC; printfQuda("done: total time = %g secs, %i iter / %g secs = %g gflops, \n", time0, inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs); printfQuda("checking the solution\n"); QudaParity parity; if (inv_param.solve_type == QUDA_NORMEQ_SOLVE){ //parity = QUDA_EVENODD_PARITY; errorQuda("full parity not supported\n"); }else if (inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN){ parity = QUDA_EVEN_PARITY; }else if (inv_param.matpc_type == QUDA_MATPC_ODD_ODD){ parity = QUDA_ODD_PARITY; }else{ errorQuda("ERROR: invalid spinor parity \n"); exit(1); } for(int i=0;i < num_offsets;i++){ printfQuda("%dth solution: mass=%f, ", i, masses[i]); #ifdef MULTI_GPU matdagmat_mg4dir(ref, fatlink, ghost_fatlink, longlink, ghost_longlink, spinorOutArray[i], masses[i], 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, parity); #else matdagmat(ref->v, fatlink, longlink, outArray[i], masses[i], 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->v, parity); #endif mxpy(in->v, ref->v, len*mySpinorSiteSize, inv_param.cpu_prec); double nrm2 = norm_2(ref->v, len*mySpinorSiteSize, inv_param.cpu_prec); double src2 = norm_2(in->v, len*mySpinorSiteSize, inv_param.cpu_prec); printfQuda("relative residual, requested = %g, actual = %g\n", inv_param.tol, sqrt(nrm2/src2)); //emperical, if the cpu residue is more than 2 order the target accuracy, the it fails to converge if (sqrt(nrm2/src2) > 100*inv_param.tol){ ret |=1; errorQuda("Converge failed!\n"); } } for(int i=1; i < num_offsets;i++){ delete spinorOutArray[i]; } }//switch if (testtype <=2){ printfQuda("Relative residual, requested = %g, actual = %g\n", inv_param.tol, sqrt(nrm2/src2)); printfQuda("done: total time = %g secs, %i iter / %g secs = %g gflops, \n", time0, inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs); //emperical, if the cpu residue is more than 2 order the target accuracy, the it fails to converge if (sqrt(nrm2/src2) > 100*inv_param.tol){ ret = 1; errorQuda("Convergence failed!\n"); } } end(); return ret; }