void DeflatedSolver::PrintSummary(const char *name, int k, const double &r2, const double &b2) {
  if (getVerbosity() >= QUDA_SUMMARIZE) {
    if (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) {
      printfQuda("%s: Convergence at %d iterations, L2 relative residual: iterated = %e, true = %e, heavy-quark residual = %e\n",
                 name, k, sqrt(r2/b2), param.true_res, param.true_res_hq);
    } else {
      printfQuda("%s: Convergence at %d iterations, L2 relative residual: iterated = %e, true = %e\n",
                 name, k, sqrt(r2/b2), param.true_res);
    }
  }
}
void display_test_info() {
  printfQuda("running the following test:\n");
  printfQuda("prec sloppy_prec link_recon sloppy_link_recon test_type S_dimension T_dimension\n");
  printfQuda("%s %s %s %s %s %d %d \n",
             get_prec_str(prec), get_prec_str(prec_sloppy),
             get_recon_str(link_recon), get_recon_str(link_recon_sloppy),
             get_test_type(testtype), sdim, tdim);
  return;
}
void DeflatedSolver::PrintStats(const char* name, int k, const double &r2, const double &b2, const double &hq2) {
  if (getVerbosity() >= QUDA_VERBOSE) {
    if (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) {
      printfQuda("%s: %d iterations, <r,r> = %e, |r|/|b| = %e, heavy-quark residual = %e\n",
                 name, k, r2, sqrt(r2/b2), hq2);
    } else {
      printfQuda("%s: %d iterations, <r,r> = %e, |r|/|b| = %e\n",
                 name, k, r2, sqrt(r2/b2));
    }
  }
  if (std::isnan(r2)) errorQuda("Solver appears to have diverged");
}
int main(int argc, char **argv) {
  init();

  float spinorGiB = (float)Vh*Ls*spinorSiteSize*sizeof(inv_param.cpu_prec) / (1 << 30);
  printf("\nSpinor mem: %.3f GiB\n", spinorGiB);
  printf("Gauge mem: %.3f GiB\n", gauge_param.gaugeGiB);

  int attempts = 1;
  dslashRef();
  for (int i=0; i<attempts; i++) {

    if (tune) { // warm-up run
      printfQuda("Tuning...\n");
      setDslashTuning(QUDA_TUNE_YES, QUDA_VERBOSE);
      dslashCUDA(1);
    }

    double secs = dslashCUDA();

    if (!transfer) *spinorOut = *cudaSpinorOut;

    // print timing information
    printf("%fms per loop\n", 1000*secs);

    unsigned long long flops = 0;
    if (!transfer) flops = dirac->Flops();
    int spinor_floats = test_type ? 2*(9*24+24)+24 : 9*24+24;
    if (inv_param.cuda_prec == QUDA_HALF_PRECISION)
      spinor_floats += test_type ? 2*(9*2 + 2) + 2 : 9*2 + 2; // relative size of norm is twice a short
    int gauge_floats = (test_type ? 2 : 1) * (gauge_param.gauge_fix ? 6 : 8) * gauge_param.reconstruct;

    printfQuda("GFLOPS = %f\n", 1.0e-9*flops/secs);
    printfQuda("GB/s = %f\n\n", (float)Vh*Ls*(spinor_floats+gauge_floats)*inv_param.cuda_prec/((secs/loops)*1e+9));

    if (!transfer) {
      std::cout << "Results: CPU = " << norm2(*spinorRef) << ", CUDA = " << norm2(*cudaSpinorOut)
                << ", CPU-CUDA = " << norm2(*spinorOut) << std::endl;
    } else {
      std::cout << "Result: CPU = " << norm2(*spinorRef) << ", CPU-CUDA = " << norm2(*spinorOut) << std::endl;
    }
    cpuColorSpinorField::Compare(*spinorRef, *spinorOut);
  }
  end();
}
// Resets the attributes of this field if param disagrees (and is defined)
void ColorSpinorField::reset(const ColorSpinorParam &param) {

  if (param.nColor != 0) nColor = param.nColor;
  if (param.nSpin != 0) nSpin = param.nSpin;
  if (param.twistFlavor != QUDA_TWIST_INVALID) twistFlavor = param.twistFlavor;

  if (param.precision != QUDA_INVALID_PRECISION) precision = param.precision;
  if (param.nDim != 0) nDim = param.nDim;

  volume = 1;
  for (int d=0; d<nDim; d++) {
    if (param.x[0] != 0) x[d] = param.x[d];
    volume *= x[d];
  }

  if (param.pad != 0) pad = param.pad;

  if (param.siteSubset == QUDA_FULL_SITE_SUBSET) {
    stride = volume/2 + pad;
    length = 2*stride*nColor*nSpin*2;
  } else if (param.siteSubset == QUDA_PARITY_SITE_SUBSET) {
    stride = volume + pad;
    length = stride*nColor*nSpin*2;
  } else {
    //errorQuda("SiteSubset not defined %d", param.siteSubset);
    //do nothing, not an error (can't remember why - need to document this sometime! )
  }

  if (param.siteSubset != QUDA_INVALID_SITE_SUBSET) siteSubset = param.siteSubset;
  if (param.siteOrder != QUDA_INVALID_SITE_ORDER) siteOrder = param.siteOrder;
  if (param.fieldOrder != QUDA_INVALID_FIELD_ORDER) fieldOrder = param.fieldOrder;
  if (param.gammaBasis != QUDA_INVALID_GAMMA_BASIS) gammaBasis = param.gammaBasis;

  createGhostZone();

  real_length = volume*nColor*nSpin*2;

  bytes = total_length * precision;
  bytes = ALIGNMENT_ADJUST(bytes);
  norm_bytes = total_norm_length * sizeof(float);
  norm_bytes = ALIGNMENT_ADJUST(norm_bytes);

  if (!init) errorQuda("Shouldn't be resetting a non-inited field\n");

  if (verbose >= QUDA_DEBUG_VERBOSE) {
    printfQuda("\nPrinting out reset field\n");
    std::cout << *this << std::endl;
    printfQuda("\n");
  }
}
int main(int argc, char **argv) {
  int i;
  for (i=1; i<argc; i++){
    if (process_command_line_option(argc, argv, &i) == 0){
      continue;
    }
    fprintf(stderr, "ERROR: Invalid option:%s\n", argv[i]);
    usage(argv);
  }

  initCommsQuda(argc, argv, gridsize_from_cmdline, 4);

  display_test_info();

  int ret = 1;
  int accuracy_level = dslashTest();
  printfQuda("accuracy_level =%d\n", accuracy_level);
  if (accuracy_level >= 1) ret = 0; //probably no error, -1 means no matching

  endCommsQuda();
  return ret;
}
void staggeredDslashRef() {
#ifndef MULTI_GPU
  int cpu_parity = 0;
#endif

  // compare to dslash reference implementation
  printfQuda("Calculating reference implementation...");
  fflush(stdout);
  switch (test_type) {
  case 0:
#ifdef MULTI_GPU
    staggered_dslash_mg4dir(spinorRef, fatlink, longlink, (void**)ghost_fatlink, (void**)ghost_longlink,
                            spinor, parity, dagger, inv_param.cpu_prec, gaugeParam.cpu_prec);
#else
    cpu_parity = 0; //EVEN
    staggered_dslash(spinorRef->V(), fatlink, longlink, spinor->V(), cpu_parity, dagger,
                     inv_param.cpu_prec, gaugeParam.cpu_prec);
#endif
    break;
  case 1:
#ifdef MULTI_GPU
    staggered_dslash_mg4dir(spinorRef, fatlink, longlink, (void**)ghost_fatlink, (void**)ghost_longlink,
                            spinor, parity, dagger, inv_param.cpu_prec, gaugeParam.cpu_prec);
#else
    cpu_parity = 1; //ODD
    staggered_dslash(spinorRef->V(), fatlink, longlink, spinor->V(), cpu_parity, dagger,
                     inv_param.cpu_prec, gaugeParam.cpu_prec);
#endif
    break;
  case 2:
    //mat(spinorRef->V(), fatlink, longlink, spinor->V(), kappa, dagger,
    //    inv_param.cpu_prec, gaugeParam.cpu_prec);
    break;
  default:
    errorQuda("Test type not defined");
  }

  printfQuda("done.\n");
}
static int dslashTest() {
  int accuracy_level = 0;

  init();

  int attempts = 1;
  for (int i=0; i<attempts; i++) {

    if (tune) { // warm-up run
      printfQuda("Tuning...\n");
      setDslashTuning(QUDA_TUNE_YES, QUDA_VERBOSE);
      dslashCUDA(1);
    }

    printfQuda("Executing %d kernel loops...", loops);
    double secs = dslashCUDA(loops);

#ifdef DSLASH_PROFILING
    printDslashProfile();
#endif

    if (!transfer) *spinorOut = *cudaSpinorOut;
    printfQuda("\n%fms per loop\n", 1000*secs);
    staggeredDslashRef();

    unsigned long long flops = dirac->Flops();
    int link_floats = 8*gaugeParam.reconstruct + 8*18;
    int spinor_floats = 8*6*2 + 6;
    int link_float_size = prec;
    int spinor_float_size = 0;

    link_floats = test_type ? (2*link_floats) : link_floats;
    spinor_floats = test_type ? (2*spinor_floats) : spinor_floats;

    int bytes_for_one_site = link_floats * link_float_size + spinor_floats * spinor_float_size;
    if (prec == QUDA_HALF_PRECISION) bytes_for_one_site += (8*2 + 1)*4;

    printfQuda("GFLOPS = %f\n", 1.0e-9*flops/secs);
    printfQuda("GB/s = %f\n\n", 1.0*Vh*bytes_for_one_site/((secs/loops)*1e+9));

    if (!transfer) {
      double spinor_ref_norm2 = norm2(*spinorRef);
      double cuda_spinor_out_norm2 = norm2(*cudaSpinorOut);
      double spinor_out_norm2 = norm2(*spinorOut);
      printfQuda("Results: CPU=%f, CUDA=%f, CPU-CUDA=%f\n",
                 spinor_ref_norm2, cuda_spinor_out_norm2, spinor_out_norm2);
    } else {
      double spinor_ref_norm2 = norm2(*spinorRef);
      double spinor_out_norm2 = norm2(*spinorOut);
      printfQuda("Result: CPU=%f , CPU-CUDA=%f", spinor_ref_norm2, spinor_out_norm2);
    }
    accuracy_level = cpuColorSpinorField::Compare(*spinorRef, *spinorOut);
  }
  end();

  return accuracy_level;
}
void display_test_info() {
  printfQuda("running the following test:\n");
  printfQuda("prec recon test_type dagger S_dim T_dimension\n");
  printfQuda("%s %s %d %d %d/%d/%d %d \n",
             get_prec_str(prec), get_recon_str(link_recon),
             test_type, dagger, xdim, ydim, zdim, tdim);
  printfQuda("Grid partition info: X Y Z T\n");
  printfQuda(" %d %d %d %d\n",
             commDimPartitioned(0), commDimPartitioned(1), commDimPartitioned(2), commDimPartitioned(3));
  return;
}
/*
 * Read tunecache from disk.
 */
void loadTuneCache(QudaVerbosity verbosity) {
  char *path;
  struct stat pstat;
  std::string cache_path, line, token;
  std::ifstream cache_file;
  std::stringstream ls;

  path = getenv("QUDA_RESOURCE_PATH");
  if (!path) {
    warningQuda("Environment variable QUDA_RESOURCE_PATH is not set.");
    warningQuda("Caching of tuned parameters will be disabled.");
    return;
  } else if (stat(path, &pstat) || !S_ISDIR(pstat.st_mode)) {
    warningQuda("The path \"%s\" specified by QUDA_RESOURCE_PATH does not exist or is not a directory.", path);
    warningQuda("Caching of tuned parameters will be disabled.");
    return;
  } else {
    resource_path = path;
  }

#ifdef MULTI_GPU
  if (comm_rank() == 0) {
#endif

    cache_path = resource_path;
    cache_path += "/tunecache.tsv";
    cache_file.open(cache_path.c_str());

    if (cache_file) {

      if (!cache_file.good()) errorQuda("Bad format in %s", cache_path.c_str());
      getline(cache_file, line);
      ls.str(line);
      ls >> token;
      if (token.compare("tunecache")) errorQuda("Bad format in %s", cache_path.c_str());
      ls >> token;
      if (token.compare(quda_version)) errorQuda("Cache file %s does not match current QUDA version", cache_path.c_str());
      ls >> token;
      if (token.compare(quda_hash)) warningQuda("Cache file %s does not match current QUDA build", cache_path.c_str());

      if (!cache_file.good()) errorQuda("Bad format in %s", cache_path.c_str());
      getline(cache_file, line); // eat the blank line

      if (!cache_file.good()) errorQuda("Bad format in %s", cache_path.c_str());
      getline(cache_file, line); // eat the description line

      deserializeTuneCache(cache_file);
      cache_file.close();
      initial_cache_size = tunecache.size();

      if (verbosity >= QUDA_SUMMARIZE) {
        printfQuda("Loaded %d sets of cached parameters from %s\n",
                   static_cast<int>(initial_cache_size), cache_path.c_str());
      }

    } else {
void usage_extra(char** argv) {
  printfQuda("Extra options:\n");
  printfQuda(" --tol <resid_tol>                # Set residual tolerance\n");
  printfQuda(" --test <0/1/3/4/6>               # Test method\n");
  printfQuda("                                    0: Even even spinor CG inverter\n");
  printfQuda("                                    1: Odd odd spinor CG inverter\n");
  printfQuda("                                    3: Even even spinor multishift CG inverter\n");
  printfQuda("                                    4: Odd odd spinor multishift CG inverter\n");
  printfQuda("                                    6: Even even spinor mixed precision multishift CG inverter\n");
  printfQuda(" --cpu_prec <double/single/half>  # Set CPU precision\n");
  return;
}
template <typename U, typename V>
int compareSpinor(const U &u, const V &v, const int tol) {
  int fail_check = 16*tol;
  int *fail = new int[fail_check];
  for (int f=0; f<fail_check; f++) fail[f] = 0;

  int N = u.Nspin()*u.Ncolor();
  int *iter = new int[N];
  for (int i=0; i<N; i++) iter[i] = 0;

  for (int x=0; x<u.Volume(); x++) {
    for (int s=0; s<u.Nspin(); s++) {
      for (int c=0; c<u.Ncolor(); c++) {
        for (int z=0; z<2; z++) {
          double diff = fabs(u(x,s,c,z) - v(x,s,c,z));

          for (int f=0; f<fail_check; f++)
            if (diff > pow(10.0,-(f+1)/(double)tol)) fail[f]++;

          int j = s*u.Ncolor() + c; // index over the N = Nspin*Ncolor spin-color components
          if (diff > 1e-3) iter[j]++;
        }
      }
    }
  }

  for (int i=0; i<N; i++) printfQuda("%d fails = %d\n", i, iter[i]);

  int accuracy_level = 0;
  for (int f=0; f<fail_check; f++) {
    if (fail[f] == 0) accuracy_level = f+1;
  }

  for (int f=0; f<fail_check; f++) {
    printfQuda("%e Failures: %d / %d = %e\n", pow(10.0,-(f+1)/(double)tol),
               fail[f], u.Volume()*N, fail[f] / (double)(u.Volume()*N));
  }

  delete []iter;
  delete []fail;

  return accuracy_level;
}
// execute kernel
double dslashCUDA() {

  printfQuda("Executing %d kernel loops...\n", loops);
  fflush(stdout);

  if (test_type < 2)
    dirac->Tune(*cudaSpinorOut, *cudaSpinor, *tmp);
  else
    dirac->Tune(cudaSpinorOut->Even(), cudaSpinor->Even(), *tmp);

  cudaEvent_t start, end;
  cudaEventCreate(&start);
  cudaEventRecord(start, 0);
  cudaEventSynchronize(start);

  for (int i = 0; i < loops; i++) {
    switch (test_type) {
    case 0:
      if (transfer) {
        dslashQuda(spinorOut->V(), spinor->V(), &inv_param, parity);
      } else {
        dirac->Dslash(*cudaSpinorOut, *cudaSpinor, parity);
      }
      break;
    case 1:
    case 2:
      if (transfer) {
        MatQuda(spinorOut->V(), spinor->V(), &inv_param);
      } else {
        dirac->M(*cudaSpinorOut, *cudaSpinor);
      }
      break;
    }
  }

  cudaEventCreate(&end);
  cudaEventRecord(end, 0);
  cudaEventSynchronize(end);
  float runTime;
  cudaEventElapsedTime(&runTime, start, end);
  cudaEventDestroy(start);
  cudaEventDestroy(end);

  double secs = runTime / 1000; //stopwatchReadSeconds();

  // check for errors
  cudaError_t stat = cudaGetLastError();
  if (stat != cudaSuccess)
    printf("with ERROR: %s\n", cudaGetErrorString(stat));

  printf("done.\n\n");

  return secs;
}
// execute kernel
double dslashCUDA(int niter) {

  cudaEvent_t start, end;
  cudaEventCreate(&start);
  cudaEventCreate(&end);
  cudaEventRecord(start, 0);

  for (int i = 0; i < niter; i++) {
    switch (test_type) {
    case 0:
      if (transfer) {
        dslashQuda(spinorOut->V(), spinor->V(), &inv_param, parity);
      } else {
        //inv_param.input_location = QUDA_CUDA_FIELD_LOCATION;
        //inv_param.output_location = QUDA_CUDA_FIELD_LOCATION;
        //dslashQuda(cudaSpinorOut->V(), cudaSpinor->V(), &inv_param, parity);
        dirac->Dslash(*cudaSpinorOut, *cudaSpinor, parity);
      }
      break;
    case 1:
    case 2:
      if (transfer) {
        MatQuda(spinorOut->V(), spinor->V(), &inv_param);
      } else {
        dirac->M(*cudaSpinorOut, *cudaSpinor);
      }
      break;
    case 3:
    case 4:
      if (transfer) {
        MatDagMatQuda(spinorOut->V(), spinor->V(), &inv_param);
      } else {
        dirac->MdagM(*cudaSpinorOut, *cudaSpinor);
      }
      break;
    }
  }

  cudaEventRecord(end, 0);
  cudaEventSynchronize(end);
  float runTime;
  cudaEventElapsedTime(&runTime, start, end);
  cudaEventDestroy(start);
  cudaEventDestroy(end);

  double secs = runTime / 1000; //stopwatchReadSeconds();

  // check for errors
  cudaError_t stat = cudaGetLastError();
  if (stat != cudaSuccess)
    printfQuda("with ERROR: %s\n", cudaGetErrorString(stat));

  return secs;
}
void display_test_info() {
  printfQuda("running the following test:\n");
  printfQuda("prec sloppy_prec link_recon sloppy_link_recon test_type S_dimension T_dimension\n");
  printfQuda("%s %s %s %s %s %d/%d/%d %d \n",
             get_prec_str(prec), get_prec_str(prec_sloppy),
             get_recon_str(link_recon), get_recon_str(link_recon_sloppy),
             get_test_type(test_type), xdim, ydim, zdim, tdim);
  printfQuda("Grid partition info: X Y Z T\n");
  printfQuda(" %d %d %d %d\n",
             dimPartitioned(0), dimPartitioned(1), dimPartitioned(2), dimPartitioned(3));
  return;
}
template <typename FloatA, typename FloatB>
static int compareSpinor(const FloatA *u, const FloatB *v, const int volume, const int N, const int resolution) {
  int fail_check = 16*resolution;
  int *fail = new int[fail_check];
  for (int f=0; f<fail_check; f++) fail[f] = 0;

  int *iter = new int[N];
  for (int i=0; i<N; i++) iter[i] = 0;

  for (int i=0; i<volume; i++) {
    for (int j=0; j<N; j++) {
      int is = i*N+j;
      double diff = fabs(u[is]-v[is]);
      for (int f=0; f<fail_check; f++)
        if (diff > pow(10.0,-(f+1)/(double)resolution)) fail[f]++;
      if (diff > 1e-3) iter[j]++;
    }
  }

  for (int i=0; i<N; i++) printfQuda("%d fails = %d\n", i, iter[i]);

  int accuracy_level = 0;
  for (int f=0; f<fail_check; f++) {
    if (fail[f] == 0) {
      accuracy_level = f;
    }
  }

  for (int f=0; f<fail_check; f++) {
    printfQuda("%e Failures: %d / %d = %e\n", pow(10.0,-(f+1)/(double)resolution),
               fail[f], volume*N, fail[f] / (double)(volume*N));
  }

  delete []iter;
  delete []fail;

  return accuracy_level;
}
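// Hedged usage sketch (not part of the original source): made-up array sizes
// and data fed through compareSpinor above, to show what the returned accuracy
// level means.  Each index f tests the tolerance 10^{-(f+1)/resolution}; the
// returned level is the largest index whose tolerance saw zero failing
// components, so the fields agree to roughly 10^{-(level+1)/resolution}.
static void compareSpinorSketch() {
  const int volume = 16, N = 6, resolution = 2; // illustrative sizes only
  float ref[volume*N], chk[volume*N];
  for (int i = 0; i < volume*N; i++) { ref[i] = 1.0f; chk[i] = 1.0f + 1e-5f; }
  int level = compareSpinor(ref, chk, volume, N, resolution);
  printfQuda("accuracy level %d ~ agreement to %e\n",
             level, pow(10.0, -(level+1)/(double)resolution));
}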
void usage(char** argv) {
  printfQuda("Usage: %s <args>\n", argv[0]);
  printfQuda("--prec <double/single/half>          Spinor/gauge precision\n");
  printfQuda("--prec_sloppy <double/single/half>   Spinor/gauge sloppy precision\n");
  printfQuda("--recon <8/12>                       Long link reconstruction type\n");
  printfQuda("--test <0/1/2/3/4/5>                 Testing type(0=even, 1=odd, 2=full, 3=multimass even,\n"
             "                                     4=multimass odd, 5=multimass full)\n");
  printfQuda("--tdim                               T dimension\n");
  printfQuda("--sdim                               S dimension\n");
  printfQuda("--help                               Print out this message\n");
  exit(1);
  return;
}
// Dirac operator factory
Dirac* Dirac::create(const DiracParam &param) {
  if (param.type == QUDA_WILSON_DIRAC) {
    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating a DiracWilson operator\n");
    return new DiracWilson(param);
  } else if (param.type == QUDA_WILSONPC_DIRAC) {
    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating a DiracWilsonPC operator\n");
    return new DiracWilsonPC(param);
  } else if (param.type == QUDA_CLOVER_DIRAC) {
    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating a DiracClover operator\n");
    return new DiracClover(param);
  } else if (param.type == QUDA_CLOVERPC_DIRAC) {
    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating a DiracCloverPC operator\n");
    return new DiracCloverPC(param);
  } else if (param.type == QUDA_DOMAIN_WALL_DIRAC) {
    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating a DiracDomainWall operator\n");
    return new DiracDomainWall(param);
  } else if (param.type == QUDA_DOMAIN_WALLPC_DIRAC) {
    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating a DiracDomainWallPC operator\n");
    return new DiracDomainWallPC(param);
  } else if (param.type == QUDA_ASQTAD_DIRAC) {
    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating a DiracStaggered operator\n");
    return new DiracStaggered(param);
  } else if (param.type == QUDA_ASQTADPC_DIRAC) {
    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating a DiracStaggeredPC operator\n");
    return new DiracStaggeredPC(param);
  } else if (param.type == QUDA_TWISTED_MASS_DIRAC) {
    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating a DiracTwistedMass operator (%d flavor(s))\n", param.Ls);
    if (param.Ls == 1) return new DiracTwistedMass(param, 4);
    else return new DiracTwistedMass(param, 5);
  } else if (param.type == QUDA_TWISTED_MASSPC_DIRAC) {
    if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating a DiracTwistedMassPC operator (%d flavor(s))\n", param.Ls);
    if (param.Ls == 1) return new DiracTwistedMassPC(param, 4);
    else return new DiracTwistedMassPC(param, 5);
  } else {
    return 0;
  }
}
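// Hedged usage sketch (not from the original source): how the factory is
// typically driven from an inverter parameter struct, as the test init()
// routines elsewhere in this collection do via setDiracParam.  The pc flag and
// the error handling shown here are illustrative.
static Dirac* createDiracSketch(QudaInvertParam &inv_param, bool pc) {
  DiracParam diracParam;
  setDiracParam(diracParam, &inv_param, pc); // fills diracParam.type, kappa, tmp fields, ...
  Dirac *dirac = Dirac::create(diracParam);  // returns 0 for an unrecognised type
  if (!dirac) errorQuda("Unsupported Dirac operator type %d", diracParam.type);
  return dirac;
}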
int main(int argc, char** argv) {
  for (int i = 1; i < argc; i++) {

    if (process_command_line_option(argc, argv, &i) == 0){
      continue;
    }

    if (strcmp(argv[i], "--cpu_prec") == 0){
      if (i+1 >= argc){
        usage(argv);
      }
      cpu_prec = get_prec(argv[i+1]);
      i++;
      continue;
    }

    printf("ERROR: Invalid option:%s\n", argv[i]);
    usage(argv);
  }

  if (prec_sloppy == QUDA_INVALID_PRECISION){
    prec_sloppy = prec;
  }
  if (link_recon_sloppy == QUDA_RECONSTRUCT_INVALID){
    link_recon_sloppy = link_recon;
  }

  if (inv_type != QUDA_CG_INVERTER){
    if (test_type != 0 && test_type != 1)
      errorQuda("Preconditioning is currently not supported in multi-shift solvers");
  }

  // initialize QMP/MPI, QUDA comms grid and RNG (test_util.cpp)
  initComms(argc, argv, gridsize_from_cmdline);

  display_test_info();

  printfQuda("dslash_type = %d\n", dslash_type);

  int ret = invert_test();

  // finalize the communications layer
  finalizeComms();

  return ret;
}
static void display_test_info(int test) {
  printfQuda("running the following test:\n");
  printfQuda("link_precision link_reconstruct space_dimension T_dimension Test Ordering\n");
  printfQuda("%s %s %d/%d/%d/ %d %d %s \n",
             get_prec_str(prec), get_recon_str(link_recon),
             xdim, ydim, zdim, tdim, test, get_gauge_order_str(gauge_order));

#ifdef MULTI_GPU
  printfQuda("Grid partition info: X Y Z T\n");
  printfQuda(" %d %d %d %d\n",
             dimPartitioned(0), dimPartitioned(1), dimPartitioned(2), dimPartitioned(3));
#endif

  return;
}
void display_test_info() {
  printfQuda("running the following test:\n");
  printfQuda("prec prec_sloppy multishift matpc_type recon recon_sloppy S_dimension T_dimension Ls_dimension dslash_type normalization\n");
  printfQuda("%6s %6s %d %12s %2s %2s %3d/%3d/%3d %3d %2d %14s %8s\n",
             get_prec_str(prec), get_prec_str(prec_sloppy),
             multishift, get_matpc_str(matpc_type),
             get_recon_str(link_recon), get_recon_str(link_recon_sloppy),
             xdim, ydim, zdim, tdim, Lsdim,
             get_dslash_str(dslash_type), get_mass_normalization_str(normalization));
  printfQuda("Grid partition info: X Y Z T\n");
  printfQuda(" %d %d %d %d\n",
             dimPartitioned(0), dimPartitioned(1), dimPartitioned(2), dimPartitioned(3));
  return;
}
static void massRescaleCoeff(QudaDslashType dslash_type, double &kappa, QudaSolutionType solution_type,
                             QudaMassNormalization mass_normalization, double &coeff) {
  if (dslash_type == QUDA_ASQTAD_DSLASH) {
    if (mass_normalization != QUDA_MASS_NORMALIZATION) {
      errorQuda("Staggered code only supports QUDA_MASS_NORMALIZATION");
    }
    return;
  }

  // multiply the source to compensate for normalization of the Dirac operator, if necessary
  switch (solution_type) {
  case QUDA_MAT_SOLUTION:
    if (mass_normalization == QUDA_MASS_NORMALIZATION ||
        mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
      coeff *= 2.0*kappa;
    }
    break;
  case QUDA_MATDAG_MAT_SOLUTION:
    if (mass_normalization == QUDA_MASS_NORMALIZATION ||
        mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
      coeff *= 4.0*kappa*kappa;
    }
    break;
  case QUDA_MATPC_SOLUTION:
    if (mass_normalization == QUDA_MASS_NORMALIZATION) {
      coeff *= 4.0*kappa*kappa;
    } else if (mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
      coeff *= 2.0*kappa;
    }
    break;
  case QUDA_MATPCDAG_MATPC_SOLUTION:
    if (mass_normalization == QUDA_MASS_NORMALIZATION) {
      coeff *= 16.0*pow(kappa,4);
    } else if (mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
      coeff *= 4.0*kappa*kappa;
    }
    break;
  default:
    errorQuda("Solution type %d not supported", solution_type);
  }

  if (verbosity >= QUDA_DEBUG_VERBOSE) printfQuda("Mass rescale done\n");
}
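// Hedged sketch (illustrative values, not in the original source): a concrete
// check of the rescaling above.  For a Wilson-type operator with
// QUDA_MASS_NORMALIZATION and a MatPC^dag MatPC solution, the coefficient is
// multiplied by 16*kappa^4, e.g. 16*0.12^4 ~= 3.3178e-3.
static void massRescaleCoeffSketch() {
  double kappa = 0.12;
  double coeff = 1.0;
  massRescaleCoeff(QUDA_WILSON_DSLASH, kappa,
                   QUDA_MATPCDAG_MATPC_SOLUTION, QUDA_MASS_NORMALIZATION, coeff);
  printfQuda("rescaled coeff = %e (expect 16*kappa^4 = %e)\n", coeff, 16.0*pow(kappa,4));
}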
int setNumaAffinity(int devid) {
  int cpu_cores[128];
  int ncores = 128;
  int rc = getNumaAffinity(devid, cpu_cores, &ncores);
  if (rc != 0){
    warningQuda("Failed to determine NUMA affinity for device %d (possibly not applicable)", devid);
    return 1;
  }

  int which = devid % ncores;
  printfQuda("Setting NUMA affinity for device %d to CPU core %d\n", devid, cpu_cores[which]);
  /*
  for(int i=0;i < ncores;i++){
    if (i != which ) continue;
    printfQuda("%d", cpu_cores[i]);
    if((i+1) < ncores){
      printfQuda(",");
    }
  }
  printfQuda("\n");
  */

  cpu_set_t cpu_set;
  CPU_ZERO(&cpu_set);
  for(int i=0;i < ncores;i++){
    if (i != which) continue;
    CPU_SET(cpu_cores[i], &cpu_set);
  }

  rc = sched_setaffinity(0, sizeof(cpu_set_t), &cpu_set);
  if (rc != 0){
    warningQuda("Failed to enforce NUMA affinity (probably due to lack of kernel support)");
    return -1;
  }

  return 0;
}
void CG::operator()(cudaColorSpinorField &x, cudaColorSpinorField &b) { int k=0; int rUpdate = 0; cudaColorSpinorField r(b); ColorSpinorParam param(x); param.create = QUDA_ZERO_FIELD_CREATE; cudaColorSpinorField y(b, param); mat(r, x, y); zeroCuda(y); double r2 = xmyNormCuda(b, r); rUpdate ++; param.precision = invParam.cuda_prec_sloppy; cudaColorSpinorField Ap(x, param); cudaColorSpinorField tmp(x, param); cudaColorSpinorField tmp2(x, param); // only needed for clover and twisted mass cudaColorSpinorField *x_sloppy, *r_sloppy; if (invParam.cuda_prec_sloppy == x.Precision()) { param.create = QUDA_REFERENCE_FIELD_CREATE; x_sloppy = &x; r_sloppy = &r; } else { param.create = QUDA_COPY_FIELD_CREATE; x_sloppy = new cudaColorSpinorField(x, param); r_sloppy = new cudaColorSpinorField(r, param); } cudaColorSpinorField &xSloppy = *x_sloppy; cudaColorSpinorField &rSloppy = *r_sloppy; cudaColorSpinorField p(rSloppy); double r2_old; double src_norm = norm2(b); double stop = src_norm*invParam.tol*invParam.tol; // stopping condition of solver double alpha, beta; double pAp; double rNorm = sqrt(r2); double r0Norm = rNorm; double maxrx = rNorm; double maxrr = rNorm; double delta = invParam.reliable_delta; if (invParam.verbosity >= QUDA_VERBOSE) printfQuda("CG: %d iterations, r2 = %e\n", k, r2); quda::blas_flops = 0; stopwatchStart(); while (r2 > stop && k<invParam.maxiter) { matSloppy(Ap, p, tmp, tmp2); // tmp as tmp pAp = reDotProductCuda(p, Ap); alpha = r2 / pAp; r2_old = r2; r2 = axpyNormCuda(-alpha, Ap, rSloppy); // reliable update conditions rNorm = sqrt(r2); if (rNorm > maxrx) maxrx = rNorm; if (rNorm > maxrr) maxrr = rNorm; int updateX = (rNorm < delta*r0Norm && r0Norm <= maxrx) ? 1 : 0; int updateR = ((rNorm < delta*maxrr && r0Norm <= maxrr) || updateX) ? 1 : 0; if (!(updateR || updateX)) { beta = r2 / r2_old; axpyZpbxCuda(alpha, p, xSloppy, rSloppy, beta); } else { axpyCuda(alpha, p, xSloppy); if (x.Precision() != xSloppy.Precision()) copyCuda(x, xSloppy); xpyCuda(x, y); // swap these around? mat(r, y, x); // here we can use x as tmp r2 = xmyNormCuda(b, r); if (x.Precision() != rSloppy.Precision()) copyCuda(rSloppy, r); zeroCuda(xSloppy); rNorm = sqrt(r2); maxrr = rNorm; maxrx = rNorm; r0Norm = rNorm; rUpdate++; beta = r2 / r2_old; xpayCuda(rSloppy, beta, p); } k++; if (invParam.verbosity >= QUDA_VERBOSE) printfQuda("CG: %d iterations, r2 = %e\n", k, r2); } if (x.Precision() != xSloppy.Precision()) copyCuda(x, xSloppy); xpyCuda(y, x); invParam.secs = stopwatchReadSeconds(); if (k==invParam.maxiter) warningQuda("Exceeded maximum iterations %d", invParam.maxiter); if (invParam.verbosity >= QUDA_SUMMARIZE) printfQuda("CG: Reliable updates = %d\n", rUpdate); double gflops = (quda::blas_flops + mat.flops() + matSloppy.flops())*1e-9; reduceDouble(gflops); // printfQuda("%f gflops\n", gflops / stopwatchReadSeconds()); invParam.gflops = gflops; invParam.iter = k; quda::blas_flops = 0; if (invParam.verbosity >= QUDA_SUMMARIZE){ mat(r, x, y); double true_res = xmyNormCuda(b, r); printfQuda("CG: Converged after %d iterations, relative residua: iterated = %e, true = %e\n", k, sqrt(r2/src_norm), sqrt(true_res / src_norm)); } if (invParam.cuda_prec_sloppy != x.Precision()) { delete r_sloppy; delete x_sloppy; } return; }
int invert_test(void) { QudaGaugeParam gaugeParam = newQudaGaugeParam(); QudaInvertParam inv_param = newQudaInvertParam(); set_params(&gaugeParam, &inv_param, xdim, ydim, zdim, tdim, cpu_prec, prec, prec_sloppy, link_recon, link_recon_sloppy, mass, tol, 500, 1e-3, 0.8); // this must be before the FaceBuffer is created (this is because it allocates pinned memory - FIXME) initQuda(device); setDims(gaugeParam.X); setSpinorSiteSize(6); size_t gSize = (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float); for (int dir = 0; dir < 4; dir++) { qdp_fatlink[dir] = malloc(V*gaugeSiteSize*gSize); qdp_longlink[dir] = malloc(V*gaugeSiteSize*gSize); } fatlink = malloc(4*V*gaugeSiteSize*gSize); longlink = malloc(4*V*gaugeSiteSize*gSize); construct_fat_long_gauge_field(qdp_fatlink, qdp_longlink, 1, gaugeParam.cpu_prec, &gaugeParam, dslash_type); const double cos_pi_3 = 0.5; // Cos(pi/3) const double sin_pi_3 = sqrt(0.75); // Sin(pi/3) for(int dir=0; dir<4; ++dir){ for(int i=0; i<V; ++i){ for(int j=0; j<gaugeSiteSize; ++j){ if(gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION){ ((double*)qdp_fatlink[dir])[i*gaugeSiteSize + j] = 0.5*rand()/RAND_MAX; if(link_recon != QUDA_RECONSTRUCT_8 && link_recon != QUDA_RECONSTRUCT_12){ // incorporate non-trivial phase into long links if(j%2 == 0){ const double real = ((double*)qdp_longlink[dir])[i*gaugeSiteSize + j]; const double imag = ((double*)qdp_longlink[dir])[i*gaugeSiteSize + j + 1]; ((double*)qdp_longlink[dir])[i*gaugeSiteSize + j] = real*cos_pi_3 - imag*sin_pi_3; ((double*)qdp_longlink[dir])[i*gaugeSiteSize + j + 1] = real*sin_pi_3 + imag*cos_pi_3; } } ((double*)fatlink)[(i*4 + dir)*gaugeSiteSize + j] = ((double*)qdp_fatlink[dir])[i*gaugeSiteSize + j]; ((double*)longlink)[(i*4 + dir)*gaugeSiteSize + j] = ((double*)qdp_longlink[dir])[i*gaugeSiteSize + j]; }else{ ((float*)qdp_fatlink[dir])[i] = 0.5*rand()/RAND_MAX; if(link_recon != QUDA_RECONSTRUCT_8 && link_recon != QUDA_RECONSTRUCT_12){ // incorporate non-trivial phase into long links if(j%2 == 0){ const float real = ((float*)qdp_longlink[dir])[i*gaugeSiteSize + j]; const float imag = ((float*)qdp_longlink[dir])[i*gaugeSiteSize + j + 1]; ((float*)qdp_longlink[dir])[i*gaugeSiteSize + j] = real*cos_pi_3 - imag*sin_pi_3; ((float*)qdp_longlink[dir])[i*gaugeSiteSize + j + 1] = real*sin_pi_3 + imag*cos_pi_3; } } ((double*)fatlink)[(i*4 + dir)*gaugeSiteSize + j] = ((double*)qdp_fatlink[dir])[i*gaugeSiteSize + j]; ((float*)fatlink)[(i*4 + dir)*gaugeSiteSize + j] = ((float*)qdp_fatlink[dir])[i*gaugeSiteSize + j]; ((float*)longlink)[(i*4 + dir)*gaugeSiteSize + j] = ((float*)qdp_longlink[dir])[i*gaugeSiteSize + j]; } } } } ColorSpinorParam csParam; csParam.nColor=3; csParam.nSpin=1; csParam.nDim=4; for(int d = 0; d < 4; d++) { csParam.x[d] = gaugeParam.X[d]; } csParam.x[0] /= 2; csParam.precision = inv_param.cpu_prec; csParam.pad = 0; csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; csParam.gammaBasis = inv_param.gamma_basis; csParam.create = QUDA_ZERO_FIELD_CREATE; in = new cpuColorSpinorField(csParam); out = new cpuColorSpinorField(csParam); ref = new cpuColorSpinorField(csParam); tmp = new cpuColorSpinorField(csParam); if (inv_param.cpu_prec == QUDA_SINGLE_PRECISION){ constructSpinorField((float*)in->V()); }else{ constructSpinorField((double*)in->V()); } #ifdef MULTI_GPU int tmp_value = MAX(ydim*zdim*tdim/2, xdim*zdim*tdim/2); tmp_value = MAX(tmp_value, xdim*ydim*tdim/2); tmp_value = MAX(tmp_value, 
xdim*ydim*zdim/2); int fat_pad = tmp_value; int link_pad = 3*tmp_value; // FIXME: currently assume staggered is SU(3) gaugeParam.type = dslash_type == QUDA_STAGGERED_DSLASH ? QUDA_SU3_LINKS : QUDA_ASQTAD_FAT_LINKS; gaugeParam.reconstruct = QUDA_RECONSTRUCT_NO; GaugeFieldParam cpuFatParam(fatlink, gaugeParam); cpuFat = new cpuGaugeField(cpuFatParam); ghost_fatlink = (void**)cpuFat->Ghost(); gaugeParam.type = QUDA_ASQTAD_LONG_LINKS; GaugeFieldParam cpuLongParam(longlink, gaugeParam); cpuLong = new cpuGaugeField(cpuLongParam); ghost_longlink = (void**)cpuLong->Ghost(); gaugeParam.type = dslash_type == QUDA_STAGGERED_DSLASH ? QUDA_SU3_LINKS : QUDA_ASQTAD_FAT_LINKS; gaugeParam.ga_pad = fat_pad; gaugeParam.reconstruct= gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; gaugeParam.cuda_prec_precondition = QUDA_HALF_PRECISION; loadGaugeQuda(fatlink, &gaugeParam); if (dslash_type == QUDA_ASQTAD_DSLASH) { gaugeParam.type = QUDA_ASQTAD_LONG_LINKS; gaugeParam.ga_pad = link_pad; gaugeParam.reconstruct= link_recon; gaugeParam.reconstruct_sloppy = link_recon_sloppy; loadGaugeQuda(longlink, &gaugeParam); } #else gaugeParam.type = QUDA_ASQTAD_FAT_LINKS; gaugeParam.reconstruct = gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; gaugeParam.cuda_prec_precondition = QUDA_HALF_PRECISION; loadGaugeQuda(fatlink, &gaugeParam); if (dslash_type == QUDA_ASQTAD_DSLASH) { gaugeParam.type = QUDA_ASQTAD_LONG_LINKS; gaugeParam.reconstruct = link_recon; gaugeParam.reconstruct_sloppy = link_recon_sloppy; loadGaugeQuda(longlink, &gaugeParam); } #endif double time0 = -((double)clock()); // Start the timer double nrm2=0; double src2=0; int ret = 0; switch(test_type){ case 0: //even if(inv_type == QUDA_GCR_INVERTER){ inv_param.inv_type = QUDA_GCR_INVERTER; inv_param.gcrNkrylov = 50; }else if(inv_type == QUDA_PCG_INVERTER){ inv_param.inv_type = QUDA_PCG_INVERTER; } inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; invertQuda(out->V(), in->V(), &inv_param); time0 += clock(); time0 /= CLOCKS_PER_SEC; #ifdef MULTI_GPU matdagmat_mg4dir(ref, qdp_fatlink, qdp_longlink, ghost_fatlink, ghost_longlink, out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, QUDA_EVEN_PARITY); #else matdagmat(ref->V(), qdp_fatlink, qdp_longlink, out->V(), mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), QUDA_EVEN_PARITY); #endif mxpy(in->V(), ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); nrm2 = norm_2(ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); src2 = norm_2(in->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); break; case 1: //odd if(inv_type == QUDA_GCR_INVERTER){ inv_param.inv_type = QUDA_GCR_INVERTER; inv_param.gcrNkrylov = 50; }else if(inv_type == QUDA_PCG_INVERTER){ inv_param.inv_type = QUDA_PCG_INVERTER; } inv_param.matpc_type = QUDA_MATPC_ODD_ODD; invertQuda(out->V(), in->V(), &inv_param); time0 += clock(); // stop the timer time0 /= CLOCKS_PER_SEC; #ifdef MULTI_GPU matdagmat_mg4dir(ref, qdp_fatlink, qdp_longlink, ghost_fatlink, ghost_longlink, out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, QUDA_ODD_PARITY); #else matdagmat(ref->V(), qdp_fatlink, qdp_longlink, out->V(), mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), QUDA_ODD_PARITY); #endif mxpy(in->V(), ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); nrm2 = norm_2(ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); src2 = norm_2(in->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); break; case 2: //full spinor errorQuda("full spinor not supported\n"); break; case 3: //multi mass CG, even case 4: #define NUM_OFFSETS 12 { double 
masses[NUM_OFFSETS] ={0.002, 0.0021, 0.0064, 0.070, 0.077, 0.081, 0.1, 0.11, 0.12, 0.13, 0.14, 0.205}; inv_param.num_offset = NUM_OFFSETS; // these can be set independently for (int i=0; i<inv_param.num_offset; i++) { inv_param.tol_offset[i] = inv_param.tol; inv_param.tol_hq_offset[i] = inv_param.tol_hq; } void* outArray[NUM_OFFSETS]; int len; cpuColorSpinorField* spinorOutArray[NUM_OFFSETS]; spinorOutArray[0] = out; for(int i=1;i < inv_param.num_offset; i++){ spinorOutArray[i] = new cpuColorSpinorField(csParam); } for(int i=0;i < inv_param.num_offset; i++){ outArray[i] = spinorOutArray[i]->V(); inv_param.offset[i] = 4*masses[i]*masses[i]; } len=Vh; if (test_type == 3) { inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; } else { inv_param.matpc_type = QUDA_MATPC_ODD_ODD; } invertMultiShiftQuda(outArray, in->V(), &inv_param); cudaDeviceSynchronize(); time0 += clock(); // stop the timer time0 /= CLOCKS_PER_SEC; printfQuda("done: total time = %g secs, compute time = %g, %i iter / %g secs = %g gflops\n", time0, inv_param.secs, inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs); printfQuda("checking the solution\n"); QudaParity parity = QUDA_INVALID_PARITY; if (inv_param.solve_type == QUDA_NORMOP_SOLVE){ //parity = QUDA_EVENODD_PARITY; errorQuda("full parity not supported\n"); }else if (inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN){ parity = QUDA_EVEN_PARITY; }else if (inv_param.matpc_type == QUDA_MATPC_ODD_ODD){ parity = QUDA_ODD_PARITY; }else{ errorQuda("ERROR: invalid spinor parity \n"); exit(1); } for(int i=0;i < inv_param.num_offset;i++){ printfQuda("%dth solution: mass=%f, ", i, masses[i]); #ifdef MULTI_GPU matdagmat_mg4dir(ref, qdp_fatlink, qdp_longlink, ghost_fatlink, ghost_longlink, spinorOutArray[i], masses[i], 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, parity); #else matdagmat(ref->V(), qdp_fatlink, qdp_longlink, outArray[i], masses[i], 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), parity); #endif mxpy(in->V(), ref->V(), len*mySpinorSiteSize, inv_param.cpu_prec); double nrm2 = norm_2(ref->V(), len*mySpinorSiteSize, inv_param.cpu_prec); double src2 = norm_2(in->V(), len*mySpinorSiteSize, inv_param.cpu_prec); double hqr = sqrt(HeavyQuarkResidualNormCpu(*spinorOutArray[i], *ref).z); double l2r = sqrt(nrm2/src2); printfQuda("Shift %d residuals: (L2 relative) tol %g, QUDA = %g, host = %g; (heavy-quark) tol %g, QUDA = %g, host = %g\n", i, inv_param.tol_offset[i], inv_param.true_res_offset[i], l2r, inv_param.tol_hq_offset[i], inv_param.true_res_hq_offset[i], hqr); //emperical, if the cpu residue is more than 1 order the target accuracy, the it fails to converge if (sqrt(nrm2/src2) > 10*inv_param.tol_offset[i]){ ret |=1; } } for(int i=1; i < inv_param.num_offset;i++) delete spinorOutArray[i]; } break; default: errorQuda("Unsupported test type"); }//switch if (test_type <=2){ double hqr = sqrt(HeavyQuarkResidualNormCpu(*out, *ref).z); double l2r = sqrt(nrm2/src2); printfQuda("Residuals: (L2 relative) tol %g, QUDA = %g, host = %g; (heavy-quark) tol %g, QUDA = %g, host = %g\n", inv_param.tol, inv_param.true_res, l2r, inv_param.tol_hq, inv_param.true_res_hq, hqr); printfQuda("done: total time = %g secs, compute time = %g secs, %i iter / %g secs = %g gflops, \n", time0, inv_param.secs, inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs); } end(); return ret; }
void init() { gauge_param = newQudaGaugeParam(); inv_param = newQudaInvertParam(); gauge_param.X[0] = 12; gauge_param.X[1] = 12; gauge_param.X[2] = 12; gauge_param.X[3] = 12; setDims(gauge_param.X, Ls); gauge_param.anisotropy = 2.3; gauge_param.type = QUDA_WILSON_LINKS; gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER; gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T; gauge_param.cpu_prec = cpu_prec; gauge_param.cuda_prec = cuda_prec; gauge_param.reconstruct = QUDA_RECONSTRUCT_12; gauge_param.reconstruct_sloppy = gauge_param.reconstruct; gauge_param.cuda_prec_sloppy = gauge_param.cuda_prec; gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO; gauge_param.type = QUDA_WILSON_LINKS; inv_param.inv_type = QUDA_CG_INVERTER; inv_param.mass = 0.01; inv_param.m5 = -1.5; kappa5 = 0.5/(5 + inv_param.m5); inv_param.Ls = Ls; inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; inv_param.dagger = dagger; inv_param.cpu_prec = cpu_prec; inv_param.cuda_prec = cuda_prec; gauge_param.ga_pad = 0; inv_param.sp_pad = 0; inv_param.cl_pad = 0; inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; inv_param.dirac_order = QUDA_DIRAC_ORDER; if (test_type == 2) { inv_param.solution_type = QUDA_MAT_SOLUTION; } else { inv_param.solution_type = QUDA_MATPC_SOLUTION; } inv_param.dslash_type = QUDA_DOMAIN_WALL_DSLASH; inv_param.verbosity = QUDA_VERBOSE; // construct input fields for (int dir = 0; dir < 4; dir++) hostGauge[dir] = malloc(V*gaugeSiteSize*gauge_param.cpu_prec); ColorSpinorParam csParam; csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION; csParam.nColor = 3; csParam.nSpin = 4; csParam.nDim = 5; for (int d=0; d<4; d++) csParam.x[d] = gauge_param.X[d]; csParam.x[4] = Ls; csParam.precision = inv_param.cpu_prec; csParam.pad = 0; if (test_type < 2) { csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; csParam.x[0] /= 2; } else { csParam.siteSubset = QUDA_FULL_SITE_SUBSET; } csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; csParam.gammaBasis = inv_param.gamma_basis; csParam.create = QUDA_ZERO_FIELD_CREATE; spinor = new cpuColorSpinorField(csParam); spinorOut = new cpuColorSpinorField(csParam); spinorRef = new cpuColorSpinorField(csParam); csParam.siteSubset = QUDA_FULL_SITE_SUBSET; csParam.x[0] = gauge_param.X[0]; printfQuda("Randomizing fields... 
"); construct_gauge_field(hostGauge, 1, gauge_param.cpu_prec, &gauge_param); spinor->Source(QUDA_RANDOM_SOURCE); printfQuda("done.\n"); fflush(stdout); int dev = 0; initQuda(dev); printfQuda("Sending gauge field to GPU\n"); loadGaugeQuda(hostGauge, &gauge_param); if (!transfer) { csParam.fieldLocation = QUDA_CUDA_FIELD_LOCATION; csParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS; csParam.pad = inv_param.sp_pad; csParam.precision = inv_param.cuda_prec; if (csParam.precision == QUDA_DOUBLE_PRECISION ) { csParam.fieldOrder = QUDA_FLOAT2_FIELD_ORDER; } else { /* Single and half */ csParam.fieldOrder = QUDA_FLOAT4_FIELD_ORDER; } if (test_type < 2) { csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; csParam.x[0] /= 2; } printfQuda("Creating cudaSpinor\n"); cudaSpinor = new cudaColorSpinorField(csParam); printfQuda("Creating cudaSpinorOut\n"); cudaSpinorOut = new cudaColorSpinorField(csParam); if (test_type == 2) csParam.x[0] /= 2; csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; tmp = new cudaColorSpinorField(csParam); printfQuda("Sending spinor field to GPU\n"); *cudaSpinor = *spinor; std::cout << "Source: CPU = " << norm2(*spinor) << ", CUDA = " << norm2(*cudaSpinor) << std::endl; bool pc = (test_type != 2); DiracParam diracParam; setDiracParam(diracParam, &inv_param, pc); diracParam.verbose = QUDA_DEBUG_VERBOSE; diracParam.tmp1 = tmp; diracParam.tmp2 = tmp2; dirac = Dirac::create(diracParam); } else { std::cout << "Source: CPU = " << norm2(*spinor) << std::endl; } }
int invert_test(void) { QudaGaugeParam gaugeParam = newQudaGaugeParam(); QudaInvertParam inv_param = newQudaInvertParam(); double mass = 0.1; set_params(&gaugeParam, &inv_param, xdim, ydim, zdim, tdim, cpu_prec, prec, prec_sloppy, link_recon, link_recon_sloppy, mass, tol, 500, 1e-3, 0.8); // this must be before the FaceBuffer is created (this is because it allocates pinned memory - FIXME) initQuda(device); setDims(gaugeParam.X); setDimConstants(gaugeParam.X); size_t gSize = (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float); for (int dir = 0; dir < 4; dir++) { fatlink[dir] = malloc(V*gaugeSiteSize*gSize); longlink[dir] = malloc(V*gaugeSiteSize*gSize); } construct_fat_long_gauge_field(fatlink, longlink, 1, gaugeParam.cpu_prec, &gaugeParam); for (int dir = 0; dir < 4; dir++) { for(int i = 0;i < V*gaugeSiteSize;i++){ if (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION){ ((double*)fatlink[dir])[i] = 0.5 *rand()/RAND_MAX; }else{ ((float*)fatlink[dir])[i] = 0.5* rand()/RAND_MAX; } } } ColorSpinorParam csParam; csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION; csParam.nColor=3; csParam.nSpin=1; csParam.nDim=4; for(int d = 0; d < 4; d++) { csParam.x[d] = gaugeParam.X[d]; } csParam.x[0] /= 2; csParam.precision = inv_param.cpu_prec; csParam.pad = 0; csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; csParam.gammaBasis = inv_param.gamma_basis; csParam.create = QUDA_ZERO_FIELD_CREATE; in = new cpuColorSpinorField(csParam); out = new cpuColorSpinorField(csParam); ref = new cpuColorSpinorField(csParam); tmp = new cpuColorSpinorField(csParam); if (inv_param.cpu_prec == QUDA_SINGLE_PRECISION){ constructSpinorField((float*)in->V()); }else{ constructSpinorField((double*)in->V()); } int tmp_value = MAX(ydim*zdim*tdim/2, xdim*zdim*tdim/2); tmp_value = MAX(tmp_value, xdim*ydim*tdim/2); tmp_value = MAX(tmp_value, xdim*ydim*zdim/2); int fat_pad = tmp_value; int link_pad = 3*tmp_value; #ifdef MULTI_GPU gaugeParam.type = QUDA_ASQTAD_FAT_LINKS; gaugeParam.reconstruct = QUDA_RECONSTRUCT_NO; GaugeFieldParam cpuFatParam(fatlink, gaugeParam); cpuFat = new cpuGaugeField(cpuFatParam); cpuFat->exchangeGhost(); ghost_fatlink = (void**)cpuFat->Ghost(); gaugeParam.type = QUDA_ASQTAD_LONG_LINKS; GaugeFieldParam cpuLongParam(longlink, gaugeParam); cpuLong = new cpuGaugeField(cpuLongParam); cpuLong->exchangeGhost(); ghost_longlink = (void**)cpuLong->Ghost(); #endif if(testtype == 6){ record_gauge(gaugeParam.X, fatlink, fat_pad, longlink, link_pad, link_recon, link_recon_sloppy, &gaugeParam); }else{ #ifdef MULTI_GPU gaugeParam.type = QUDA_ASQTAD_FAT_LINKS; gaugeParam.ga_pad = fat_pad; gaugeParam.reconstruct= gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; loadGaugeQuda(fatlink, &gaugeParam); gaugeParam.type = QUDA_ASQTAD_LONG_LINKS; gaugeParam.ga_pad = link_pad; gaugeParam.reconstruct= link_recon; gaugeParam.reconstruct_sloppy = link_recon_sloppy; loadGaugeQuda(longlink, &gaugeParam); #else gaugeParam.type = QUDA_ASQTAD_FAT_LINKS; gaugeParam.reconstruct = gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; loadGaugeQuda(fatlink, &gaugeParam); gaugeParam.type = QUDA_ASQTAD_LONG_LINKS; gaugeParam.reconstruct = link_recon; gaugeParam.reconstruct_sloppy = link_recon_sloppy; loadGaugeQuda(longlink, &gaugeParam); #endif } double time0 = -((double)clock()); // Start the timer unsigned long volume = Vh; unsigned long nflops=2*1187; //from MILC's CG routine double nrm2=0; double src2=0; int ret = 0; 
switch(testtype){ case 0: //even volume = Vh; inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; invertQuda(out->V(), in->V(), &inv_param); time0 += clock(); time0 /= CLOCKS_PER_SEC; #ifdef MULTI_GPU matdagmat_mg4dir(ref, fatlink, longlink, ghost_fatlink, ghost_longlink, out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, QUDA_EVEN_PARITY); #else matdagmat(ref->V(), fatlink, longlink, out->V(), mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), QUDA_EVEN_PARITY); #endif mxpy(in->V(), ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); nrm2 = norm_2(ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); src2 = norm_2(in->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); { double sol = norm_2(out->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); double refe = norm_2(ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); } break; case 1: //odd volume = Vh; inv_param.matpc_type = QUDA_MATPC_ODD_ODD; invertQuda(out->V(), in->V(), &inv_param); time0 += clock(); // stop the timer time0 /= CLOCKS_PER_SEC; #ifdef MULTI_GPU matdagmat_mg4dir(ref, fatlink, longlink, ghost_fatlink, ghost_longlink, out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, QUDA_ODD_PARITY); #else matdagmat(ref->V(), fatlink, longlink, out->V(), mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), QUDA_ODD_PARITY); #endif mxpy(in->V(), ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); nrm2 = norm_2(ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); src2 = norm_2(in->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec); break; case 2: //full spinor errorQuda("full spinor not supported\n"); break; case 3: //multi mass CG, even case 4: case 5: case 6: #define NUM_OFFSETS 7 nflops = 2*(1205 + 15* NUM_OFFSETS); //from MILC's multimass CG routine double masses[NUM_OFFSETS] ={5.05, 1.23, 2.64, 2.33, 2.70, 2.77, 2.81}; double offsets[NUM_OFFSETS]; int num_offsets =NUM_OFFSETS; void* outArray[NUM_OFFSETS]; int len; cpuColorSpinorField* spinorOutArray[NUM_OFFSETS]; spinorOutArray[0] = out; for(int i=1;i < num_offsets; i++){ spinorOutArray[i] = new cpuColorSpinorField(csParam); } for(int i=0;i < num_offsets; i++){ outArray[i] = spinorOutArray[i]->V(); } for (int i=0; i< num_offsets;i++){ offsets[i] = 4*masses[i]*masses[i]; } len=Vh; volume = Vh; if (testtype == 3 || testtype == 6){ inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; } else if (testtype == 4){ inv_param.matpc_type = QUDA_MATPC_ODD_ODD; }else { //testtype ==5 errorQuda("test 5 not supported\n"); } double residue_sq; if (testtype == 6){ invertMultiShiftQudaMixed(outArray, in->V(), &inv_param, offsets, num_offsets, &residue_sq); }else{ invertMultiShiftQuda(outArray, in->V(), &inv_param, offsets, num_offsets, &residue_sq); } cudaThreadSynchronize(); printfQuda("Final residue squred =%g\n", residue_sq); time0 += clock(); // stop the timer time0 /= CLOCKS_PER_SEC; printfQuda("done: total time = %g secs, %i iter / %g secs = %g gflops, \n", time0, inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs); printfQuda("checking the solution\n"); QudaParity parity; if (inv_param.solve_type == QUDA_NORMEQ_SOLVE){ //parity = QUDA_EVENODD_PARITY; errorQuda("full parity not supported\n"); }else if (inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN){ parity = QUDA_EVEN_PARITY; }else if (inv_param.matpc_type == QUDA_MATPC_ODD_ODD){ parity = QUDA_ODD_PARITY; }else{ errorQuda("ERROR: invalid spinor parity \n"); exit(1); } for(int i=0;i < num_offsets;i++){ printfQuda("%dth solution: mass=%f, ", i, masses[i]); #ifdef MULTI_GPU matdagmat_mg4dir(ref, fatlink, longlink, ghost_fatlink, 
ghost_longlink, spinorOutArray[i], masses[i], 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, parity); #else matdagmat(ref->V(), fatlink, longlink, outArray[i], masses[i], 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), parity); #endif mxpy(in->V(), ref->V(), len*mySpinorSiteSize, inv_param.cpu_prec); double nrm2 = norm_2(ref->V(), len*mySpinorSiteSize, inv_param.cpu_prec); double src2 = norm_2(in->V(), len*mySpinorSiteSize, inv_param.cpu_prec); printfQuda("relative residual, requested = %g, actual = %g\n", inv_param.tol, sqrt(nrm2/src2)); // empirical: if the cpu residual is more than two orders of magnitude above the target accuracy, then it has failed to converge if (sqrt(nrm2/src2) > 100*inv_param.tol){ ret |=1; } } if (ret ==1){ errorQuda("Convergence failed!\n"); } for(int i=1; i < num_offsets;i++){ delete spinorOutArray[i]; } }//switch if (testtype <=2){ printfQuda("Relative residual, requested = %g, actual = %g\n", inv_param.tol, sqrt(nrm2/src2)); printfQuda("done: total time = %g secs, %i iter / %g secs = %g gflops, \n", time0, inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs); // empirical: if the cpu residual is more than two orders of magnitude above the target accuracy, then it has failed to converge if (sqrt(nrm2/src2) > 100*inv_param.tol){ ret = 1; errorQuda("Convergence failed!\n"); } } end(); return ret; }
void init(int argc, char **argv) { kernelPackT = false; // Set true for kernel T face packing cuda_prec= prec; gauge_param = newQudaGaugeParam(); inv_param = newQudaInvertParam(); gauge_param.X[0] = xdim; gauge_param.X[1] = ydim; gauge_param.X[2] = zdim; gauge_param.X[3] = tdim; setDims(gauge_param.X); gauge_param.anisotropy = 1.0; gauge_param.type = QUDA_WILSON_LINKS; gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER; gauge_param.t_boundary = QUDA_PERIODIC_T; gauge_param.cpu_prec = cpu_prec; gauge_param.cuda_prec = cuda_prec; gauge_param.reconstruct = link_recon; gauge_param.reconstruct_sloppy = link_recon; gauge_param.cuda_prec_sloppy = cuda_prec; gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO; inv_param.kappa = 0.1; if (dslash_type == QUDA_TWISTED_MASS_DSLASH) { inv_param.mu = 0.01; inv_param.twist_flavor = QUDA_TWIST_MINUS; } inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; inv_param.dagger = dagger; inv_param.cpu_prec = cpu_prec; if (inv_param.cpu_prec != gauge_param.cpu_prec) errorQuda("Gauge and spinor cpu precisions must match"); inv_param.cuda_prec = cuda_prec; #ifndef MULTI_GPU // free parameter for single GPU gauge_param.ga_pad = 0; #else // must be this one c/b face for multi gpu int x_face_size = gauge_param.X[1]*gauge_param.X[2]*gauge_param.X[3]/2; int y_face_size = gauge_param.X[0]*gauge_param.X[2]*gauge_param.X[3]/2; int z_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[3]/2; int t_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[2]/2; int pad_size =MAX(x_face_size, y_face_size); pad_size = MAX(pad_size, z_face_size); pad_size = MAX(pad_size, t_face_size); gauge_param.ga_pad = pad_size; #endif inv_param.sp_pad = 0; inv_param.cl_pad = 0; //inv_param.sp_pad = 24*24*24; //inv_param.cl_pad = 24*24*24; inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // test code only supports DeGrand-Rossi Basis inv_param.dirac_order = QUDA_DIRAC_ORDER; if (test_type == 2) { inv_param.solution_type = QUDA_MAT_SOLUTION; } else { inv_param.solution_type = QUDA_MATPC_SOLUTION; } inv_param.dslash_type = dslash_type; if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) { inv_param.clover_cpu_prec = cpu_prec; inv_param.clover_cuda_prec = cuda_prec; inv_param.clover_cuda_prec_sloppy = inv_param.clover_cuda_prec; inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER; //if (test_type > 0) { hostClover = malloc(V*cloverSiteSize*inv_param.clover_cpu_prec); hostCloverInv = hostClover; // fake it /*} else { hostClover = NULL; hostCloverInv = malloc(V*cloverSiteSize*inv_param.clover_cpu_prec); }*/ } else if (dslash_type == QUDA_TWISTED_MASS_DSLASH) { } //inv_param.verbosity = QUDA_VERBOSE; // construct input fields for (int dir = 0; dir < 4; dir++) hostGauge[dir] = malloc(V*gaugeSiteSize*gauge_param.cpu_prec); ColorSpinorParam csParam; csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION; csParam.nColor = 3; csParam.nSpin = 4; if (dslash_type == QUDA_TWISTED_MASS_DSLASH) { csParam.twistFlavor = inv_param.twist_flavor; } csParam.nDim = 4; for (int d=0; d<4; d++) csParam.x[d] = gauge_param.X[d]; csParam.precision = inv_param.cpu_prec; csParam.pad = 0; if (test_type < 2) { csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; csParam.x[0] /= 2; } else { csParam.siteSubset = QUDA_FULL_SITE_SUBSET; } csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER; csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; csParam.gammaBasis = inv_param.gamma_basis; csParam.create = QUDA_ZERO_FIELD_CREATE; //csParam.verbose = QUDA_DEBUG_VERBOSE; spinor = new cpuColorSpinorField(csParam); spinorOut = new cpuColorSpinorField(csParam); 
spinorRef = new cpuColorSpinorField(csParam); csParam.siteSubset = QUDA_FULL_SITE_SUBSET; csParam.x[0] = gauge_param.X[0]; printfQuda("Randomizing fields... "); if (strcmp(latfile,"")) { // load in the command line supplied gauge field read_gauge_field(latfile, hostGauge, gauge_param.cpu_prec, gauge_param.X, argc, argv); construct_gauge_field(hostGauge, 2, gauge_param.cpu_prec, &gauge_param); } else { // else generate a random SU(3) field construct_gauge_field(hostGauge, 1, gauge_param.cpu_prec, &gauge_param); } spinor->Source(QUDA_RANDOM_SOURCE); if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) { double norm = 0.0; // clover components are random numbers in the range (-norm, norm) double diag = 1.0; // constant added to the diagonal if (test_type == 2) { construct_clover_field(hostClover, norm, diag, inv_param.clover_cpu_prec); } else { construct_clover_field(hostCloverInv, norm, diag, inv_param.clover_cpu_prec); } } printfQuda("done.\n"); fflush(stdout); initQuda(device); printfQuda("Sending gauge field to GPU\n"); loadGaugeQuda(hostGauge, &gauge_param); if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) { printfQuda("Sending clover field to GPU\n"); loadCloverQuda(hostClover, hostCloverInv, &inv_param); //clover = cudaCloverPrecise; } if (!transfer) { csParam.fieldLocation = QUDA_CUDA_FIELD_LOCATION; csParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS; csParam.pad = inv_param.sp_pad; csParam.precision = inv_param.cuda_prec; if (csParam.precision == QUDA_DOUBLE_PRECISION ) { csParam.fieldOrder = QUDA_FLOAT2_FIELD_ORDER; } else { /* Single and half */ csParam.fieldOrder = QUDA_FLOAT4_FIELD_ORDER; } if (test_type < 2) { csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; csParam.x[0] /= 2; } printfQuda("Creating cudaSpinor\n"); cudaSpinor = new cudaColorSpinorField(csParam); printfQuda("Creating cudaSpinorOut\n"); cudaSpinorOut = new cudaColorSpinorField(csParam); if (test_type == 2) csParam.x[0] /= 2; csParam.siteSubset = QUDA_PARITY_SITE_SUBSET; tmp1 = new cudaColorSpinorField(csParam); if (dslash_type == QUDA_CLOVER_WILSON_DSLASH || dslash_type == QUDA_TWISTED_MASS_DSLASH) { tmp2 = new cudaColorSpinorField(csParam); } printfQuda("Sending spinor field to GPU\n"); *cudaSpinor = *spinor; std::cout << "Source: CPU = " << norm2(*spinor) << ", CUDA = " << norm2(*cudaSpinor) << std::endl; bool pc = (test_type != 2); DiracParam diracParam; setDiracParam(diracParam, &inv_param, pc); diracParam.verbose = QUDA_VERBOSE; diracParam.tmp1 = tmp1; diracParam.tmp2 = tmp2; dirac = Dirac::create(diracParam); } else { std::cout << "Source: CPU = " << norm2(*spinor) << std::endl; } }
int main(int argc, char **argv) { for (int i =1;i < argc; i++){ if(process_command_line_option(argc, argv, &i) == 0){ continue; } fprintf(stderr, "ERROR: Invalid option:%s\n", argv[i]); usage(argv); } initCommsQuda(argc, argv, gridsize_from_cmdline, 4); display_test_info(); init(argc, argv); float spinorGiB = (float)Vh*spinorSiteSize*inv_param.cuda_prec / (1 << 30); printfQuda("\nSpinor mem: %.3f GiB\n", spinorGiB); printfQuda("Gauge mem: %.3f GiB\n", gauge_param.gaugeGiB); int attempts = 1; dslashRef(); for (int i=0; i<attempts; i++) { if (tune) { // warm-up run printfQuda("Tuning...\n"); setDslashTuning(QUDA_TUNE_YES, QUDA_VERBOSE); dslashCUDA(1); } printfQuda("Executing %d kernel loops...\n", loops); dirac->Flops(); double secs = dslashCUDA(loops); printfQuda("done.\n\n"); #ifdef DSLASH_PROFILING printDslashProfile(); #endif if (!transfer) *spinorOut = *cudaSpinorOut; // print timing information printfQuda("%fms per loop\n", 1000*secs); unsigned long long flops = 0; if (!transfer) flops = dirac->Flops(); int spinor_floats = test_type ? 2*(7*24+24)+24 : 7*24+24; if (inv_param.cuda_prec == QUDA_HALF_PRECISION) spinor_floats += test_type ? 2*(7*2 + 2) + 2 : 7*2 + 2; // relative size of norm is twice a short int gauge_floats = (test_type ? 2 : 1) * (gauge_param.gauge_fix ? 6 : 8) * gauge_param.reconstruct; if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) { gauge_floats += test_type ? 72*2 : 72; } printfQuda("GFLOPS = %f\n", 1.0e-9*flops/secs); printfQuda("GB/s = %f\n\n", Vh*(spinor_floats+gauge_floats)*inv_param.cuda_prec/((secs/loops)*1e+9)); if (!transfer) { double norm2_cpu = norm2(*spinorRef); double norm2_cuda= norm2(*cudaSpinorOut); double norm2_cpu_cuda= norm2(*spinorOut); printfQuda("Results: CPU = %f, CUDA=%f, CPU-CUDA = %f\n", norm2_cpu, norm2_cuda, norm2_cpu_cuda); } else { double norm2_cpu = norm2(*spinorRef); double norm2_cpu_cuda= norm2(*spinorOut); printfQuda("Result: CPU = %f, CPU-QUDA = %f\n", norm2_cpu, norm2_cpu_cuda); } cpuColorSpinorField::Compare(*spinorRef, *spinorOut); } end(); endCommsQuda(); }
/*! * * Generic version of the multi-shift solver. Should work for * most fermions. Note, offset[0] is not folded into the mass parameter */ void invertMultiShiftQuda(void **_hp_x, void *_hp_b, QudaInvertParam *param, double* offsets, int num_offsets, double* residue_sq) { // check the gauge fields have been created cudaGaugeField *cudaGauge = checkGauge(param); checkInvertParam(param); param->num_offset = num_offsets; if (param->num_offset > QUDA_MAX_MULTI_SHIFT) errorQuda("Number of shifts %d requested greater than QUDA_MAX_MULTI_SHIFT %d", param->num_offset, QUDA_MAX_MULTI_SHIFT); for (int i=0; i<param->num_offset; i++) { param->offset[i] = offsets[i]; param->tol_offset[i] = residue_sq[i]; } verbosity = param->verbosity; // Are we doing a preconditioned solve */ /* What does NormEq solve mean in the shifted case? */ if (param->solve_type != QUDA_NORMEQ_PC_SOLVE && param->solve_type != QUDA_NORMEQ_SOLVE) { errorQuda("Direct solve_type is not supported in invertMultiShiftQuda()\n"); } bool pc_solve = (param->solve_type == QUDA_NORMEQ_PC_SOLVE); // In principle one can do a MATPC Solution for a hermitian M_pc // In practice most of the time I guess one will do a M^\dagger_pc M_pc solution. bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION || param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION ); // No of GiB in a checkerboard of a spinor param->spinorGiB = cudaGauge->VolumeCB() * spinorSiteSize; if( !pc_solve) param->spinorGiB *= 2; // Double volume for non PC solve // **** WARNING *** this may not match implementation... if( param->inv_type == QUDA_CG_INVERTER ) { // CG-M needs 5 vectors for the smallest shift + 2 for each additional shift param->spinorGiB *= (5 + 2*(param->num_offset-1))/(double)(1<<30); } else { // BiCGStab-M needs 7 for the original shift + 2 for each additional shift + 1 auxiliary // (Jegerlehner hep-lat/9612014 eq (3.13) param->spinorGiB *= (7 + 2*(param->num_offset-1))/(double)(1<<30); } // Timing and FLOP counters param->secs = 0; param->gflops = 0; param->iter = 0; // Find the smallest shift and its offset. double low_offset = param->offset[0]; int low_index = 0; for (int i=1;i < param->num_offset;i++){ if (param->offset[i] < low_offset){ low_offset = param->offset[i]; low_index = i; } } // Host pointers for x, take a copy of the input host pointers void** hp_x; hp_x = new void* [ param->num_offset ]; void* hp_b = _hp_b; for(int i=0;i < param->num_offset;i++){ hp_x[i] = _hp_x[i]; } // Now shift things so that the vector with the smallest shift // is in the first position of the array if (low_index != 0){ void* tmp = hp_x[0]; hp_x[0] = hp_x[low_index] ; hp_x[low_index] = tmp; double tmp1 = param->offset[0]; param->offset[0]= param->offset[low_index]; param->offset[low_index] =tmp1; } // Create the matrix. // The way this works is that createDirac will create 'd' and 'dSloppy' // which are global. We then grab these with references... // // Balint: Isn't there a nice construction pattern we could use here? This is // expedient but yucky. DiracParam diracParam; if (param->dslash_type == QUDA_ASQTAD_DSLASH){ param->mass = sqrt(param->offset[0]/4); } createDirac(diracParam, *param, pc_solve); Dirac &dirac = *d; Dirac &diracSloppy = *dSloppy; cpuColorSpinorField *h_b = NULL; // Host RHS cpuColorSpinorField **h_x = NULL; cudaColorSpinorField *b = NULL; // Cuda RHS cudaColorSpinorField **x = NULL; // Cuda Solutions // Grab the dimension array of the input gauge field. const int *X = ( param->dslash_type == QUDA_ASQTAD_DSLASH ) ? 
gaugeFatPrecise->X() : gaugeFatPrecise->X(); // Wrap CPU host side pointers // // Balint: This creates a ColorSpinorParam struct, from the host data pointer, // the definitions in param, the dimensions X, and whether the solution is on // a checkerboard instruction or not. These can then be used as 'instructions' // to create the actual colorSpinorField ColorSpinorParam cpuParam(hp_b, *param, X, pc_solution); h_b = new cpuColorSpinorField(cpuParam); h_x = new cpuColorSpinorField* [ param->num_offset ]; // DYNAMIC ALLOCATION for(int i=0; i < param->num_offset; i++) { cpuParam.v = hp_x[i]; h_x[i] = new cpuColorSpinorField(cpuParam); } // Now I need a colorSpinorParam for the device ColorSpinorParam cudaParam(cpuParam, *param); // This setting will download a host vector cudaParam.create = QUDA_COPY_FIELD_CREATE; b = new cudaColorSpinorField(*h_b, cudaParam); // Creates b and downloads h_b to it // Create the solution fields filled with zero x = new cudaColorSpinorField* [ param->num_offset ]; cudaParam.create = QUDA_ZERO_FIELD_CREATE; for(int i=0; i < param->num_offset; i++) { x[i] = new cudaColorSpinorField(cudaParam); } // Check source norms if( param->verbosity >= QUDA_VERBOSE ) { double nh_b = norm2(*h_b); double nb = norm2(*b); printfQuda("Source: CPU= %f, CUDA copy = %f\n", nh_b,nb); } // tune the Dirac Kernel tuneDirac(*param, pc_solution ? *(x[0]) : (x[0])->Even()); massRescale(param->dslash_type, diracParam.kappa, param->solution_type, param->mass_normalization, *b); double *rescaled_shifts = new double [param->num_offset]; for(int i=0; i < param->num_offset; i++){ rescaled_shifts[i] = param->offset[i]; massRescaleCoeff(param->dslash_type, diracParam.kappa, param->solution_type, param->mass_normalization, rescaled_shifts[i]); } { DiracMdagM m(dirac), mSloppy(diracSloppy); MultiShiftCG cg_m(m, mSloppy, *param); cg_m(x, *b); } delete [] rescaled_shifts; for(int i=0; i < param->num_offset; i++) { x[i]->saveCPUSpinorField(*h_x[i]); } for(int i=0; i < param->num_offset; i++){ delete h_x[i]; delete x[i]; } delete h_b; delete b; delete [] h_x; delete [] x; delete [] hp_x; if (!param->preserve_dirac) { delete d; d =NULL; delete dSloppy; dSloppy = NULL; delete dPre; dPre = NULL; diracCreation = false; diracTune = false; } return; }