/* * Read tunecache from disk. */ void loadTuneCache(QudaVerbosity verbosity) { char *path; struct stat pstat; std::string cache_path, line, token; std::ifstream cache_file; std::stringstream ls; path = getenv("QUDA_RESOURCE_PATH"); if (!path) { warningQuda("Environment variable QUDA_RESOURCE_PATH is not set."); warningQuda("Caching of tuned parameters will be disabled."); return; } else if (stat(path, &pstat) || !S_ISDIR(pstat.st_mode)) { warningQuda("The path \"%s\" specified by QUDA_RESOURCE_PATH does not exist or is not a directory.", path); warningQuda("Caching of tuned parameters will be disabled."); return; } else { resource_path = path; } #ifdef MULTI_GPU if (comm_rank() == 0) { #endif cache_path = resource_path; cache_path += "/tunecache.tsv"; cache_file.open(cache_path.c_str()); if (cache_file) { if (!cache_file.good()) errorQuda("Bad format in %s", cache_path.c_str()); getline(cache_file, line); ls.str(line); ls >> token; if (token.compare("tunecache")) errorQuda("Bad format in %s", cache_path.c_str()); ls >> token; if (token.compare(quda_version)) errorQuda("Cache file %s does not match current QUDA version", cache_path.c_str()); ls >> token; if (token.compare(quda_hash)) warningQuda("Cache file %s does not match current QUDA build", cache_path.c_str()); if (!cache_file.good()) errorQuda("Bad format in %s", cache_path.c_str()); getline(cache_file, line); // eat the blank line if (!cache_file.good()) errorQuda("Bad format in %s", cache_path.c_str()); getline(cache_file, line); // eat the description line deserializeTuneCache(cache_file); cache_file.close(); initial_cache_size = tunecache.size(); if (verbosity >= QUDA_SUMMARIZE) { printfQuda("Loaded %d sets of cached parameters from %s\n", static_cast<int>(initial_cache_size), cache_path.c_str()); } } else {
void TimeProfile::PrintGlobal() { if (global_profile[QUDA_PROFILE_TOTAL].time > 0.0) { printfQuda("\n %20s Total time = %g secs\n", "QUDA", global_profile[QUDA_PROFILE_TOTAL].time); } double accounted = 0.0; bool print_timer = true; // whether to print that timer for (int i=0; i<QUDA_PROFILE_COUNT-1; i++) { if (i==QUDA_PROFILE_LOWER_LEVEL) print_timer=false; // we do not want to print detailed lower level timers if (global_profile[i].count > 0) { if (print_timer) printfQuda(" %17s = %f secs (%6.3g%%), with %8d calls at %e us per call\n", (const char*)&pname[i][0], global_profile[i].time, 100*global_profile[i].time/global_profile[QUDA_PROFILE_TOTAL].time, global_profile[i].count, 1e6*global_profile[i].time/global_profile[i].count); accounted += global_profile[i].time; } } if (accounted > 0.0) { double missing = global_profile[QUDA_PROFILE_TOTAL].time - accounted; printfQuda(" total accounted = %f secs (%6.3g%%)\n", accounted, 100*accounted/global_profile[QUDA_PROFILE_TOTAL].time); printfQuda(" total missing = %f secs (%6.3g%%)\n", missing, 100*missing/global_profile[QUDA_PROFILE_TOTAL].time); } if (accounted > global_profile[QUDA_PROFILE_TOTAL].time) { warningQuda("Accounted time %f secs in %s is greater than total time %f secs\n", accounted, "QUDA", global_profile[QUDA_PROFILE_TOTAL].time); } }
/**< Print out the profile information */ void TimeProfile::Print() { if (profile[QUDA_PROFILE_TOTAL].time > 0.0) { printfQuda("\n %20s Total time = %g secs\n", fname.c_str(), profile[QUDA_PROFILE_TOTAL].time); } double accounted = 0.0; for (int i=0; i<QUDA_PROFILE_COUNT-1; i++) { if (profile[i].count > 0) { printfQuda(" %17s = %f secs (%6.3g%%), with %8d calls at %e us per call\n", (const char*)&pname[i][0], profile[i].time, 100*profile[i].time/profile[QUDA_PROFILE_TOTAL].time, profile[i].count, 1e6*profile[i].time/profile[i].count); accounted += profile[i].time; } } if (accounted > 0.0) { double missing = profile[QUDA_PROFILE_TOTAL].time - accounted; printfQuda(" total accounted = %f secs (%6.3g%%)\n", accounted, 100*accounted/profile[QUDA_PROFILE_TOTAL].time); printfQuda(" total missing = %f secs (%6.3g%%)\n", missing, 100*missing/profile[QUDA_PROFILE_TOTAL].time); } if (accounted > profile[QUDA_PROFILE_TOTAL].time) { warningQuda("Accounted time %f secs in %s is greater than total time %f secs\n", accounted, (const char*)&fname[0], profile[QUDA_PROFILE_TOTAL].time); } }
/**
 * Parse a comma-separated list of CPU cores into an integer array.
 *
 * Each comma-separated item is handed to process_core_string_item(), so an
 * item is either a single number ("5") or an inclusive range ("4-6").
 *
 * @param _str   NUL-terminated input string; it is not modified.
 * @param list   Output array receiving the core numbers.
 * @param ncores In: capacity of @p list.  Out: number of entries written
 *               (only updated on success).
 * @return 0 on success, -1 on any error (bad argument, input too long,
 *         malformed item, or not enough room in @p list).
 */
static int process_core_string_list(const char* _str, int* list, int* ncores)
{
  if (_str == NULL || list == NULL || ncores == NULL || *ncores <= 0) {
    warningQuda("Bad argument");
    return -1;
  }

  // strtok() modifies its argument, so tokenise a private copy.
  // strncpy() zero-pads a short source but does NOT terminate the destination
  // when the source fills it: pre-clear the last byte and check whether the
  // copy overwrote it.  Rejecting is safer than silently truncating a number.
  char str[256];
  str[sizeof(str) - 1] = '\0';
  strncpy(str, _str, sizeof(str));
  if (str[sizeof(str) - 1] != '\0') {
    warningQuda("Core list string is too long (limit is %d characters)",
                static_cast<int>(sizeof(str) - 1));
    return -1;
  }

  int left_space = *ncores;
  int tot_cores = 0;

  char* item = strtok(str, ",");
  if (item == NULL) {
    // Report the caller's string: the local copy has no tokens to show.
    warningQuda("Invalid string format (%s)", _str);
    return -1;
  }

  do {
    if (left_space <= 0) {
      warningQuda("Not enough space in list for item (%s)", item);
      return -1;
    }

    int sub_ncores = left_space;
    int* sub_list = list + tot_cores;

    int rc = process_core_string_item(item, sub_list, &sub_ncores);
    if (rc < 0) {
      warningQuda("Processing item (%s) failed", item);
      return -1;
    }

    // Never trust the reported count blindly: a bogus value would corrupt
    // the running totals and let a later item write outside of @list.
    if (sub_ncores <= 0 || sub_ncores > left_space) {
      warningQuda("Processing item (%s) failed", item);
      return -1;
    }

    tot_cores += sub_ncores;
    left_space -= sub_ncores;
    item = strtok(NULL, ",");
  } while (item != NULL);

  *ncores = tot_cores;
  return 0;
}
/**
 * Parse one core specifier into a list of core numbers.
 *
 * The input is expected to be one of:
 *   1. a single number, e.g. "5"
 *   2. an inclusive range, e.g. "4-6", meaning the three numbers 4,5,6
 *
 * @param str        NUL-terminated item to parse.
 * @param sub_list   Output array receiving the core numbers.
 * @param sub_ncores In: capacity of @p sub_list.  Out: number of entries
 *                   written (only updated on success, always >= 1).
 * @return 0 on success, -1 on bad arguments, a malformed item, an invalid
 *         range (negative or reversed bounds) or insufficient capacity.
 */
static int process_core_string_item(const char* str, int* sub_list, int* sub_ncores)
{
  if (str == NULL || sub_list == NULL || sub_ncores == NULL || *sub_ncores <= 0) {
    warningQuda("Bad argument");
    return -1;
  }

  if (strstr(str, "-") != NULL) {
    // a range
    int low_core, high_core;
    if (sscanf(str, "%d-%d", &low_core, &high_core) != 2) {
      warningQuda("Range scan failed");
      return -1;
    }

    // A reversed range would otherwise yield a non-positive count that the
    // caller adds to its running totals, corrupting its bookkeeping.
    if (low_core < 0 || high_core < low_core) {
      warningQuda("Invalid core range %d-%d", low_core, high_core);
      return -1;
    }

    // Both bounds are non-negative here, so the difference cannot overflow;
    // compare before adding 1 so that the count cannot overflow either.
    if (high_core - low_core >= *sub_ncores) {
      warningQuda("Not enough space in sub_list");
      return -1;
    }

    const int count = high_core - low_core + 1;
    for (int i = 0; i < count; i++) {
      sub_list[i] = i + low_core;
    }
    *sub_ncores = count;
  } else {
    // a number
    int core;
    if (sscanf(str, "%d", &core) != 1) {
      warningQuda("Wrong format for core number");
      return -1;
    }
    sub_list[0] = core;
    *sub_ncores = 1;
  }

  return 0;
}
static int getNumaAffinity(int my_gpu, int *cpu_cores, int* ncores) { FILE *nvidia_info, *pci_bus_info; size_t nbytes = 255; char *my_line; char nvidia_info_path[255], pci_bus_info_path[255]; char bus_info[255]; // the nvidia driver populates this path for each gpu sprintf(nvidia_info_path,"/proc/driver/nvidia/gpus/%d/information", my_gpu); nvidia_info= fopen(nvidia_info_path,"r"); if (nvidia_info == NULL){ return -1; } my_line= (char *) safe_malloc(nbytes +1); while (!feof(nvidia_info)){ if ( -1 == getline(&my_line, &nbytes, nvidia_info)){ break; }else{ // the first 7 char of the Bus Location will lead to the corresponding // path under /sys/class/pci_bus/ , cpulistaffinity showing cores on that // bus is located there if ( 1 == sscanf(my_line,"Bus Location: %s", bus_info )){ sprintf(pci_bus_info_path,"/sys/class/pci_bus/%.7s/cpulistaffinity", bus_info); } } } // open the cpulistaffinity file on the pci_bus for "my_gpu" pci_bus_info= fopen(pci_bus_info_path,"r"); if (pci_bus_info == NULL){ //printfQuda("Warning: opening file %s failed\n", pci_bus_info_path); host_free(my_line); fclose(nvidia_info); return -1; } while (!feof(pci_bus_info)){ if ( -1 == getline(&my_line, &nbytes, pci_bus_info)){ break; } else{ int rc = process_core_string_list(my_line, cpu_cores, ncores); if(rc < 0){ warningQuda("Failed to process the line \"%s\"", my_line); host_free(my_line); fclose(nvidia_info); return -1; } } } host_free(my_line); return 0; }
/**
 * Save the current verbosity on the verbosity stack and switch to a new one.
 *
 * Warns when the stack grows beyond 10 entries, since that suggests a
 * pushVerbosity() without a matching popVerbosity().
 *
 * @param verbosity Verbosity level to make current.
 */
void pushVerbosity(QudaVerbosity verbosity)
{
  vstack.push(getVerbosity());
  setVerbosity(verbosity);

  const size_t depth = vstack.size();
  if (depth <= 10) return;

  warningQuda("Verbosity stack contains %u elements. Is there a missing popVerbosity() somewhere?",
              static_cast<unsigned int>(depth));
}
/**
 * Pin the calling thread to one CPU core that is local to a GPU.
 *
 * The candidate cores come from getNumaAffinity(); the core at index
 * (devid % number_of_cores) is selected and applied with
 * sched_setaffinity() on the calling thread (pid 0).
 *
 * @param devid GPU device index.
 * @return 0 on success, 1 if no usable core could be determined for the
 *         device, -1 if sched_setaffinity() failed.
 */
int setNumaAffinity(int devid)
{
  const int max_cores = 128;
  int cpu_cores[max_cores];
  int ncores = max_cores;

  // Mark every slot invalid so an entry that getNumaAffinity() did not fill
  // can be recognised instead of being read uninitialised.
  for (int i = 0; i < max_cores; i++) cpu_cores[i] = -1;

  int rc = getNumaAffinity(devid, cpu_cores, &ncores);
  if (rc != 0) {
    warningQuda("Failed to determine NUMA affinity for device %d (possibly not applicable)", devid);
    return 1;
  }

  // Guard the modulo (division by zero) and the array index (negative devid
  // gives a negative remainder; ncores must not exceed the array).
  if (devid < 0 || ncores <= 0 || ncores > max_cores) {
    warningQuda("Failed to determine NUMA affinity for device %d (possibly not applicable)", devid);
    return 1;
  }

  const int which = devid % ncores;
  const int core = cpu_cores[which];

  // CPU_SET() with an index outside [0, CPU_SETSIZE) is not valid.
  if (core < 0 || core >= CPU_SETSIZE) {
    warningQuda("Failed to determine NUMA affinity for device %d (possibly not applicable)", devid);
    return 1;
  }

  printfQuda("Setting NUMA affinity for device %d to CPU core %d\n", devid, core);

  cpu_set_t cpu_set;
  CPU_ZERO(&cpu_set);
  CPU_SET(core, &cpu_set);

  rc = sched_setaffinity(0, sizeof(cpu_set_t), &cpu_set);
  if (rc != 0) {
    warningQuda("Failed to enforce NUMA affinity (probably due to lack of kernel support)");
    return -1;
  }

  return 0;
}
/**
 * Conjugate-gradient solve using a "sloppy" working precision with
 * reliable updates.
 *
 * The iteration runs on xSloppy/rSloppy (which alias x/r when
 * invParam.cuda_prec_sloppy equals x.Precision(), and are separately
 * allocated copies otherwise).  When the reliable-update condition fires,
 * the sloppy solution is folded into the accumulator y and the residual is
 * recomputed with the precise operator `mat`.
 *
 * On return invParam.secs, invParam.gflops and invParam.iter are filled in.
 *
 * @param x In: initial guess.  Out: solution.
 * @param b Source field.
 */
void CG::operator()(cudaColorSpinorField &x, cudaColorSpinorField &b) {
  int k=0;          // iteration counter
  int rUpdate = 0;  // number of reliable updates (incremented once for the initial residual below)

  cudaColorSpinorField r(b);
  ColorSpinorParam param(x);
  param.create = QUDA_ZERO_FIELD_CREATE;
  cudaColorSpinorField y(b, param);

  // Initial residual from the starting guess; y serves as the operator's
  // temporary here and is zeroed afterwards so it can act as the accumulator.
  // presumably xmyNormCuda(b, r) sets r = b - r and returns |r|^2 -- TODO confirm
  mat(r, x, y);
  zeroCuda(y);
  double r2 = xmyNormCuda(b, r);
  rUpdate ++;

  // Work fields in the sloppy precision.
  param.precision = invParam.cuda_prec_sloppy;
  cudaColorSpinorField Ap(x, param);
  cudaColorSpinorField tmp(x, param);
  cudaColorSpinorField tmp2(x, param); // only needed for clover and twisted mass

  // Alias the precise fields when no separate sloppy precision is requested,
  // otherwise allocate sloppy copies (freed at the end of the function).
  cudaColorSpinorField *x_sloppy, *r_sloppy;
  if (invParam.cuda_prec_sloppy == x.Precision()) {
    param.create = QUDA_REFERENCE_FIELD_CREATE;
    x_sloppy = &x;
    r_sloppy = &r;
  } else {
    param.create = QUDA_COPY_FIELD_CREATE;
    x_sloppy = new cudaColorSpinorField(x, param);
    r_sloppy = new cudaColorSpinorField(r, param);
  }

  cudaColorSpinorField &xSloppy = *x_sloppy;
  cudaColorSpinorField &rSloppy = *r_sloppy;

  cudaColorSpinorField p(rSloppy); // initial search direction = residual

  double r2_old;
  double src_norm = norm2(b);
  double stop = src_norm*invParam.tol*invParam.tol; // stopping condition of solver

  double alpha, beta;
  double pAp;

  // State for the reliable-update test.
  double rNorm = sqrt(r2);
  double r0Norm = rNorm;
  double maxrx = rNorm;
  double maxrr = rNorm;
  double delta = invParam.reliable_delta;

  if (invParam.verbosity >= QUDA_VERBOSE) printfQuda("CG: %d iterations, r2 = %e\n", k, r2);

  quda::blas_flops = 0;

  stopwatchStart();
  while (r2 > stop && k<invParam.maxiter) {

    matSloppy(Ap, p, tmp, tmp2); // tmp as tmp

    pAp = reDotProductCuda(p, Ap);
    alpha = r2 / pAp;
    r2_old = r2;
    // presumably rSloppy -= alpha*Ap, returning the new |rSloppy|^2 -- TODO confirm
    r2 = axpyNormCuda(-alpha, Ap, rSloppy);

    // reliable update conditions
    rNorm = sqrt(r2);
    if (rNorm > maxrx) maxrx = rNorm;
    if (rNorm > maxrr) maxrr = rNorm;
    int updateX = (rNorm < delta*r0Norm && r0Norm <= maxrx) ? 1 : 0;
    int updateR = ((rNorm < delta*maxrr && r0Norm <= maxrr) || updateX) ? 1 : 0;

    if (!(updateR || updateX)) {
      // Ordinary CG step in the sloppy precision.
      beta = r2 / r2_old;
      axpyZpbxCuda(alpha, p, xSloppy, rSloppy, beta);
    } else {
      // Reliable update: fold the sloppy solution into y, recompute the
      // residual with the precise operator and restart the sloppy accumulator.
      axpyCuda(alpha, p, xSloppy);
      if (x.Precision() != xSloppy.Precision()) copyCuda(x, xSloppy);

      xpyCuda(x, y); // swap these around?
      mat(r, y, x); // here we can use x as tmp
      r2 = xmyNormCuda(b, r);
      if (x.Precision() != rSloppy.Precision()) copyCuda(rSloppy, r);
      zeroCuda(xSloppy);

      rNorm = sqrt(r2);
      maxrr = rNorm;
      maxrx = rNorm;
      r0Norm = rNorm;
      rUpdate++;

      beta = r2 / r2_old;
      xpayCuda(rSloppy, beta, p);
    }

    k++;
    if (invParam.verbosity >= QUDA_VERBOSE) printfQuda("CG: %d iterations, r2 = %e\n", k, r2);
  }

  // Combine the last sloppy contribution with the accumulated solution.
  if (x.Precision() != xSloppy.Precision()) copyCuda(x, xSloppy);
  xpyCuda(y, x);

  invParam.secs = stopwatchReadSeconds();

  // NOTE(review): this also fires when convergence is reached exactly on the
  // last permitted iteration, because only k is tested.
  if (k==invParam.maxiter) warningQuda("Exceeded maximum iterations %d", invParam.maxiter);

  if (invParam.verbosity >= QUDA_SUMMARIZE) printfQuda("CG: Reliable updates = %d\n", rUpdate);

  double gflops = (quda::blas_flops + mat.flops() + matSloppy.flops())*1e-9;
  reduceDouble(gflops);

  // printfQuda("%f gflops\n", gflops / stopwatchReadSeconds());
  invParam.gflops = gflops;
  invParam.iter = k;

  quda::blas_flops = 0;

  // The true residual costs an extra operator application, so it is only
  // computed when it is going to be printed.
  if (invParam.verbosity >= QUDA_SUMMARIZE){
    mat(r, x, y);
    double true_res = xmyNormCuda(b, r);

    printfQuda("CG: Converged after %d iterations, relative residua: iterated = %e, true = %e\n", k, sqrt(r2/src_norm), sqrt(true_res / src_norm));
  }

  // Only free the sloppy fields if they were allocated above (not aliases).
  if (invParam.cuda_prec_sloppy != x.Precision()) {
    delete r_sloppy;
    delete x_sloppy;
  }

  return;
}
void qudaSetNumaConfig(char* filename) { static int already_set = 0; if(already_set){ return; } already_set =1; if(filename ==NULL){ errorQuda("numa config filename is NULL\n"); } if(strlen(filename) >= 128){ errorQuda("numa config filename too long\n"); } FILE* fd = fopen(filename, "r"); if (fd == NULL){ warningQuda("opening numa config file(%s) failed",filename ); return; } for(int i=0;i < MAX_GPU_NUM_PER_NODE; i++){ gpu_affinity[i] = -1; } char buf[1024]; while ( fgets(buf, 1024, fd) != NULL){ if (buf[0]== '\n' || buf[0] == '#'){ continue; } char* token[4]; token[0] = (char*)strtok(buf, " \t\n"); token[1] = (char*)strtok(NULL, " \t\n"); token[2] = (char*)strtok(NULL, " \t\n"); token[3] = (char*)strtok(NULL, " \t\n"); if(strcmp(token[0], "affinity") != 0){ warningQuda("Invalid format for the numa config file\n"); fclose(fd); return ; } if (token[1] == NULL || token[2] == NULL){ warningQuda("invalid entry for affinity\n"); fclose(fd); return; } int gpunum = atoi(token[1]); int nodenum = atoi(token[2]); if(gpunum < 0 ||nodenum < 0){ warningQuda("Invalid gpunum(%d) or nodenum(%d)\n", gpunum, nodenum); fclose(fd); return; } gpu_affinity[gpunum] = nodenum; } fclose(fd); numa_config_set = 1; return; }
void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param) { if (!h_clover && !h_clovinv) { errorQuda("loadCloverQuda() called with neither clover term nor inverse"); } if (inv_param->clover_cpu_prec == QUDA_HALF_PRECISION) { errorQuda("Half precision not supported on CPU"); } if (gaugePrecise == NULL) { errorQuda("Gauge field must be loaded before clover"); } if (inv_param->dslash_type != QUDA_CLOVER_WILSON_DSLASH) { errorQuda("Wrong dslash_type in loadCloverQuda()"); } // determines whether operator is preconditioned when calling invertQuda() bool pc_solve = (inv_param->solve_type == QUDA_DIRECT_PC_SOLVE || inv_param->solve_type == QUDA_NORMEQ_PC_SOLVE); // determines whether operator is preconditioned when calling MatQuda() or MatDagMatQuda() bool pc_solution = (inv_param->solution_type == QUDA_MATPC_SOLUTION || inv_param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION); bool asymmetric = (inv_param->matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC || inv_param->matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC); // We issue a warning only when it seems likely that the user is screwing up: // inverted clover term is required when applying preconditioned operator if (!h_clovinv && pc_solve && pc_solution) { warningQuda("Inverted clover term not loaded"); } // uninverted clover term is required when applying unpreconditioned operator, // but note that dslashQuda() is always preconditioned if (!h_clover && !pc_solve && !pc_solution) { //warningQuda("Uninverted clover term not loaded"); } // uninverted clover term is also required for "asymmetric" preconditioning if (!h_clover && pc_solve && pc_solution && asymmetric) { warningQuda("Uninverted clover term not loaded"); } CloverFieldParam clover_param; clover_param.nDim = 4; for (int i=0; i<4; i++) clover_param.x[i] = gaugePrecise->X()[i]; clover_param.precision = inv_param->clover_cuda_prec; clover_param.pad = inv_param->cl_pad; cloverPrecise = new cudaCloverField(h_clover, h_clovinv, 
inv_param->clover_cpu_prec, inv_param->clover_order, clover_param); inv_param->cloverGiB = cloverPrecise->GBytes(); if (inv_param->clover_cuda_prec != inv_param->clover_cuda_prec_sloppy) { clover_param.precision = inv_param->clover_cuda_prec_sloppy; cloverSloppy = new cudaCloverField(h_clover, h_clovinv, inv_param->clover_cpu_prec, inv_param->clover_order, clover_param); inv_param->cloverGiB += cloverSloppy->GBytes(); } else { cloverSloppy = cloverPrecise; } endInvertQuda(); // need to delete any persistant dirac operators }
/**
 * CG3 solver: a conjugate-gradient variant that builds each new iterate from
 * the two previous iterates (x, x_prev) and residuals (r, r_prev).
 *
 * On return param.true_res holds the relative true residual
 * sqrt(|b - M x|^2 / |b|^2).
 *
 * @param x In: initial guess.  Out: solution.
 * @param b Source field.
 */
void CG3::operator()(cudaColorSpinorField &x, cudaColorSpinorField &b) {

  // Check to see that we're not trying to invert on a zero-field source
  const double b2 = norm2(b);
  if(b2 == 0){
    // NOTE(review): no matching TPSTART(QUDA_PROFILE_INIT) is visible in this
    // function -- confirm this stop is balanced by the caller.
    profile.TPSTOP(QUDA_PROFILE_INIT);
    printfQuda("Warning: inverting on zero-field source\n");
    x=b;
    param.true_res = 0.0;
    param.true_res_hq = 0.0;
    return;
  }

  // Zero-initialised work fields shaped like b.
  ColorSpinorParam csParam(x);
  csParam.create = QUDA_ZERO_FIELD_CREATE;
  cudaColorSpinorField x_prev(b, csParam);
  cudaColorSpinorField r_prev(b, csParam);
  cudaColorSpinorField temp(b, csParam);

  cudaColorSpinorField r(b);
  cudaColorSpinorField w(b);
  mat(r, x, temp); // r = Mx
  double r2 = xmyNormCuda(b,r); // r = b - Mx
  PrintStats("CG3", 0, r2, b2, 0.0);

  double stop = stopping(param.tol, b2, param.residual_type);
  if(convergence(r2, 0.0, stop, 0.0)) return; // initial guess is already good enough

  // First iteration
  mat(w, r, temp);
  double rAr = reDotProductCuda(r,w);
  double rho = 1.0;
  double gamma_prev = 0.0;
  double gamma = r2/rAr;

  cudaColorSpinorField x_new(x);
  cudaColorSpinorField r_new(r);
  axpyCuda(gamma, r, x_new); // x_new += gamma*r
  axpyCuda(-gamma, w, r_new); // r_new -= gamma*w
  // end of first iteration

  // axpbyCuda(a,b,x,y) => y = a*x + b*y

  int k = 1; // First iteration performed above

  double r2_prev;
  while(!convergence(r2, 0.0, stop, 0.0) && k<param.maxiter){
    // Shift the history: current -> previous, new -> current.
    x_prev = x; x = x_new;
    r_prev = r; r = r_new;
    mat(w, r, temp);
    rAr = reDotProductCuda(r,w);
    r2_prev = r2;
    r2 = norm2(r);

    // Need to rearrange this!
    PrintStats("CG3", k, r2, b2, 0.0);

    gamma_prev = gamma;
    gamma = r2/rAr;
    rho = 1.0/(1. - (gamma/gamma_prev)*(r2/r2_prev)*(1.0/rho));

    // x_new = rho*(x + gamma*r) + (1 - rho)*x_prev
    x_new = x;
    axCuda(rho,x_new);
    axpyCuda(rho*gamma,r,x_new);
    axpyCuda((1. - rho),x_prev,x_new);

    // r_new = rho*(r - gamma*w) + (1 - rho)*r_prev
    r_new = r;
    axCuda(rho,r_new);
    axpyCuda(-rho*gamma,w,r_new);
    axpyCuda((1.-rho),r_prev,r_new);

    // NOTE(review): printed on every iteration regardless of verbosity --
    // looks like leftover debugging output; confirm it is intended.
    double rr_old = reDotProductCuda(r_new,r);
    printfQuda("rr_old = %1.14lf\n", rr_old);
    k++;
  }

  if(k == param.maxiter) warningQuda("Exceeded maximum iterations %d", param.maxiter);

  // compute the true residual
  mat(r, x, temp);
  param.true_res = sqrt(xmyNormCuda(b, r)/b2);

  PrintSummary("CG3", k, r2, b2);

  return;
}
void MultiShiftCG::operator()(cudaColorSpinorField **x, cudaColorSpinorField &b) { int num_offset = invParam.num_offset; double *offset = invParam.offset; double *residue_sq = invParam.tol_offset; if (num_offset == 0) return; int *finished = new int [num_offset]; double *zeta_i = new double[num_offset]; double *zeta_im1 = new double[num_offset]; double *zeta_ip1 = new double[num_offset]; double *beta_i = new double[num_offset]; double *beta_im1 = new double[num_offset]; double *alpha = new double[num_offset]; int i, j; int j_low = 0; int num_offset_now = num_offset; for (i=0; i<num_offset; i++) { finished[i] = 0; zeta_im1[i] = zeta_i[i] = 1.0; beta_im1[i] = -1.0; alpha[i] = 0.0; } //double msq_x4 = offset[0]; cudaColorSpinorField *r = new cudaColorSpinorField(b); cudaColorSpinorField **x_sloppy = new cudaColorSpinorField*[num_offset], *r_sloppy; ColorSpinorParam param; param.create = QUDA_ZERO_FIELD_CREATE; param.precision = invParam.cuda_prec_sloppy; if (invParam.cuda_prec_sloppy == x[0]->Precision()) { for (i=0; i<num_offset; i++){ x_sloppy[i] = x[i]; zeroCuda(*x_sloppy[i]); } r_sloppy = r; } else { for (i=0; i<num_offset; i++) { x_sloppy[i] = new cudaColorSpinorField(*x[i], param); } param.create = QUDA_COPY_FIELD_CREATE; r_sloppy = new cudaColorSpinorField(*r, param); } cudaColorSpinorField **p = new cudaColorSpinorField*[num_offset]; for(i=0;i < num_offset;i++){ p[i]= new cudaColorSpinorField(*r_sloppy); } param.create = QUDA_ZERO_FIELD_CREATE; param.precision = invParam.cuda_prec_sloppy; cudaColorSpinorField* Ap = new cudaColorSpinorField(*r_sloppy, param); double b2 = 0.0; b2 = normCuda(b); double r2 = b2; double r2_old; double stop = r2*invParam.tol*invParam.tol; // stopping condition of solver double pAp; int k = 0; stopwatchStart(); while (r2 > stop && k < invParam.maxiter) { //dslashCuda_st(tmp_sloppy, fatlinkSloppy, longlinkSloppy, p[0], 1 - oddBit, 0); //dslashAxpyCuda(Ap, fatlinkSloppy, longlinkSloppy, tmp_sloppy, oddBit, 0, p[0], msq_x4); 
matSloppy(*Ap, *p[0]); if (invParam.dslash_type != QUDA_ASQTAD_DSLASH){ axpyCuda(offset[0], *p[0], *Ap); } pAp = reDotProductCuda(*p[0], *Ap); beta_i[0] = r2 / pAp; zeta_ip1[0] = 1.0; for (j=1; j<num_offset_now; j++) { zeta_ip1[j] = zeta_i[j] * zeta_im1[j] * beta_im1[j_low]; double c1 = beta_i[j_low] * alpha[j_low] * (zeta_im1[j]-zeta_i[j]); double c2 = zeta_im1[j] * beta_im1[j_low] * (1.0+(offset[j]-offset[0])*beta_i[j_low]); /*THISBLOWSUP zeta_ip1[j] /= c1 + c2; beta_i[j] = beta_i[j_low] * zeta_ip1[j] / zeta_i[j]; */ /*TRYTHIS*/ if( (c1+c2) != 0.0 ) zeta_ip1[j] /= (c1 + c2); else { zeta_ip1[j] = 0.0; finished[j] = 1; } if( zeta_i[j] != 0.0) { beta_i[j] = beta_i[j_low] * zeta_ip1[j] / zeta_i[j]; } else { zeta_ip1[j] = 0.0; beta_i[j] = 0.0; finished[j] = 1; if (invParam.verbosity >= QUDA_VERBOSE) printfQuda("SETTING A ZERO, j=%d, num_offset_now=%d\n",j,num_offset_now); //if(j==num_offset_now-1)node0_PRINTF("REDUCING OFFSET\n"); if(j==num_offset_now-1) num_offset_now--; // don't work any more on finished solutions // this only works if largest offsets are last, otherwise // just wastes time multiplying by zero } } r2_old = r2; r2 = axpyNormCuda(-beta_i[j_low], *Ap, *r_sloppy); alpha[0] = r2 / r2_old; for (j=1; j<num_offset_now; j++) { /*THISBLOWSUP alpha[j] = alpha[j_low] * zeta_ip1[j] * beta_i[j] / (zeta_i[j] * beta_i[j_low]); */ /*TRYTHIS*/ if( zeta_i[j] * beta_i[j_low] != 0.0) alpha[j] = alpha[j_low] * zeta_ip1[j] * beta_i[j] / (zeta_i[j] * beta_i[j_low]); else { alpha[j] = 0.0; finished[j] = 1; } } axpyZpbxCuda(beta_i[0], *p[0], *x_sloppy[0], *r_sloppy, alpha[0]); for (j=1; j<num_offset_now; j++) { axpyBzpcxCuda(beta_i[j], *p[j], *x_sloppy[j], zeta_ip1[j], *r_sloppy, alpha[j]); } for (j=0; j<num_offset_now; j++) { beta_im1[j] = beta_i[j]; zeta_im1[j] = zeta_i[j]; zeta_i[j] = zeta_ip1[j]; } k++; if (invParam.verbosity >= QUDA_VERBOSE){ printfQuda("Multimass CG: %d iterations, r2 = %e\n", k, r2); } } if (x[0]->Precision() != x_sloppy[0]->Precision()) { for(i=0;i < 
num_offset; i++){ copyCuda(*x[i], *x_sloppy[i]); } } *residue_sq = r2; invParam.secs = stopwatchReadSeconds(); if (k==invParam.maxiter) { warningQuda("Exceeded maximum iterations %d\n", invParam.maxiter); } double gflops = (quda::blas_flops + mat.flops() + matSloppy.flops())*1e-9; reduceDouble(gflops); invParam.gflops = gflops; invParam.iter = k; // Calculate the true residual of the system with the smallest shift mat(*r, *x[0]); axpyCuda(offset[0],*x[0], *r); // Offset it. double true_res = xmyNormCuda(b, *r); if (invParam.verbosity >= QUDA_SUMMARIZE){ printfQuda("MultiShift CG: Converged after %d iterations, r2 = %e, relative true_r2 = %e\n", k,r2, (true_res / b2)); } if (invParam.verbosity >= QUDA_VERBOSE){ printfQuda("MultiShift CG: Converged after %d iterations\n", k); printfQuda(" shift=0 resid_rel=%e\n", sqrt(true_res/b2)); for(int i=1; i < num_offset; i++) { mat(*r, *x[i]); axpyCuda(offset[i],*x[i], *r); // Offset it. true_res = xmyNormCuda(b, *r); printfQuda(" shift=%d resid_rel=%e\n",i, sqrt(true_res/b2)); } } delete r; for(i=0;i < num_offset; i++){ delete p[i]; } delete p; delete Ap; if (invParam.cuda_prec_sloppy != x[0]->Precision()) { for(i=0;i < num_offset;i++){ delete x_sloppy[i]; } delete r_sloppy; } delete x_sloppy; delete []finished; delete []zeta_i; delete []zeta_im1; delete []zeta_ip1; delete []beta_i; delete []beta_im1; delete []alpha; }
void MultiShiftCG::operator()(cudaColorSpinorField **x, cudaColorSpinorField &b) { profile.Start(QUDA_PROFILE_INIT); int num_offset = param.num_offset; double *offset = param.offset; if (num_offset == 0) return; const double b2 = normCuda(b); // Check to see that we're not trying to invert on a zero-field source if(b2 == 0){ profile.Stop(QUDA_PROFILE_INIT); printfQuda("Warning: inverting on zero-field source\n"); for(int i=0; i<num_offset; ++i){ *(x[i]) = b; param.true_res_offset[i] = 0.0; param.true_res_hq_offset[i] = 0.0; } return; } double *zeta = new double[num_offset]; double *zeta_old = new double[num_offset]; double *alpha = new double[num_offset]; double *beta = new double[num_offset]; int j_low = 0; int num_offset_now = num_offset; for (int i=0; i<num_offset; i++) { zeta[i] = zeta_old[i] = 1.0; beta[i] = 0.0; alpha[i] = 1.0; } // flag whether we will be using reliable updates or not bool reliable = false; for (int j=0; j<num_offset; j++) if (param.tol_offset[j] < param.delta) reliable = true; cudaColorSpinorField *r = new cudaColorSpinorField(b); cudaColorSpinorField **y = reliable ? 
new cudaColorSpinorField*[num_offset] : NULL; ColorSpinorParam csParam(b); csParam.create = QUDA_ZERO_FIELD_CREATE; if (reliable) for (int i=0; i<num_offset; i++) y[i] = new cudaColorSpinorField(*r, csParam); csParam.setPrecision(param.precision_sloppy); cudaColorSpinorField *r_sloppy; if (param.precision_sloppy == x[0]->Precision()) { r_sloppy = r; } else { csParam.create = QUDA_COPY_FIELD_CREATE; r_sloppy = new cudaColorSpinorField(*r, csParam); } cudaColorSpinorField **x_sloppy = new cudaColorSpinorField*[num_offset]; if (param.precision_sloppy == x[0]->Precision() || !param.use_sloppy_partial_accumulator) { for (int i=0; i<num_offset; i++) x_sloppy[i] = x[i]; } else { csParam.create = QUDA_ZERO_FIELD_CREATE; for (int i=0; i<num_offset; i++) x_sloppy[i] = new cudaColorSpinorField(*x[i], csParam); } cudaColorSpinorField **p = new cudaColorSpinorField*[num_offset]; for (int i=0; i<num_offset; i++) p[i]= new cudaColorSpinorField(*r_sloppy); csParam.create = QUDA_ZERO_FIELD_CREATE; cudaColorSpinorField* Ap = new cudaColorSpinorField(*r_sloppy, csParam); cudaColorSpinorField tmp1(*Ap, csParam); // tmp2 only needed for multi-gpu Wilson-like kernels cudaColorSpinorField *tmp2_p = !mat.isStaggered() ? new cudaColorSpinorField(*Ap, csParam) : &tmp1; cudaColorSpinorField &tmp2 = *tmp2_p; // additional high-precision temporary if Wilson and mixed-precision csParam.setPrecision(param.precision); cudaColorSpinorField *tmp3_p = (param.precision != param.precision_sloppy && !mat.isStaggered()) ? 
new cudaColorSpinorField(*r, csParam) : &tmp1; cudaColorSpinorField &tmp3 = *tmp3_p; profile.Stop(QUDA_PROFILE_INIT); profile.Start(QUDA_PROFILE_PREAMBLE); // stopping condition of each shift double stop[QUDA_MAX_MULTI_SHIFT]; double r2[QUDA_MAX_MULTI_SHIFT]; for (int i=0; i<num_offset; i++) { r2[i] = b2; stop[i] = Solver::stopping(param.tol_offset[i], b2, param.residual_type); } double r2_old; double pAp; double rNorm[QUDA_MAX_MULTI_SHIFT]; double r0Norm[QUDA_MAX_MULTI_SHIFT]; double maxrx[QUDA_MAX_MULTI_SHIFT]; double maxrr[QUDA_MAX_MULTI_SHIFT]; for (int i=0; i<num_offset; i++) { rNorm[i] = sqrt(r2[i]); r0Norm[i] = rNorm[i]; maxrx[i] = rNorm[i]; maxrr[i] = rNorm[i]; } double delta = param.delta; // this parameter determines how many consective reliable update // reisudal increases we tolerate before terminating the solver, // i.e., how long do we want to keep trying to converge const int maxResIncrease = param.max_res_increase; // check if we reached the limit of our tolerance const int maxResIncreaseTotal = param.max_res_increase_total; int resIncrease = 0; int resIncreaseTotal[QUDA_MAX_MULTI_SHIFT]; for (int i=0; i<num_offset; i++) { resIncreaseTotal[i]=0; } int k = 0; int rUpdate = 0; quda::blas_flops = 0; profile.Stop(QUDA_PROFILE_PREAMBLE); profile.Start(QUDA_PROFILE_COMPUTE); if (getVerbosity() >= QUDA_VERBOSE) printfQuda("MultiShift CG: %d iterations, <r,r> = %e, |r|/|b| = %e\n", k, r2[0], sqrt(r2[0]/b2)); while (r2[0] > stop[0] && k < param.maxiter) { matSloppy(*Ap, *p[0], tmp1, tmp2); // FIXME - this should be curried into the Dirac operator if (r->Nspin()==4) axpyCuda(offset[0], *p[0], *Ap); pAp = reDotProductCuda(*p[0], *Ap); // compute zeta and alpha updateAlphaZeta(alpha, zeta, zeta_old, r2, beta, pAp, offset, num_offset_now, j_low); r2_old = r2[0]; Complex cg_norm = axpyCGNormCuda(-alpha[j_low], *Ap, *r_sloppy); r2[0] = real(cg_norm); double zn = imag(cg_norm); // reliable update conditions rNorm[0] = sqrt(r2[0]); for (int j=1; j<num_offset_now; 
j++) rNorm[j] = rNorm[0] * zeta[j]; int updateX=0, updateR=0; int reliable_shift = -1; // this is the shift that sets the reliable_shift for (int j=num_offset_now-1; j>=0; j--) { if (rNorm[j] > maxrx[j]) maxrx[j] = rNorm[j]; if (rNorm[j] > maxrr[j]) maxrr[j] = rNorm[j]; updateX = (rNorm[j] < delta*r0Norm[j] && r0Norm[j] <= maxrx[j]) ? 1 : updateX; updateR = ((rNorm[j] < delta*maxrr[j] && r0Norm[j] <= maxrr[j]) || updateX) ? 1 : updateR; if ((updateX || updateR) && reliable_shift == -1) reliable_shift = j; } if ( !(updateR || updateX) || !reliable) { //beta[0] = r2[0] / r2_old; beta[0] = zn / r2_old; // update p[0] and x[0] axpyZpbxCuda(alpha[0], *p[0], *x_sloppy[0], *r_sloppy, beta[0]); for (int j=1; j<num_offset_now; j++) { beta[j] = beta[j_low] * zeta[j] * alpha[j] / (zeta_old[j] * alpha[j_low]); // update p[i] and x[i] axpyBzpcxCuda(alpha[j], *p[j], *x_sloppy[j], zeta[j], *r_sloppy, beta[j]); } } else { for (int j=0; j<num_offset_now; j++) { axpyCuda(alpha[j], *p[j], *x_sloppy[j]); copyCuda(*x[j], *x_sloppy[j]); xpyCuda(*x[j], *y[j]); } mat(*r, *y[0], *x[0], tmp3); // here we can use x as tmp if (r->Nspin()==4) axpyCuda(offset[0], *y[0], *r); r2[0] = xmyNormCuda(b, *r); for (int j=1; j<num_offset_now; j++) r2[j] = zeta[j] * zeta[j] * r2[0]; for (int j=0; j<num_offset_now; j++) zeroCuda(*x_sloppy[j]); copyCuda(*r_sloppy, *r); // break-out check if we have reached the limit of the precision if (sqrt(r2[reliable_shift]) > r0Norm[reliable_shift]) { // reuse r0Norm for this resIncrease++; resIncreaseTotal[reliable_shift]++; warningQuda("MultiShiftCG: Shift %d, updated residual %e is greater than previous residual %e (total #inc %i)", reliable_shift, sqrt(r2[reliable_shift]), r0Norm[reliable_shift], resIncreaseTotal[reliable_shift]); if (resIncrease > maxResIncrease or resIncreaseTotal[reliable_shift] > maxResIncreaseTotal) break; // check if we reached the limit of our tolerancebreak; } else { resIncrease = 0; } // explicitly restore the orthogonality of the gradient 
vector for (int j=0; j<num_offset_now; j++) { double rp = reDotProductCuda(*r_sloppy, *p[j]) / (r2[0]); axpyCuda(-rp, *r_sloppy, *p[j]); } // update beta and p beta[0] = r2[0] / r2_old; xpayCuda(*r_sloppy, beta[0], *p[0]); for (int j=1; j<num_offset_now; j++) { beta[j] = beta[j_low] * zeta[j] * alpha[j] / (zeta_old[j] * alpha[j_low]); axpbyCuda(zeta[j], *r_sloppy, beta[j], *p[j]); } // update reliable update parameters for the system that triggered the update int m = reliable_shift; rNorm[m] = sqrt(r2[0]) * zeta[m]; maxrr[m] = rNorm[m]; maxrx[m] = rNorm[m]; r0Norm[m] = rNorm[m]; rUpdate++; } // now we can check if any of the shifts have converged and remove them for (int j=1; j<num_offset_now; j++) { if (zeta[j] == 0.0) { num_offset_now--; if (getVerbosity() >= QUDA_VERBOSE) printfQuda("MultiShift CG: Shift %d converged after %d iterations\n", j, k + 1); } else { r2[j] = zeta[j] * zeta[j] * r2[0]; if (r2[j] < stop[j]) { num_offset_now--; if (getVerbosity() >= QUDA_VERBOSE) printfQuda("MultiShift CG: Shift %d converged after %d iterations\n", j, k+1); } } } k++; if (getVerbosity() >= QUDA_VERBOSE) printfQuda("MultiShift CG: %d iterations, <r,r> = %e, |r|/|b| = %e\n", k, r2[0], sqrt(r2[0]/b2)); } for (int i=0; i<num_offset; i++) { copyCuda(*x[i], *x_sloppy[i]); if (reliable) xpyCuda(*y[i], *x[i]); } profile.Stop(QUDA_PROFILE_COMPUTE); profile.Start(QUDA_PROFILE_EPILOGUE); if (getVerbosity() >= QUDA_VERBOSE) printfQuda("MultiShift CG: Reliable updates = %d\n", rUpdate); if (k==param.maxiter) warningQuda("Exceeded maximum iterations %d\n", param.maxiter); param.secs = profile.Last(QUDA_PROFILE_COMPUTE); double gflops = (quda::blas_flops + mat.flops() + matSloppy.flops())*1e-9; reduceDouble(gflops); param.gflops = gflops; param.iter += k; for(int i=0; i < num_offset; i++) { mat(*r, *x[i]); if (r->Nspin()==4) { axpyCuda(offset[i], *x[i], *r); // Offset it. } else if (i!=0) { axpyCuda(offset[i]-offset[0], *x[i], *r); // Offset it. 
} double true_res = xmyNormCuda(b, *r); param.true_res_offset[i] = sqrt(true_res/b2); #if (__COMPUTE_CAPABILITY__ >= 200) param.true_res_hq_offset[i] = sqrt(HeavyQuarkResidualNormCuda(*x[i], *r).z); #else param.true_res_hq_offset[i] = 0.0; #endif } if (getVerbosity() >= QUDA_SUMMARIZE){ printfQuda("MultiShift CG: Converged after %d iterations\n", k); for(int i=0; i < num_offset; i++) { printfQuda(" shift=%d, relative residual: iterated = %e, true = %e\n", i, sqrt(r2[i]/b2), param.true_res_offset[i]); } } // reset the flops counters quda::blas_flops = 0; mat.flops(); matSloppy.flops(); profile.Stop(QUDA_PROFILE_EPILOGUE); profile.Start(QUDA_PROFILE_FREE); if (&tmp3 != &tmp1) delete tmp3_p; if (&tmp2 != &tmp1) delete tmp2_p; if (r_sloppy->Precision() != r->Precision()) delete r_sloppy; for (int i=0; i<num_offset; i++) if (x_sloppy[i]->Precision() != x[i]->Precision()) delete x_sloppy[i]; delete []x_sloppy; delete r; for (int i=0; i<num_offset; i++) delete p[i]; delete []p; if (reliable) { for (int i=0; i<num_offset; i++) delete y[i]; delete []y; } delete Ap; delete []zeta_old; delete []zeta; delete []alpha; delete []beta; profile.Stop(QUDA_PROFILE_FREE); return; }
/**
 * Conjugate-gradient solve with a (possibly lower-precision) "sloppy"
 * inner iteration and periodic reliable updates of the residual.
 *
 * @param x  On entry the initial guess (it is passed through mat() to
 *           build the starting residual); on exit the accumulated solution.
 * @param b  Source field.  If its norm is exactly zero the routine copies
 *           b into x, zeroes param.true_res / param.true_res_hq and returns.
 *
 * On completion the routine fills param.secs, param.gflops,
 * param.true_res and param.true_res_hq, and adds the iteration count to
 * param.iter.
 */
void CG::operator()(cudaColorSpinorField &x, cudaColorSpinorField &b) {
  profile.Start(QUDA_PROFILE_INIT);

  // Check to see that we're not trying to invert on a zero-field source
  const double b2 = norm2(b);
  if (b2 == 0) {
    profile.Stop(QUDA_PROFILE_INIT);
    printfQuda("Warning: inverting on zero-field source\n");
    x = b;
    param.true_res = 0.0;
    param.true_res_hq = 0.0;
    return;
  }

  cudaColorSpinorField r(b);

  ColorSpinorParam csParam(x);
  csParam.create = QUDA_ZERO_FIELD_CREATE;
  cudaColorSpinorField y(b, csParam);

  // initial residual from the initial guess (y is handed to mat as workspace)
  mat(r, x, y);
  double r2 = xmyNormCuda(b, r);

  csParam.setPrecision(param.precision_sloppy);
  cudaColorSpinorField Ap(x, csParam);
  cudaColorSpinorField tmp(x, csParam);

  // a second temporary is only allocated for non-staggered operators
  // (original note: tmp only needed for multi-gpu Wilson-like kernels)
  cudaColorSpinorField *tmp2_p = &tmp;
  if (mat.Type() != typeid(DiracStaggeredPC).name() &&
      mat.Type() != typeid(DiracStaggered).name()) {
    tmp2_p = new cudaColorSpinorField(x, csParam);
  }
  cudaColorSpinorField &tmp2 = *tmp2_p;

  // sloppy fields alias the full-precision ones when the precisions agree
  cudaColorSpinorField *x_sloppy, *r_sloppy;
  if (param.precision_sloppy == x.Precision()) {
    csParam.create = QUDA_REFERENCE_FIELD_CREATE;
    x_sloppy = &x;
    r_sloppy = &r;
  } else {
    csParam.create = QUDA_COPY_FIELD_CREATE;
    x_sloppy = new cudaColorSpinorField(x, csParam);
    r_sloppy = new cudaColorSpinorField(r, csParam);
  }

  cudaColorSpinorField &xSloppy = *x_sloppy;
  cudaColorSpinorField &rSloppy = *r_sloppy;
  cudaColorSpinorField p(rSloppy);

  if (&x != &xSloppy) {
    // keep the initial guess in y and accumulate corrections in xSloppy
    copyCuda(y, x);
    zeroCuda(xSloppy);
  } else {
    zeroCuda(y);
  }

  const bool use_heavy_quark_res =
    (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) ? true : false;

  profile.Stop(QUDA_PROFILE_INIT);
  profile.Start(QUDA_PROFILE_PREAMBLE);

  double r2_old;
  double stop = b2*param.tol*param.tol; // stopping condition of solver

  double heavy_quark_res = 0.0; // heavy quark residual
  if (use_heavy_quark_res) heavy_quark_res = sqrt(HeavyQuarkResidualNormCuda(x,r).z);
  int heavy_quark_check = 10; // how often to check the heavy quark residual

  double alpha = 0.0, beta = 0.0;
  double pAp;
  int rUpdate = 0;

  double rNorm = sqrt(r2);
  double r0Norm = rNorm;
  double maxrx = rNorm;
  double maxrr = rNorm;
  double delta = param.delta;

  // this parameter determines how many consecutive reliable update
  // residual increases we tolerate before terminating the solver,
  // i.e., how long do we want to keep trying to converge
  int maxResIncrease = 0; // 0 means we have no tolerance

  // Count of consecutive reliable updates that increased the residual.
  // Deliberately a per-call local (not a function-level static) so that one
  // solve cannot leak state into the next.
  int resIncrease = 0;

  profile.Stop(QUDA_PROFILE_PREAMBLE);
  profile.Start(QUDA_PROFILE_COMPUTE);
  blas_flops = 0;

  int k = 0;

  PrintStats("CG", k, r2, b2, heavy_quark_res);

  int steps_since_reliable = 1;

  while ( !convergence(r2, heavy_quark_res, stop, param.tol_hq) && k < param.maxiter) {
    matSloppy(Ap, p, tmp, tmp2); // tmp as tmp

    double sigma;
    bool breakdown = false;

    if (param.pipeline) {
      double3 triplet = tripleCGReductionCuda(rSloppy, Ap, p);
      r2 = triplet.x;
      double Ap2 = triplet.y;
      pAp = triplet.z;
      r2_old = r2;

      alpha = r2 / pAp;
      sigma = alpha*(alpha * Ap2 - pAp);
      if (sigma < 0.0 || steps_since_reliable==0) { // sigma condition has broken down
        r2 = axpyNormCuda(-alpha, Ap, rSloppy);
        sigma = r2;
        breakdown = true;
      }

      r2 = sigma;
    } else {
      r2_old = r2;
      pAp = reDotProductCuda(p, Ap);
      alpha = r2 / pAp;

      // here we are deploying the alternative beta computation
      Complex cg_norm = axpyCGNormCuda(-alpha, Ap, rSloppy);
      r2 = real(cg_norm); // (r_new, r_new)
      sigma = imag(cg_norm) >= 0.0 ? imag(cg_norm) : r2; // use r2 if (r_k+1, r_k+1-r_k) breaks
    }

    // reliable update conditions
    rNorm = sqrt(r2);
    if (rNorm > maxrx) maxrx = rNorm;
    if (rNorm > maxrr) maxrr = rNorm;
    int updateX = (rNorm < delta*r0Norm && r0Norm <= maxrx) ? 1 : 0;
    int updateR = ((rNorm < delta*maxrr && r0Norm <= maxrr) || updateX) ? 1 : 0;

    // force a reliable update if we are within target tolerance (only if doing reliable updates)
    if ( convergence(r2, heavy_quark_res, stop, param.tol_hq) && delta >= param.tol) updateX = 1;

    if ( !(updateR || updateX)) {
      // ordinary iteration
      beta = sigma / r2_old; // use the alternative beta computation (instead of r2 / r2_old)

      if (param.pipeline && !breakdown) tripleCGUpdateCuda(alpha, beta, Ap, rSloppy, xSloppy, p);
      else axpyZpbxCuda(alpha, p, xSloppy, rSloppy, beta);

      if (use_heavy_quark_res && k%heavy_quark_check==0) {
        copyCuda(tmp, y);
        heavy_quark_res = sqrt(xpyHeavyQuarkResidualNormCuda(xSloppy, tmp, rSloppy).z);
      }

      steps_since_reliable++;
    } else {
      // reliable update: fold the sloppy correction into y and recompute
      // the residual with the full-precision operator
      axpyCuda(alpha, p, xSloppy);
      if (x.Precision() != xSloppy.Precision()) copyCuda(x, xSloppy);

      xpyCuda(x, y); // swap these around?
      mat(r, y, x);  // here we can use x as tmp
      r2 = xmyNormCuda(b, r);

      if (x.Precision() != rSloppy.Precision()) copyCuda(rSloppy, r);
      zeroCuda(xSloppy);

      // break-out check if we have reached the limit of the precision
      if (sqrt(r2) > r0Norm && updateX) { // reuse r0Norm for this
        warningQuda("CG: new reliable residual norm %e is greater than previous reliable residual norm %e", sqrt(r2), r0Norm);
        if (++resIncrease > maxResIncrease) {
          // count the aborted iteration and its reliable update exactly once;
          // when the increase is tolerated the normal path below counts them
          k++;
          rUpdate++;
          break;
        }
      } else {
        resIncrease = 0;
      }

      rNorm = sqrt(r2);
      maxrr = rNorm;
      maxrx = rNorm;
      r0Norm = rNorm;
      rUpdate++;

      // explicitly restore the orthogonality of the gradient vector
      double rp = reDotProductCuda(rSloppy, p) / (r2);
      axpyCuda(-rp, rSloppy, p);

      beta = r2 / r2_old;
      xpayCuda(rSloppy, beta, p);

      if (use_heavy_quark_res) heavy_quark_res = sqrt(HeavyQuarkResidualNormCuda(y,r).z);

      steps_since_reliable = 0;
    }

    k++;

    PrintStats("CG", k, r2, b2, heavy_quark_res);
  }

  // assemble the final solution: x = (last sloppy correction) + y
  if (x.Precision() != xSloppy.Precision()) copyCuda(x, xSloppy);
  xpyCuda(y, x);

  profile.Stop(QUDA_PROFILE_COMPUTE);
  profile.Start(QUDA_PROFILE_EPILOGUE);

  param.secs = profile.Last(QUDA_PROFILE_COMPUTE);
  double gflops = (quda::blas_flops + mat.flops() + matSloppy.flops())*1e-9;
  reduceDouble(gflops);
  param.gflops = gflops;
  param.iter += k;

  if (k==param.maxiter)
    warningQuda("Exceeded maximum iterations %d", param.maxiter);

  if (getVerbosity() >= QUDA_VERBOSE)
    printfQuda("CG: Reliable updates = %d\n", rUpdate);

  // compute the true residuals
  mat(r, x, y);
  param.true_res = sqrt(xmyNormCuda(b, r) / b2);
#if (__COMPUTE_CAPABILITY__ >= 200)
  param.true_res_hq = sqrt(HeavyQuarkResidualNormCuda(x,r).z);
#else
  param.true_res_hq = 0.0;
#endif

  PrintSummary("CG", k, r2, b2);

  // reset the flops counters
  quda::blas_flops = 0;
  mat.flops();
  matSloppy.flops();

  profile.Stop(QUDA_PROFILE_EPILOGUE);
  profile.Start(QUDA_PROFILE_FREE);

  // release only what this routine allocated
  if (&tmp2 != &tmp) delete tmp2_p;

  if (param.precision_sloppy != x.Precision()) {
    delete r_sloppy;
    delete x_sloppy;
  }

  profile.Stop(QUDA_PROFILE_FREE);

  return;
}
/**
 * Preconditioned conjugate-gradient solve with a (possibly lower-precision)
 * "sloppy" inner iteration and periodic reliable updates.  When the
 * preconditioner K is null the routine falls back to unpreconditioned CG.
 *
 * @param x  On entry the initial guess (it is passed through mat() to
 *           build the starting residual); on exit the accumulated solution.
 * @param b  Source field.  If its norm is exactly zero the routine copies
 *           b into x, zeroes param.true_res / param.true_res_hq and returns.
 *
 * On completion the routine fills param.secs, param.gflops and
 * param.true_res, and adds the iteration count to param.iter.
 */
void PreconCG::operator()(cudaColorSpinorField &x, cudaColorSpinorField &b) {
  profile.Start(QUDA_PROFILE_INIT);

  // Check to see that we're not trying to invert on a zero-field source
  const double b2 = norm2(b);
  if (b2 == 0) {
    profile.Stop(QUDA_PROFILE_INIT);
    printfQuda("Warning: inverting on zero-field source\n");
    x = b;
    param.true_res = 0.0;
    param.true_res_hq = 0.0;
    // nothing to solve; continuing would divide by b2 == 0 and by pAp == 0
    return;
  }

  int k = 0;
  int rUpdate = 0;

  // preconditioner work fields; they stay null when K is not set
  cudaColorSpinorField* minvrPre = 0;
  cudaColorSpinorField* rPre = 0;
  cudaColorSpinorField* minvr = 0;
  cudaColorSpinorField* minvrSloppy = 0;
  cudaColorSpinorField* p = 0;

  ColorSpinorParam csParam(b);
  cudaColorSpinorField r(b);
  if (K) minvr = new cudaColorSpinorField(b);
  csParam.create = QUDA_ZERO_FIELD_CREATE;
  cudaColorSpinorField y(b, csParam);

  // initial residual from the initial guess (y is handed to mat as workspace)
  mat(r, x, y); // => r = A*x;
  double r2 = xmyNormCuda(b, r);

  csParam.setPrecision(param.precision_sloppy);
  cudaColorSpinorField tmpSloppy(x, csParam);
  cudaColorSpinorField Ap(x, csParam);

  // sloppy residual fields alias the full-precision ones when precisions agree
  cudaColorSpinorField *r_sloppy;
  if (param.precision_sloppy == x.Precision()) {
    r_sloppy = &r;
    minvrSloppy = minvr;
  } else {
    csParam.create = QUDA_COPY_FIELD_CREATE;
    r_sloppy = new cudaColorSpinorField(r, csParam);
    if (K) minvrSloppy = new cudaColorSpinorField(*minvr, csParam);
  }

  // the sloppy solution also aliases x when no sloppy partial accumulator is requested
  cudaColorSpinorField *x_sloppy;
  if (param.precision_sloppy == x.Precision() || !param.use_sloppy_partial_accumulator) {
    csParam.create = QUDA_REFERENCE_FIELD_CREATE;
    x_sloppy = &x;
  } else {
    csParam.create = QUDA_COPY_FIELD_CREATE;
    x_sloppy = new cudaColorSpinorField(x, csParam);
  }

  cudaColorSpinorField &xSloppy = *x_sloppy;
  cudaColorSpinorField &rSloppy = *r_sloppy;

  if (&x != &xSloppy) {
    copyCuda(y, x); // copy x to y
    zeroCuda(xSloppy);
  } else {
    zeroCuda(y); // no reliable updates // NB: check this
  }

  const bool use_heavy_quark_res =
    (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) ? true : false;

  if (K) {
    csParam.create = QUDA_COPY_FIELD_CREATE;
    csParam.setPrecision(param.precision_precondition);
    rPre = new cudaColorSpinorField(rSloppy, csParam);
    // Create minvrPre
    minvrPre = new cudaColorSpinorField(*rPre);
    // globalReduce is switched off around every preconditioner application
    globalReduce = false;
    (*K)(*minvrPre, *rPre);
    globalReduce = true;
    *minvrSloppy = *minvrPre;
    p = new cudaColorSpinorField(*minvrSloppy);
  } else {
    p = new cudaColorSpinorField(rSloppy);
  }

  profile.Stop(QUDA_PROFILE_INIT);
  profile.Start(QUDA_PROFILE_PREAMBLE);

  double stop = stopping(param.tol, b2, param.residual_type); // stopping condition of solver
  double heavy_quark_res = 0.0; // heavy quark residual
  if (use_heavy_quark_res) heavy_quark_res = sqrt(HeavyQuarkResidualNormCuda(x,r).z);

  double alpha = 0.0, beta = 0.0;
  double pAp;
  double rMinvr = 0;
  double rMinvr_old = 0.0;
  double r_new_Minvr_old = 0.0;
  double r2_old = 0;
  r2 = norm2(r);

  double rNorm = sqrt(r2);
  double r0Norm = rNorm;
  double maxrx = rNorm;
  double maxrr = rNorm;
  double delta = param.delta;

  if (K) rMinvr = reDotProductCuda(rSloppy, *minvrSloppy);

  profile.Stop(QUDA_PROFILE_PREAMBLE);
  profile.Start(QUDA_PROFILE_COMPUTE);

  quda::blas_flops = 0;

  // number of consecutive residual increases tolerated at reliable updates
  const int maxResIncrease = 0;

  // Count of consecutive reliable updates that increased the residual.
  // Deliberately a per-call local (not a function-level static) so that one
  // solve cannot leak state into the next.
  int resIncrease = 0;

  while (!convergence(r2, heavy_quark_res, stop, param.tol_hq) && k < param.maxiter) {
    matSloppy(Ap, *p, tmpSloppy);

    double sigma;
    pAp = reDotProductCuda(*p, Ap);

    alpha = (K) ? rMinvr/pAp : r2/pAp;
    Complex cg_norm = axpyCGNormCuda(-alpha, Ap, rSloppy); // r --> r - alpha*A*p
    r2_old = r2;
    r2 = real(cg_norm);

    sigma = imag(cg_norm) >= 0.0 ? imag(cg_norm) : r2; // use r2 if (r_k+1, r_k-1 - r_k) breaks

    if (K) rMinvr_old = rMinvr;

    // reliable update conditions
    rNorm = sqrt(r2);
    if (rNorm > maxrx) maxrx = rNorm;
    if (rNorm > maxrr) maxrr = rNorm;

    int updateX = (rNorm < delta*r0Norm && r0Norm <= maxrx) ? 1 : 0;
    int updateR = ((rNorm < delta*maxrr && r0Norm <= maxrr) || updateX) ? 1 : 0;

    // force a reliable update if we are within target tolerance (only if doing reliable updates)
    if ( convergence(r2, heavy_quark_res, stop, param.tol_hq) && delta >= param.tol) updateX = 1;

    if ( !(updateR || updateX) ) {
      if (K) {
        r_new_Minvr_old = reDotProductCuda(rSloppy, *minvrSloppy);
        *rPre = rSloppy;
        globalReduce = false;
        (*K)(*minvrPre, *rPre);
        globalReduce = true;

        *minvrSloppy = *minvrPre;

        rMinvr = reDotProductCuda(rSloppy, *minvrSloppy);
        beta = (rMinvr - r_new_Minvr_old)/rMinvr_old;
        axpyZpbxCuda(alpha, *p, xSloppy, *minvrSloppy, beta);
      } else {
        beta = sigma/r2_old; // use the alternative beta computation
        axpyZpbxCuda(alpha, *p, xSloppy, rSloppy, beta);
      }
    } else { // reliable update
      axpyCuda(alpha, *p, xSloppy); // xSloppy += alpha*p
      copyCuda(x, xSloppy);
      xpyCuda(x, y); // y += x
      // Now compute r
      mat(r, y, x); // x is just a temporary here
      r2 = xmyNormCuda(b, r);
      copyCuda(rSloppy, r); // copy r to rSloppy
      zeroCuda(xSloppy);

      // break-out check if we have reached the limit of the precision
      if (sqrt(r2) > r0Norm && updateX) { // reuse r0Norm for this
        warningQuda("PCG: new reliable residual norm %e is greater than previous reliable residual norm %e", sqrt(r2), r0Norm);
        if (++resIncrease > maxResIncrease) {
          // count the aborted iteration and its reliable update exactly once;
          // when the increase is tolerated the normal path below counts them
          k++;
          rUpdate++;
          break;
        }
      } else {
        resIncrease = 0;
      }

      rNorm = sqrt(r2);
      maxrr = rNorm;
      maxrx = rNorm;
      r0Norm = rNorm;
      ++rUpdate;

      if (K) {
        *rPre = rSloppy;
        globalReduce = false;
        (*K)(*minvrPre, *rPre);
        globalReduce = true;

        *minvrSloppy = *minvrPre;

        rMinvr = reDotProductCuda(rSloppy, *minvrSloppy);
        beta = rMinvr/rMinvr_old;

        xpayCuda(*minvrSloppy, beta, *p); // p = minvrSloppy + beta*p
      } else { // standard CG - no preconditioning
        // explicitly restore the orthogonality of the gradient vector
        double rp = reDotProductCuda(rSloppy, *p)/(r2);
        axpyCuda(-rp, rSloppy, *p);

        beta = r2/r2_old;
        xpayCuda(rSloppy, beta, *p);
      }
    }

    ++k;
    PrintStats("PCG", k, r2, b2, heavy_quark_res);
  }

  profile.Stop(QUDA_PROFILE_COMPUTE);
  profile.Start(QUDA_PROFILE_EPILOGUE);

  // assemble the final solution; the copy is only meaningful when xSloppy is
  // a separate field rather than an alias of x
  if (&x != &xSloppy) copyCuda(x, xSloppy);
  xpyCuda(y, x); // x += y

  param.secs = profile.Last(QUDA_PROFILE_COMPUTE);
  double gflops = (quda::blas_flops + mat.flops() + matSloppy.flops() + matPrecon.flops())*1e-9;
  reduceDouble(gflops);
  param.gflops = gflops;
  param.iter += k;

  if (k==param.maxiter)
    warningQuda("Exceeded maximum iterations %d", param.maxiter);

  if (getVerbosity() >= QUDA_VERBOSE)
    printfQuda("CG: Reliable updates = %d\n", rUpdate);

  // compute the true residual
  mat(r, x, y);
  double true_res = xmyNormCuda(b, r);
  param.true_res = sqrt(true_res / b2);

  // reset the flops counters
  quda::blas_flops = 0;
  mat.flops();
  matSloppy.flops();
  matPrecon.flops();

  profile.Stop(QUDA_PROFILE_EPILOGUE);
  profile.Start(QUDA_PROFILE_FREE);

  if (K) { // These are only needed if preconditioning is used
    delete minvrPre;
    delete rPre;
    if (minvrSloppy != minvr) delete minvrSloppy; // separate field only when precisions differ
    delete minvr;
  }
  delete p;

  // Free the sloppy fields only when this routine allocated them.  x_sloppy
  // aliases the caller's x whenever use_sloppy_partial_accumulator is off,
  // even if the precisions differ, so ownership is decided by address rather
  // than by comparing precisions.
  if (x_sloppy != &x) delete x_sloppy;
  if (r_sloppy != &r) delete r_sloppy;

  profile.Stop(QUDA_PROFILE_FREE);
  return;
}