Exemplo n.º 1
0
  /*
   * Read tunecache from disk.
   */
  void loadTuneCache(QudaVerbosity verbosity)
  {
    char *path;
    struct stat pstat;
    std::string cache_path, line, token;
    std::ifstream cache_file;
    std::stringstream ls;

    path = getenv("QUDA_RESOURCE_PATH");
    if (!path) {
      warningQuda("Environment variable QUDA_RESOURCE_PATH is not set.");
      warningQuda("Caching of tuned parameters will be disabled.");
      return;
    } else if (stat(path, &pstat) || !S_ISDIR(pstat.st_mode)) {
      warningQuda("The path \"%s\" specified by QUDA_RESOURCE_PATH does not exist or is not a directory.", path); 
      warningQuda("Caching of tuned parameters will be disabled.");
      return;
    } else {
      resource_path = path;
    }

#ifdef MULTI_GPU
    if (comm_rank() == 0) {
#endif

      cache_path = resource_path;
      cache_path += "/tunecache.tsv";
      cache_file.open(cache_path.c_str());

      if (cache_file) {

	if (!cache_file.good()) errorQuda("Bad format in %s", cache_path.c_str());
	getline(cache_file, line);
	ls.str(line);
	ls >> token;
	if (token.compare("tunecache")) errorQuda("Bad format in %s", cache_path.c_str());
	ls >> token;
	if (token.compare(quda_version)) errorQuda("Cache file %s does not match current QUDA version", cache_path.c_str());
	ls >> token;
	if (token.compare(quda_hash)) warningQuda("Cache file %s does not match current QUDA build", cache_path.c_str());
      
	if (!cache_file.good()) errorQuda("Bad format in %s", cache_path.c_str());
	getline(cache_file, line); // eat the blank line
      
	if (!cache_file.good()) errorQuda("Bad format in %s", cache_path.c_str());
	getline(cache_file, line); // eat the description line
      
	deserializeTuneCache(cache_file);
	cache_file.close();      
	initial_cache_size = tunecache.size();

	if (verbosity >= QUDA_SUMMARIZE) {
	  printfQuda("Loaded %d sets of cached parameters from %s\n", static_cast<int>(initial_cache_size), cache_path.c_str());
	}
      
      } else {
Exemplo n.º 2
0
  /**
   * Print a summary of the timers accumulated in global_profile, labelled "QUDA".
   *
   * Every timer with a positive call count contributes to the accounted
   * total, but the per-timer lines stop being printed from index
   * QUDA_PROFILE_LOWER_LEVEL onwards.  A warning is issued when the accounted
   * time exceeds the recorded total.
   */
  void TimeProfile::PrintGlobal() {
    const double total_secs = global_profile[QUDA_PROFILE_TOTAL].time;

    if (total_secs > 0.0) {
      printfQuda("\n   %20s Total time = %g secs\n", "QUDA", total_secs);
    }

    double accounted = 0.0;
    bool show_detail = true; // cleared for good once the lower-level timers are reached
    for (int idx = 0; idx < QUDA_PROFILE_COUNT - 1; ++idx) {
      if (idx == QUDA_PROFILE_LOWER_LEVEL) show_detail = false;

      // timers that were never hit are neither printed nor accounted
      if (!(global_profile[idx].count > 0)) continue;

      const double secs = global_profile[idx].time;
      if (show_detail) {
        printfQuda("     %17s     = %f secs (%6.3g%%), with %8d calls at %e us per call\n",
                   (const char*)&pname[idx][0], secs, 100*secs/total_secs,
                   global_profile[idx].count, 1e6*secs/global_profile[idx].count);
      }
      accounted += secs;
    }

    if (accounted > 0.0) {
      const double missing = total_secs - accounted;
      printfQuda("     total accounted       = %f secs (%6.3g%%)\n",
                 accounted, 100*accounted/total_secs);
      printfQuda("     total missing         = %f secs (%6.3g%%)\n",
                 missing, 100*missing/total_secs);
    }

    if (accounted > total_secs) {
      warningQuda("Accounted time %f secs in %s is greater than total time %f secs\n",
                  accounted, "QUDA", total_secs);
    }
  }
Exemplo n.º 3
0
  /**
   * Print out the profile information held by this instance.
   *
   * One line is printed per timer with a positive call count, followed by the
   * accounted and missing time relative to the QUDA_PROFILE_TOTAL timer.  A
   * warning is issued when the accounted time exceeds the recorded total.
   */
  void TimeProfile::Print() {
    const double total_secs = profile[QUDA_PROFILE_TOTAL].time;

    if (total_secs > 0.0) {
      printfQuda("\n   %20s Total time = %g secs\n", fname.c_str(), total_secs);
    }

    double accounted = 0.0;
    for (int idx = 0; idx < QUDA_PROFILE_COUNT - 1; ++idx) {
      // timers that were never hit are neither printed nor accounted
      if (!(profile[idx].count > 0)) continue;

      const double secs = profile[idx].time;
      printfQuda("     %17s     = %f secs (%6.3g%%), with %8d calls at %e us per call\n",
                 (const char*)&pname[idx][0], secs, 100*secs/total_secs,
                 profile[idx].count, 1e6*secs/profile[idx].count);
      accounted += secs;
    }

    if (accounted > 0.0) {
      const double missing = total_secs - accounted;
      printfQuda("     total accounted       = %f secs (%6.3g%%)\n",
                 accounted, 100*accounted/total_secs);
      printfQuda("     total missing         = %f secs (%6.3g%%)\n",
                 missing, 100*missing/total_secs);
    }

    if (accounted > total_secs) {
      warningQuda("Accounted time %f secs in %s is greater than total time %f secs\n",
                  accounted, (const char*)&fname[0], total_secs);
    }
  }
Exemplo n.º 4
0
/**
 * Parse a comma-separated list of CPU core specifiers into an integer array.
 *
 * Each item is either a single number or a range (see
 * process_core_string_item for the item format).
 *
 * @param _str   NUL-terminated input string; must be shorter than 256 characters
 * @param list   output array receiving the core numbers
 * @param ncores in: capacity of @p list; out: number of entries written
 * @return 0 on success, -1 on bad arguments, over-long input or a parse failure
 */
static int
process_core_string_list(const char* _str, int* list, int* ncores)
{
  if(_str == NULL || list == NULL || ncores == NULL
     || *ncores <= 0){
    warningQuda("Bad argument");
    return  -1;
  }

  // strtok modifies its input, so work on a private copy
  char str[256];
  str[sizeof(str) - 1] = '\0';
  strncpy(str, _str, sizeof(str));
  // strncpy zero-pads short inputs but does NOT terminate when the source
  // fills the buffer; a non-NUL last byte therefore means the input was too
  // long and tokenising it would read past the end of str.
  if(str[sizeof(str) - 1] != '\0'){
    warningQuda("Core list string is too long");
    return -1;
  }

  int left_space = *ncores; // remaining capacity in list
  int tot_cores = 0;        // entries written so far

  char* item = strtok(str, ",");
  if(item == NULL){
    warningQuda("Invalid string format (%s)", _str);
    return -1;
  }

  do {
    // hand each item the unused tail of the output array
    int sub_ncores = left_space;
    int* sub_list = list + tot_cores;

    int rc = process_core_string_item(item, sub_list, &sub_ncores);
    if(rc <0){
      warningQuda("Processing item (%s) failed", item);
      return -1;
    }

    tot_cores += sub_ncores;
    left_space -= sub_ncores;

    item = strtok(NULL, ",");
  }while( item != NULL);

  *ncores = tot_cores;
  return 0;
}
Exemplo n.º 5
0
/**
 * Parse one core specifier into a list of core numbers.
 *
 * The accepted formats are
 *  1. a single non-negative number, e.g. "5"
 *  2. an ascending range, e.g. "4-6", which expands to 4,5,6
 *
 * @param str        NUL-terminated item text
 * @param sub_list   output array receiving the core numbers
 * @param sub_ncores in: capacity of @p sub_list; out: number of entries written
 * @return 0 on success, -1 on bad arguments, malformed input, a negative or
 *         reversed range, or insufficient space in @p sub_list
 */
static int 
process_core_string_item(const char* str, int* sub_list, int* sub_ncores)
{
  if(str == NULL || sub_list == NULL || sub_ncores == NULL ||
     *sub_ncores <= 0){
    warningQuda("Bad argument");
    return -1;
  }

  if(strstr(str, "-") != NULL){
    //a range
    int low_core, high_core;
    if (sscanf(str,"%d-%d",&low_core, &high_core) != 2){
      warningQuda("Range scan failed");
      return -1;
    }
    // a reversed or negative range would otherwise produce a negative count
    // (and negative core ids) that callers add to their running totals
    if(low_core < 0 || high_core < low_core){
      warningQuda("Invalid core range");
      return -1;
    }
    // compare without the "+1" so the subtraction cannot overflow
    if(high_core - low_core >= *sub_ncores){
      warningQuda("Not enough space in sub_list");
      return -1;
    }

    const int count = high_core - low_core + 1;
    for(int i = 0; i < count; i++){
      sub_list[i] = i + low_core;
    }
    *sub_ncores = count;

  }else{
    //a number
    int core;
    if (sscanf(str, "%d", &core) != 1){
      warningQuda("Wrong format for core number");
      return -1;
    }
    if(core < 0){
      warningQuda("Wrong format for core number");
      return -1;
    }
    sub_list[0] = core;
    *sub_ncores   =1;
  }
  return 0;
}
Exemplo n.º 6
0
/**
 * Look up the CPU cores that share a PCI bus with the given GPU.
 *
 * The bus location is read from /proc/driver/nvidia/gpus/<my_gpu>/information
 * and the core list from /sys/class/pci_bus/<bus>/cpulistaffinity, which is
 * parsed with process_core_string_list.
 *
 * @param my_gpu    GPU index used to build the /proc path
 * @param cpu_cores output array receiving the core numbers
 * @param ncores    in: capacity of @p cpu_cores; out: number of cores found
 * @return 0 on success; -1 if either file cannot be opened, no bus location
 *         is found, the core list is empty, or it cannot be parsed
 */
static int 
getNumaAffinity(int my_gpu, int *cpu_cores, int* ncores)
{
  // Fixed-size line buffer read with fgets: this avoids handing a
  // safe_malloc'ed buffer to getline(), which is allowed to realloc() it.
  char line[256];
  char nvidia_info_path[255], pci_bus_info_path[255];
  char bus_info[255];

  // the nvidia driver populates this path for each gpu
  snprintf(nvidia_info_path, sizeof(nvidia_info_path),
	   "/proc/driver/nvidia/gpus/%d/information", my_gpu);
  FILE *nvidia_info = fopen(nvidia_info_path,"r");
  if (nvidia_info == NULL){
    return -1;
  }

  // stays empty unless a "Bus Location" line is found below
  pci_bus_info_path[0] = '\0';

  while (fgets(line, sizeof(line), nvidia_info) != NULL){
    // the first 7 char of the Bus Location will lead to the corresponding
    // path under /sys/class/pci_bus/  , cpulistaffinity showing cores on that
    // bus is located there
    if ( 1 == sscanf(line,"Bus Location: %254s", bus_info )){
      snprintf(pci_bus_info_path, sizeof(pci_bus_info_path),
	       "/sys/class/pci_bus/%.7s/cpulistaffinity", bus_info);
    }
  }
  fclose(nvidia_info);

  if (pci_bus_info_path[0] == '\0'){
    // no bus location reported, so there is no affinity file to consult
    return -1;
  }

  // open the cpulistaffinity file on the pci_bus for "my_gpu"
  FILE *pci_bus_info = fopen(pci_bus_info_path,"r");
  if (pci_bus_info == NULL){
    //printfQuda("Warning: opening file %s failed\n", pci_bus_info_path);
    return -1;
  }

  int status = 0;
  int lines_parsed = 0;
  while (fgets(line, sizeof(line), pci_bus_info) != NULL){
    int rc = process_core_string_list(line, cpu_cores, ncores);
    if(rc < 0){
      warningQuda("Failed to process the line \"%s\"", line);
      status = -1;
      break;
    }
    lines_parsed++;
  }
  fclose(pci_bus_info);

  // an empty file would otherwise report success with cpu_cores unfilled
  if (status == 0 && lines_parsed == 0){
    status = -1;
  }

  return status;
}
Exemplo n.º 7
0
/**
 * Save the current verbosity on the verbosity stack and switch to a new one.
 *
 * A warning is emitted once the stack holds more than 10 entries, since that
 * usually indicates pushes that were never matched by a pop.
 *
 * @param verbosity verbosity level to make current
 */
void pushVerbosity(QudaVerbosity verbosity)
{
  // remember the level in force before overriding it
  vstack.push(getVerbosity());
  setVerbosity(verbosity);

  const bool suspiciously_deep = vstack.size() > 10;
  if (!suspiciously_deep) {
    return;
  }

  warningQuda("Verbosity stack contains %u elements.  Is there a missing popVerbosity() somewhere?",
	      static_cast<unsigned int>(vstack.size()));
}
Exemplo n.º 8
0
/**
 * Pin the calling process to one CPU core that shares a PCI bus with a GPU.
 *
 * The candidate cores come from getNumaAffinity(); the core used is selected
 * by devid modulo the number of candidates.
 *
 * @param devid device index
 * @return 0 on success, 1 if the affinity could not be determined,
 *         -1 if sched_setaffinity() failed
 */
int 
setNumaAffinity(int devid)
{
  const int max_cores = 128;
  int cpu_cores[max_cores];
  int ncores = max_cores;
  int rc = getNumaAffinity(devid, cpu_cores, &ncores);
  // also reject an empty or out-of-range core count: it would make the
  // modulo below divide by zero or index outside cpu_cores
  if(rc != 0 || ncores <= 0 || ncores > max_cores){
    warningQuda("Failed to determine NUMA affinity for device %d (possibly not applicable)", devid);
    return 1;
  }

  int which = devid % ncores;
  if (which < 0) which += ncores; // '%' may yield a negative value for negative devid

  const int core = cpu_cores[which];
  // CPU_SET is only meaningful for ids in [0, CPU_SETSIZE)
  if (core < 0 || core >= CPU_SETSIZE){
    warningQuda("Failed to determine NUMA affinity for device %d (possibly not applicable)", devid);
    return 1;
  }
  printfQuda("Setting NUMA affinity for device %d to CPU core %d\n", devid, core);

  cpu_set_t cpu_set;
  CPU_ZERO(&cpu_set);
  CPU_SET(core, &cpu_set);

  // pid 0 applies the mask to the calling thread
  rc = sched_setaffinity(0, sizeof(cpu_set_t), &cpu_set);
  if (rc != 0){
    warningQuda("Failed to enforce NUMA affinity (probably due to lack of kernel support)");
    return -1;
  }

  return 0;
}
Exemplo n.º 9
0
/**
 * Conjugate-gradient solve operating on x (solution / initial guess) and b
 * (source).
 *
 * The iteration runs on "sloppy" fields created at precision
 * invParam.cuda_prec_sloppy and, when the reliable-update conditions below
 * trigger, recomputes the residual with the operator `mat` (counted in
 * rUpdate and reported as "Reliable updates").  On return invParam.secs,
 * invParam.gflops and invParam.iter have been filled in.
 *
 * NOTE(review): mat/matSloppy and the *Cuda BLAS helpers are defined
 * elsewhere; remarks below about what they compute are inferred from their
 * names and usage and should be confirmed.
 */
void CG::operator()(cudaColorSpinorField &x, cudaColorSpinorField &b) 
{
  int k=0;          // iteration counter
  int rUpdate = 0;  // number of residual recomputations with `mat`
    
  cudaColorSpinorField r(b);

  ColorSpinorParam param(x);
  param.create = QUDA_ZERO_FIELD_CREATE;
  cudaColorSpinorField y(b, param); 
  
  // presumably r = mat * x with y as workspace; y is then cleared so it can
  // accumulate the solution during reliable updates -- confirm
  mat(r, x, y);
  zeroCuda(y);

  // r2 is used below as the squared residual norm
  double r2 = xmyNormCuda(b, r);
  rUpdate ++;
  
  // temporaries for the sloppy operator are created at the sloppy precision
  param.precision = invParam.cuda_prec_sloppy;
  cudaColorSpinorField Ap(x, param);
  cudaColorSpinorField tmp(x, param);
  cudaColorSpinorField tmp2(x, param); // only needed for clover and twisted mass

  // When the sloppy precision equals x's precision the full-precision fields
  // are aliased; otherwise separate sloppy copies are heap-allocated and
  // released at the end of this function.
  cudaColorSpinorField *x_sloppy, *r_sloppy;
  if (invParam.cuda_prec_sloppy == x.Precision()) {
    param.create = QUDA_REFERENCE_FIELD_CREATE;
    x_sloppy = &x;
    r_sloppy = &r;
  } else {
    param.create = QUDA_COPY_FIELD_CREATE;
    x_sloppy = new cudaColorSpinorField(x, param);
    r_sloppy = new cudaColorSpinorField(r, param);
  }

  cudaColorSpinorField &xSloppy = *x_sloppy;
  cudaColorSpinorField &rSloppy = *r_sloppy;

  // initial search direction is the (sloppy) residual
  cudaColorSpinorField p(rSloppy);

  double r2_old;
  double src_norm = norm2(b);
  double stop = src_norm*invParam.tol*invParam.tol; // stopping condition of solver

  double alpha, beta;
  double pAp;

  // bookkeeping for the reliable-update conditions
  double rNorm = sqrt(r2);
  double r0Norm = rNorm;
  double maxrx = rNorm;
  double maxrr = rNorm;
  double delta = invParam.reliable_delta;

  if (invParam.verbosity >= QUDA_VERBOSE) printfQuda("CG: %d iterations, r2 = %e\n", k, r2);

  quda::blas_flops = 0;

  stopwatchStart();
  // iterate until r2 reaches the stopping threshold or maxiter is hit
  while (r2 > stop && k<invParam.maxiter) {

    matSloppy(Ap, p, tmp, tmp2); // tmp as tmp
    
    pAp = reDotProductCuda(p, Ap);
    alpha = r2 / pAp;        
    r2_old = r2;
    r2 = axpyNormCuda(-alpha, Ap, rSloppy);

    // reliable update conditions
    rNorm = sqrt(r2);
    if (rNorm > maxrx) maxrx = rNorm;
    if (rNorm > maxrr) maxrr = rNorm;
    int updateX = (rNorm < delta*r0Norm && r0Norm <= maxrx) ? 1 : 0;
    int updateR = ((rNorm < delta*maxrr && r0Norm <= maxrr) || updateX) ? 1 : 0;
    
    if (!(updateR || updateX)) {
      // ordinary step: single fused update of xSloppy and p
      beta = r2 / r2_old;
      axpyZpbxCuda(alpha, p, xSloppy, rSloppy, beta);
    } else {
      // reliable update: finish the xSloppy update, fold it into y and
      // recompute the residual from y with the operator `mat`
      axpyCuda(alpha, p, xSloppy);
      if (x.Precision() != xSloppy.Precision()) copyCuda(x, xSloppy);
      
      xpyCuda(x, y); // swap these around?
      mat(r, y, x); // here we can use x as tmp
      r2 = xmyNormCuda(b, r);
      if (x.Precision() != rSloppy.Precision()) copyCuda(rSloppy, r);            
      zeroCuda(xSloppy);

      // restart the reliable-update bookkeeping from the recomputed residual
      rNorm = sqrt(r2);
      maxrr = rNorm;
      maxrx = rNorm;
      r0Norm = rNorm;      
      rUpdate++;

      beta = r2 / r2_old; 
      xpayCuda(rSloppy, beta, p);
    }

    k++;
    if (invParam.verbosity >= QUDA_VERBOSE)
      printfQuda("CG: %d iterations, r2 = %e\n", k, r2);
  }

  // combine the last sloppy partial solution with the accumulator y
  // (presumably x += y -- confirm against xpyCuda)
  if (x.Precision() != xSloppy.Precision()) copyCuda(x, xSloppy);
  xpyCuda(y, x);

  invParam.secs = stopwatchReadSeconds();

  
  if (k==invParam.maxiter) 
    warningQuda("Exceeded maximum iterations %d", invParam.maxiter);

  if (invParam.verbosity >= QUDA_SUMMARIZE)
    printfQuda("CG: Reliable updates = %d\n", rUpdate);

  double gflops = (quda::blas_flops + mat.flops() + matSloppy.flops())*1e-9;
  reduceDouble(gflops);

  //  printfQuda("%f gflops\n", gflops / stopwatchReadSeconds());
  invParam.gflops = gflops;
  invParam.iter = k;

  quda::blas_flops = 0;

  // the true residual is only recomputed when it is going to be printed
  if (invParam.verbosity >= QUDA_SUMMARIZE){
    mat(r, x, y);
    double true_res = xmyNormCuda(b, r);
    printfQuda("CG: Converged after %d iterations, relative residua: iterated = %e, true = %e\n", 
	       k, sqrt(r2/src_norm), sqrt(true_res / src_norm));    
  }

  // the sloppy fields were heap-allocated only when the precisions differ
  if (invParam.cuda_prec_sloppy != x.Precision()) {
    delete r_sloppy;
    delete x_sloppy;
  }

  return;
}
Exemplo n.º 10
0
/**
 * Read a NUMA configuration file and record the GPU -> NUMA node mapping.
 *
 * Lines starting with '#' and blank lines are ignored.  Every other line
 * must have the form
 *
 *     affinity <gpunum> <nodenum>
 *
 * and sets gpu_affinity[gpunum] = nodenum.  Only the first call does any
 * work; later calls return immediately.  numa_config_set is set to 1 only
 * when the whole file has been parsed successfully.
 *
 * @param filename path of the configuration file (must be non-NULL and
 *                 shorter than 128 characters)
 */
void qudaSetNumaConfig(char* filename)
{
  static int already_set = 0;
  if(already_set){
	return;
  }
  already_set =1;

  if(filename ==NULL){
    errorQuda("numa config filename is NULL\n");
    return; // not reached if errorQuda aborts; avoids strlen(NULL) otherwise
  }
  if(strlen(filename) >= 128){
    errorQuda("numa config filename too long\n");
    return;
  }
  
  FILE* fd = fopen(filename, "r");
  if (fd == NULL){
    warningQuda("opening numa config file(%s) failed",filename );
    return;
  }
  
  // start from "no affinity configured" for every GPU slot
  for(int i=0;i < MAX_GPU_NUM_PER_NODE; i++){
    gpu_affinity[i] = -1;
  }


  char buf[1024];
  while ( fgets(buf, 1024, fd) != NULL){
    if (buf[0]== '\n' || buf[0] == '#'){
      continue;
    }
    
    char* token[3];
    token[0] = (char*)strtok(buf, " \t\n");
    token[1] = (char*)strtok(NULL, " \t\n");
    token[2] = (char*)strtok(NULL, " \t\n");

    // a line holding only whitespace yields no tokens at all; treat it like
    // a blank line instead of passing NULL to strcmp
    if (token[0] == NULL){
      continue;
    }
    
    if(strcmp(token[0], "affinity") != 0){
      warningQuda("Invalid format for the numa config file\n");
      fclose(fd);
      return ;
    }

    if (token[1] == NULL || token[2] == NULL){
      warningQuda("invalid entry for affinity\n");
      fclose(fd);
      return;
    }
    int gpunum = atoi(token[1]);
    int nodenum = atoi(token[2]);
    // gpunum indexes gpu_affinity, so it must also stay below the table size
    if(gpunum < 0 || gpunum >= MAX_GPU_NUM_PER_NODE || nodenum < 0){
      warningQuda("Invalid gpunum(%d) or nodenum(%d)\n", gpunum, nodenum);
      fclose(fd);
      return;
    }
    gpu_affinity[gpunum] = nodenum;
  }
  
  fclose(fd);
  
  numa_config_set = 1;
  
  return;
}
Exemplo n.º 11
0
void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param)
{

  if (!h_clover && !h_clovinv) {
    errorQuda("loadCloverQuda() called with neither clover term nor inverse");
  }
  if (inv_param->clover_cpu_prec == QUDA_HALF_PRECISION) {
    errorQuda("Half precision not supported on CPU");
  }
  if (gaugePrecise == NULL) {
    errorQuda("Gauge field must be loaded before clover");
  }
  if (inv_param->dslash_type != QUDA_CLOVER_WILSON_DSLASH) {
    errorQuda("Wrong dslash_type in loadCloverQuda()");
  }

  // determines whether operator is preconditioned when calling invertQuda()
  bool pc_solve = (inv_param->solve_type == QUDA_DIRECT_PC_SOLVE ||
		   inv_param->solve_type == QUDA_NORMEQ_PC_SOLVE);

  // determines whether operator is preconditioned when calling MatQuda() or MatDagMatQuda()
  bool pc_solution = (inv_param->solution_type == QUDA_MATPC_SOLUTION ||
		      inv_param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION);

  bool asymmetric = (inv_param->matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC ||
		     inv_param->matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC);

  // We issue a warning only when it seems likely that the user is screwing up:

  // inverted clover term is required when applying preconditioned operator
  if (!h_clovinv && pc_solve && pc_solution) {
    warningQuda("Inverted clover term not loaded");
  }

  // uninverted clover term is required when applying unpreconditioned operator,
  // but note that dslashQuda() is always preconditioned
  if (!h_clover && !pc_solve && !pc_solution) {
    //warningQuda("Uninverted clover term not loaded");
  }

  // uninverted clover term is also required for "asymmetric" preconditioning
  if (!h_clover && pc_solve && pc_solution && asymmetric) {
    warningQuda("Uninverted clover term not loaded");
  }

  CloverFieldParam clover_param;
  clover_param.nDim = 4;
  for (int i=0; i<4; i++) clover_param.x[i] = gaugePrecise->X()[i];
  clover_param.precision = inv_param->clover_cuda_prec;
  clover_param.pad = inv_param->cl_pad;

  cloverPrecise = new cudaCloverField(h_clover, h_clovinv, inv_param->clover_cpu_prec, 
				      inv_param->clover_order, clover_param);
  inv_param->cloverGiB = cloverPrecise->GBytes();
  
  if (inv_param->clover_cuda_prec != inv_param->clover_cuda_prec_sloppy) {
    clover_param.precision = inv_param->clover_cuda_prec_sloppy;
    cloverSloppy = new cudaCloverField(h_clover, h_clovinv, inv_param->clover_cpu_prec, 
				       inv_param->clover_order, clover_param); 
    inv_param->cloverGiB += cloverSloppy->GBytes();
  } else {
    cloverSloppy = cloverPrecise;
  }

  endInvertQuda(); // need to delete any persistant dirac operators
}
Exemplo n.º 12
0
  /**
   * CG3 solve operating on x (solution / initial guess) and b (source).
   *
   * After an explicit first step, each iteration builds the next iterate
   * from the current and the previous one: x_new is formed from x, r and
   * x_prev with weights derived from rho and gamma, and likewise r_new from
   * r, w and r_prev.  This appears to be a three-term-recurrence form of
   * conjugate gradient -- confirm.  On normal exit param.true_res holds the
   * relative true residual.
   *
   * NOTE(review): mat and the *Cuda BLAS helpers are defined elsewhere;
   * remarks below about what they compute follow the existing inline
   * comments and their names.
   */
  void CG3::operator()(cudaColorSpinorField &x, cudaColorSpinorField &b) 
  {

    // Check to see that we're not trying to invert on a zero-field source    
    const double b2 = norm2(b);
    if(b2 == 0){
      // NOTE(review): no matching profile start is visible in this function;
      // confirm the timer was started by the caller.
      profile.TPSTOP(QUDA_PROFILE_INIT);
      printfQuda("Warning: inverting on zero-field source\n");
      x=b;
      param.true_res = 0.0;
      param.true_res_hq = 0.0;
      return;
    }

    ColorSpinorParam csParam(x);
    csParam.create = QUDA_ZERO_FIELD_CREATE;
  
    
    // zero-initialised history fields and operator workspace
    cudaColorSpinorField x_prev(b, csParam);  
    cudaColorSpinorField r_prev(b, csParam);
    cudaColorSpinorField temp(b, csParam);

    cudaColorSpinorField r(b);
    cudaColorSpinorField w(b);


    mat(r, x, temp);  // r = Mx
    double r2 = xmyNormCuda(b,r); // r = b - Mx
    PrintStats("CG3", 0, r2, b2, 0.0);


    double stop = stopping(param.tol, b2, param.residual_type);
    // NOTE(review): this early return leaves param.true_res untouched
    if(convergence(r2, 0.0, stop, 0.0)) return;
    // First iteration 
    mat(w, r, temp);
    double rAr = reDotProductCuda(r,w);
    double rho = 1.0;
    double gamma_prev = 0.0;
    double gamma = r2/rAr;


    cudaColorSpinorField x_new(x);
    cudaColorSpinorField r_new(r);
    axpyCuda(gamma, r, x_new);  // x_new += gamma*r
    axpyCuda(-gamma, w, r_new); // r_new -= gamma*w
    // end of first iteration  

    // axpbyCuda(a,b,x,y) => y = a*x + b*y

    int k = 1; // First iteration performed above

    double r2_prev;
    while(!convergence(r2, 0.0, stop, 0.0) && k<param.maxiter){
      // shift the history: (prev, current) <- (current, new)
      x_prev = x; x = x_new;
      r_prev = r; r = r_new;
      mat(w, r, temp);
      rAr = reDotProductCuda(r,w);
      r2_prev = r2;
      r2 = norm2(r);

      // Need to rearrange this!
      PrintStats("CG3", k, r2, b2, 0.0);

      gamma_prev = gamma;
      gamma = r2/rAr;
      rho = 1.0/(1. - (gamma/gamma_prev)*(r2/r2_prev)*(1.0/rho));
      
      // x_new = rho*x + rho*gamma*r + (1-rho)*x_prev
      x_new = x;
      axCuda(rho,x_new); 
      axpyCuda(rho*gamma,r,x_new);
      axpyCuda((1. - rho),x_prev,x_new);

      // r_new = rho*r - rho*gamma*w + (1-rho)*r_prev
      r_new = r;
      axCuda(rho,r_new);
      axpyCuda(-rho*gamma,w,r_new);
      axpyCuda((1.-rho),r_prev,r_new);


      // NOTE(review): printed on every iteration regardless of verbosity;
      // looks like leftover diagnostic output -- confirm it is intended.
       double rr_old = reDotProductCuda(r_new,r);
      printfQuda("rr_old = %1.14lf\n", rr_old);


 
      k++;
    }


    if(k == param.maxiter)
      warningQuda("Exceeded maximum iterations %d", param.maxiter);

    // compute the true residual
    mat(r, x, temp);
    param.true_res = sqrt(xmyNormCuda(b, r)/b2);

    PrintSummary("CG3", k, r2, b2);

    return;
  }
Exemplo n.º 13
0
/**
 * Multi-shift conjugate-gradient solve.
 *
 * Iterates on the system for offset[0] and updates the solutions for the
 * remaining invParam.num_offset - 1 shifts from the same iteration through
 * the zeta/beta/alpha recurrences.  The iteration runs on "sloppy" fields at
 * precision invParam.cuda_prec_sloppy; results are copied back into x when
 * that precision differs from x's.
 *
 * On return invParam.secs, invParam.gflops and invParam.iter are filled in
 * and invParam.tol_offset[0] holds the final iterated r2.
 *
 * NOTE(review): matSloppy/mat and the *Cuda BLAS helpers are defined
 * elsewhere; what they compute is inferred from names and usage.
 *
 * @param x array of invParam.num_offset solution fields, one per shift
 * @param b source field
 */
void MultiShiftCG::operator()(cudaColorSpinorField **x, cudaColorSpinorField &b)
{
 
  int num_offset = invParam.num_offset;
  double *offset = invParam.offset;
  double *residue_sq = invParam.tol_offset;
 
  if (num_offset == 0) return;

  // per-shift recurrence coefficients
  int *finished = new int [num_offset];
  double *zeta_i = new double[num_offset];
  double *zeta_im1 = new double[num_offset];
  double *zeta_ip1 = new double[num_offset];
  double *beta_i = new double[num_offset];
  double *beta_im1 = new double[num_offset];
  double *alpha = new double[num_offset];
  int i, j;
  
  int j_low = 0;   
  int num_offset_now = num_offset; // shifts still being updated
  for (i=0; i<num_offset; i++) {
    finished[i] = 0;
    zeta_im1[i] = zeta_i[i] = 1.0;
    beta_im1[i] = -1.0;
    alpha[i] = 0.0;
  }
  
  //double msq_x4 = offset[0];

  cudaColorSpinorField *r = new cudaColorSpinorField(b);
  
  cudaColorSpinorField **x_sloppy = new cudaColorSpinorField*[num_offset], *r_sloppy;
  
  ColorSpinorParam param;
  param.create = QUDA_ZERO_FIELD_CREATE;
  param.precision = invParam.cuda_prec_sloppy;
  
  // Same precision: alias the caller's fields (and clear them).  Otherwise
  // allocate separate sloppy fields, which are released at the end.
  if (invParam.cuda_prec_sloppy == x[0]->Precision()) {
    for (i=0; i<num_offset; i++){
      x_sloppy[i] = x[i];
      zeroCuda(*x_sloppy[i]);
    }
    r_sloppy = r;
  } else {
    for (i=0; i<num_offset; i++) {
      x_sloppy[i] = new cudaColorSpinorField(*x[i], param);
    }
    param.create = QUDA_COPY_FIELD_CREATE;
    r_sloppy = new cudaColorSpinorField(*r, param);
  }
  
  // one search direction per shift, all starting from the residual
  cudaColorSpinorField **p = new cudaColorSpinorField*[num_offset];  
  for(i=0;i < num_offset;i++){
    p[i]= new cudaColorSpinorField(*r_sloppy);    
  }
  
  param.create = QUDA_ZERO_FIELD_CREATE;
  param.precision = invParam.cuda_prec_sloppy;
  cudaColorSpinorField* Ap = new cudaColorSpinorField(*r_sloppy, param);
  
  double b2 = 0.0;
  b2 = normCuda(b);
    
  double r2 = b2;
  double r2_old;
  double stop = r2*invParam.tol*invParam.tol; // stopping condition of solver
    
  double pAp;
    
  int k = 0;
    
  stopwatchStart();
  while (r2 > stop &&  k < invParam.maxiter) {
    //dslashCuda_st(tmp_sloppy, fatlinkSloppy, longlinkSloppy, p[0], 1 - oddBit, 0);
    //dslashAxpyCuda(Ap, fatlinkSloppy, longlinkSloppy, tmp_sloppy, oddBit, 0, p[0], msq_x4);
    matSloppy(*Ap, *p[0]);
    // for non-asqtad operators the shift is added explicitly here
    if (invParam.dslash_type != QUDA_ASQTAD_DSLASH){
      axpyCuda(offset[0], *p[0], *Ap);
    }
    pAp = reDotProductCuda(*p[0], *Ap);
    beta_i[0] = r2 / pAp;        

    zeta_ip1[0] = 1.0;
    for (j=1; j<num_offset_now; j++) {
      zeta_ip1[j] = zeta_i[j] * zeta_im1[j] * beta_im1[j_low];
      double c1 = beta_i[j_low] * alpha[j_low] * (zeta_im1[j]-zeta_i[j]);
      double c2 = zeta_im1[j] * beta_im1[j_low] * (1.0+(offset[j]-offset[0])*beta_i[j_low]);
      /*THISBLOWSUP
	zeta_ip1[j] /= c1 + c2;
	beta_i[j] = beta_i[j_low] * zeta_ip1[j] / zeta_i[j];
      */
      /*TRYTHIS*/
      // guard the divisions: a zero denominator marks the shift as finished
      if( (c1+c2) != 0.0 )
	zeta_ip1[j] /= (c1 + c2); 
      else {
	zeta_ip1[j] = 0.0;
	finished[j] = 1;
      }
      if( zeta_i[j] != 0.0) {
	beta_i[j] = beta_i[j_low] * zeta_ip1[j] / zeta_i[j];
      } else  {
	zeta_ip1[j] = 0.0;
	beta_i[j] = 0.0;
	finished[j] = 1;
	if (invParam.verbosity >= QUDA_VERBOSE)
	  printfQuda("SETTING A ZERO, j=%d, num_offset_now=%d\n",j,num_offset_now);
	//if(j==num_offset_now-1)node0_PRINTF("REDUCING OFFSET\n");
	if(j==num_offset_now-1) num_offset_now--;
	// don't work any more on finished solutions
	// this only works if largest offsets are last, otherwise
	// just wastes time multiplying by zero
      }
    }	
	
    r2_old = r2;
    r2 = axpyNormCuda(-beta_i[j_low], *Ap, *r_sloppy);

    alpha[0] = r2 / r2_old;
	
    for (j=1; j<num_offset_now; j++) {
      /*THISBLOWSUP
	alpha[j] = alpha[j_low] * zeta_ip1[j] * beta_i[j] /
	(zeta_i[j] * beta_i[j_low]);
      */
      /*TRYTHIS*/
      if( zeta_i[j] * beta_i[j_low] != 0.0)
	alpha[j] = alpha[j_low] * zeta_ip1[j] * beta_i[j] /
	  (zeta_i[j] * beta_i[j_low]);
      else {
	alpha[j] = 0.0;
	finished[j] = 1;
      }
    }
	
    // update solution and search direction for every active shift
    axpyZpbxCuda(beta_i[0], *p[0], *x_sloppy[0], *r_sloppy, alpha[0]);	
    for (j=1; j<num_offset_now; j++) {
      axpyBzpcxCuda(beta_i[j], *p[j], *x_sloppy[j], zeta_ip1[j], *r_sloppy, alpha[j]);
    }
    
    // advance the recurrences by one step
    for (j=0; j<num_offset_now; j++) {
      beta_im1[j] = beta_i[j];
      zeta_im1[j] = zeta_i[j];
      zeta_i[j] = zeta_ip1[j];
    }

    k++;
    if (invParam.verbosity >= QUDA_VERBOSE){
      printfQuda("Multimass CG: %d iterations, r2 = %e\n", k, r2);
    }
  }
    
  // copy the sloppy solutions back when they live in separate fields
  if (x[0]->Precision() != x_sloppy[0]->Precision()) {
    for(i=0;i < num_offset; i++){
      copyCuda(*x[i], *x_sloppy[i]);
    }
  }

  *residue_sq = r2;

  invParam.secs = stopwatchReadSeconds();
     
  if (k==invParam.maxiter) {
    warningQuda("Exceeded maximum iterations %d\n", invParam.maxiter);
  }
    
  double gflops = (quda::blas_flops + mat.flops() + matSloppy.flops())*1e-9;
  reduceDouble(gflops);

  invParam.gflops = gflops;
  invParam.iter = k;
  
  // Calculate the true residual of the system with the smallest shift
  mat(*r, *x[0]); 
  axpyCuda(offset[0],*x[0], *r); // Offset it.

  double true_res = xmyNormCuda(b, *r);
  if (invParam.verbosity >= QUDA_SUMMARIZE){
    printfQuda("MultiShift CG: Converged after %d iterations, r2 = %e, relative true_r2 = %e\n", 
	       k,r2, (true_res / b2));
  }    
  if (invParam.verbosity >= QUDA_VERBOSE){
    printfQuda("MultiShift CG: Converged after %d iterations\n", k);
    printfQuda(" shift=0 resid_rel=%e\n", sqrt(true_res/b2));
    for(int i=1; i < num_offset; i++) { 
      mat(*r, *x[i]); 
      axpyCuda(offset[i],*x[i], *r); // Offset it.
      true_res = xmyNormCuda(b, *r);
      printfQuda(" shift=%d resid_rel=%e\n",i, sqrt(true_res/b2));
    }
  }      
  
  delete r;
  for(i=0;i < num_offset; i++){
    delete p[i];
  }
  // p was allocated with new[], so it must be released with delete[]
  delete [] p;
  delete Ap;
  
  // the sloppy fields are owned here only when they are separate objects
  if (invParam.cuda_prec_sloppy != x[0]->Precision()) {
    for(i=0;i < num_offset;i++){
      delete x_sloppy[i];
    }
    delete r_sloppy;
  }
  // x_sloppy (the pointer array itself) was allocated with new[]
  delete [] x_sloppy;
  
  delete []finished;
  delete []zeta_i;
  delete []zeta_im1;
  delete []zeta_ip1;
  delete []beta_i;
  delete []beta_im1;
  delete []alpha;
 
}
Exemplo n.º 14
0
  void MultiShiftCG::operator()(cudaColorSpinorField **x, cudaColorSpinorField &b)
  {
    profile.Start(QUDA_PROFILE_INIT);

    int num_offset = param.num_offset;
    double *offset = param.offset;
 
    if (num_offset == 0) return;

    const double b2 = normCuda(b);
    // Check to see that we're not trying to invert on a zero-field source
    if(b2 == 0){
      profile.Stop(QUDA_PROFILE_INIT);
      printfQuda("Warning: inverting on zero-field source\n");
      for(int i=0; i<num_offset; ++i){
        *(x[i]) = b;
	param.true_res_offset[i] = 0.0;
	param.true_res_hq_offset[i] = 0.0;
      }
      return;
    }
    

    double *zeta = new double[num_offset];
    double *zeta_old = new double[num_offset];
    double *alpha = new double[num_offset];
    double *beta = new double[num_offset];
  
    int j_low = 0;   
    int num_offset_now = num_offset;
    for (int i=0; i<num_offset; i++) {
      zeta[i] = zeta_old[i] = 1.0;
      beta[i] = 0.0;
      alpha[i] = 1.0;
    }
  
    // flag whether we will be using reliable updates or not
    bool reliable = false;
    for (int j=0; j<num_offset; j++) 
      if (param.tol_offset[j] < param.delta) reliable = true;


    cudaColorSpinorField *r = new cudaColorSpinorField(b);
    cudaColorSpinorField **y = reliable ? new cudaColorSpinorField*[num_offset] : NULL;
  
    ColorSpinorParam csParam(b);
    csParam.create = QUDA_ZERO_FIELD_CREATE;

    if (reliable)
      for (int i=0; i<num_offset; i++) y[i] = new cudaColorSpinorField(*r, csParam);

    csParam.setPrecision(param.precision_sloppy);
  
    cudaColorSpinorField *r_sloppy;
    if (param.precision_sloppy == x[0]->Precision()) {
      r_sloppy = r;
    } else {
      csParam.create = QUDA_COPY_FIELD_CREATE;
      r_sloppy = new cudaColorSpinorField(*r, csParam);
    }
  
    cudaColorSpinorField **x_sloppy = new cudaColorSpinorField*[num_offset];
    if (param.precision_sloppy == x[0]->Precision() ||
	!param.use_sloppy_partial_accumulator) {
      for (int i=0; i<num_offset; i++) x_sloppy[i] = x[i];
    } else {
      csParam.create = QUDA_ZERO_FIELD_CREATE;
      for (int i=0; i<num_offset; i++)
	x_sloppy[i] = new cudaColorSpinorField(*x[i], csParam);
    }
  
    cudaColorSpinorField **p = new cudaColorSpinorField*[num_offset];  
    for (int i=0; i<num_offset; i++) p[i]= new cudaColorSpinorField(*r_sloppy);    
  
    csParam.create = QUDA_ZERO_FIELD_CREATE;
    cudaColorSpinorField* Ap = new cudaColorSpinorField(*r_sloppy, csParam);
  
    cudaColorSpinorField tmp1(*Ap, csParam);

    // tmp2 only needed for multi-gpu Wilson-like kernels
    cudaColorSpinorField *tmp2_p = !mat.isStaggered() ?
      new cudaColorSpinorField(*Ap, csParam) : &tmp1;
    cudaColorSpinorField &tmp2 = *tmp2_p;

    // additional high-precision temporary if Wilson and mixed-precision
    csParam.setPrecision(param.precision);
    cudaColorSpinorField *tmp3_p =
      (param.precision != param.precision_sloppy && !mat.isStaggered()) ?
      new cudaColorSpinorField(*r, csParam) : &tmp1;
    cudaColorSpinorField &tmp3 = *tmp3_p;

    profile.Stop(QUDA_PROFILE_INIT);
    profile.Start(QUDA_PROFILE_PREAMBLE);

    // stopping condition of each shift
    double stop[QUDA_MAX_MULTI_SHIFT];
    double r2[QUDA_MAX_MULTI_SHIFT];
    for (int i=0; i<num_offset; i++) {
      r2[i] = b2;
      stop[i] = Solver::stopping(param.tol_offset[i], b2, param.residual_type);
    }

    double r2_old;
    double pAp;

    double rNorm[QUDA_MAX_MULTI_SHIFT];
    double r0Norm[QUDA_MAX_MULTI_SHIFT];
    double maxrx[QUDA_MAX_MULTI_SHIFT];
    double maxrr[QUDA_MAX_MULTI_SHIFT];
    for (int i=0; i<num_offset; i++) {
      rNorm[i] = sqrt(r2[i]);
      r0Norm[i] = rNorm[i];
      maxrx[i] = rNorm[i];
      maxrr[i] = rNorm[i];
    }
    double delta = param.delta;

    // this parameter determines how many consective reliable update
    // reisudal increases we tolerate before terminating the solver,
    // i.e., how long do we want to keep trying to converge
    const int maxResIncrease =  param.max_res_increase; // check if we reached the limit of our tolerance
    const int maxResIncreaseTotal = param.max_res_increase_total;
    
    int resIncrease = 0;
    int resIncreaseTotal[QUDA_MAX_MULTI_SHIFT];
    for (int i=0; i<num_offset; i++) {
      resIncreaseTotal[i]=0;
    }

    int k = 0;
    int rUpdate = 0;
    quda::blas_flops = 0;

    profile.Stop(QUDA_PROFILE_PREAMBLE);
    profile.Start(QUDA_PROFILE_COMPUTE);

    if (getVerbosity() >= QUDA_VERBOSE) 
      printfQuda("MultiShift CG: %d iterations, <r,r> = %e, |r|/|b| = %e\n", k, r2[0], sqrt(r2[0]/b2));
    
    while (r2[0] > stop[0] &&  k < param.maxiter) {
      matSloppy(*Ap, *p[0], tmp1, tmp2);
      // FIXME - this should be curried into the Dirac operator
      if (r->Nspin()==4) axpyCuda(offset[0], *p[0], *Ap); 

      pAp = reDotProductCuda(*p[0], *Ap);

      // compute zeta and alpha
      updateAlphaZeta(alpha, zeta, zeta_old, r2, beta, pAp, offset, num_offset_now, j_low);
	
      r2_old = r2[0];
      Complex cg_norm = axpyCGNormCuda(-alpha[j_low], *Ap, *r_sloppy);
      r2[0] = real(cg_norm);
      double zn = imag(cg_norm);

      // reliable update conditions
      rNorm[0] = sqrt(r2[0]);
      for (int j=1; j<num_offset_now; j++) rNorm[j] = rNorm[0] * zeta[j];

      int updateX=0, updateR=0;
      int reliable_shift = -1; // this is the shift that sets the reliable_shift
      for (int j=num_offset_now-1; j>=0; j--) {
	if (rNorm[j] > maxrx[j]) maxrx[j] = rNorm[j];
	if (rNorm[j] > maxrr[j]) maxrr[j] = rNorm[j];
	updateX = (rNorm[j] < delta*r0Norm[j] && r0Norm[j] <= maxrx[j]) ? 1 : updateX;
	updateR = ((rNorm[j] < delta*maxrr[j] && r0Norm[j] <= maxrr[j]) || updateX) ? 1 : updateR;
	if ((updateX || updateR) && reliable_shift == -1) reliable_shift = j;
      }

      if ( !(updateR || updateX) || !reliable) {
	//beta[0] = r2[0] / r2_old;	
	beta[0] = zn / r2_old;
	// update p[0] and x[0]
	axpyZpbxCuda(alpha[0], *p[0], *x_sloppy[0], *r_sloppy, beta[0]);	

	for (int j=1; j<num_offset_now; j++) {
	  beta[j] = beta[j_low] * zeta[j] * alpha[j] / (zeta_old[j] * alpha[j_low]);
	  // update p[i] and x[i]
	  axpyBzpcxCuda(alpha[j], *p[j], *x_sloppy[j], zeta[j], *r_sloppy, beta[j]);
	}
      } else {
	for (int j=0; j<num_offset_now; j++) {
	  axpyCuda(alpha[j], *p[j], *x_sloppy[j]);
	  copyCuda(*x[j], *x_sloppy[j]);
	  xpyCuda(*x[j], *y[j]);
	}

	mat(*r, *y[0], *x[0], tmp3); // here we can use x as tmp
	if (r->Nspin()==4) axpyCuda(offset[0], *y[0], *r);

	r2[0] = xmyNormCuda(b, *r);
	for (int j=1; j<num_offset_now; j++) r2[j] = zeta[j] * zeta[j] * r2[0];
	for (int j=0; j<num_offset_now; j++) zeroCuda(*x_sloppy[j]);

	copyCuda(*r_sloppy, *r);            

	// break-out check if we have reached the limit of the precision

	if (sqrt(r2[reliable_shift]) > r0Norm[reliable_shift]) { // reuse r0Norm for this
    resIncrease++;
    resIncreaseTotal[reliable_shift]++;
	  warningQuda("MultiShiftCG: Shift %d, updated residual %e is greater than previous residual %e (total #inc %i)", 
		      reliable_shift, sqrt(r2[reliable_shift]), r0Norm[reliable_shift], resIncreaseTotal[reliable_shift]);


	  if (resIncrease > maxResIncrease or resIncreaseTotal[reliable_shift] > maxResIncreaseTotal) break; // check if we reached the limit of our tolerancebreak;
	} else {
	  resIncrease = 0;
	}

	// explicitly restore the orthogonality of the gradient vector
	for (int j=0; j<num_offset_now; j++) {
	  double rp = reDotProductCuda(*r_sloppy, *p[j]) / (r2[0]);
	  axpyCuda(-rp, *r_sloppy, *p[j]);
	}

	// update beta and p
	beta[0] = r2[0] / r2_old; 
	xpayCuda(*r_sloppy, beta[0], *p[0]);
	for (int j=1; j<num_offset_now; j++) {
	  beta[j] = beta[j_low] * zeta[j] * alpha[j] / (zeta_old[j] * alpha[j_low]);
	  axpbyCuda(zeta[j], *r_sloppy, beta[j], *p[j]);
	}    

	// update reliable update parameters for the system that triggered the update
	int m = reliable_shift;
	rNorm[m] = sqrt(r2[0]) * zeta[m];
	maxrr[m] = rNorm[m];
	maxrx[m] = rNorm[m];
	r0Norm[m] = rNorm[m];      
	rUpdate++;
      }    

      // now we can check if any of the shifts have converged and remove them
      for (int j=1; j<num_offset_now; j++) {
        if (zeta[j] == 0.0) {
          num_offset_now--;
          if (getVerbosity() >= QUDA_VERBOSE)
              printfQuda("MultiShift CG: Shift %d converged after %d iterations\n", j, k + 1);
        }
        else {
	r2[j] = zeta[j] * zeta[j] * r2[0];
	if (r2[j] < stop[j]) {
            num_offset_now--;
	  if (getVerbosity() >= QUDA_VERBOSE)
	    printfQuda("MultiShift CG: Shift %d converged after %d iterations\n", j, k+1);
          }
	}
      }

      k++;
      
      if (getVerbosity() >= QUDA_VERBOSE) 
	printfQuda("MultiShift CG: %d iterations, <r,r> = %e, |r|/|b| = %e\n", k, r2[0], sqrt(r2[0]/b2));
    }
    
    
    for (int i=0; i<num_offset; i++) {
      copyCuda(*x[i], *x_sloppy[i]);
      if (reliable) xpyCuda(*y[i], *x[i]);
    }

    profile.Stop(QUDA_PROFILE_COMPUTE);
    profile.Start(QUDA_PROFILE_EPILOGUE);

    if (getVerbosity() >= QUDA_VERBOSE)
      printfQuda("MultiShift CG: Reliable updates = %d\n", rUpdate);

    if (k==param.maxiter) warningQuda("Exceeded maximum iterations %d\n", param.maxiter);
    
    param.secs = profile.Last(QUDA_PROFILE_COMPUTE);
    double gflops = (quda::blas_flops + mat.flops() + matSloppy.flops())*1e-9;
    reduceDouble(gflops);
    param.gflops = gflops;
    param.iter += k;

    for(int i=0; i < num_offset; i++) { 
      mat(*r, *x[i]); 
      if (r->Nspin()==4) {
	axpyCuda(offset[i], *x[i], *r); // Offset it.
      } else if (i!=0) {
	axpyCuda(offset[i]-offset[0], *x[i], *r); // Offset it.
      }
      double true_res = xmyNormCuda(b, *r);
      param.true_res_offset[i] = sqrt(true_res/b2);
#if (__COMPUTE_CAPABILITY__ >= 200)
      param.true_res_hq_offset[i] = sqrt(HeavyQuarkResidualNormCuda(*x[i], *r).z);
#else
      param.true_res_hq_offset[i] = 0.0;
#endif   
    }

    if (getVerbosity() >= QUDA_SUMMARIZE){
      printfQuda("MultiShift CG: Converged after %d iterations\n", k);
      for(int i=0; i < num_offset; i++) { 
	printfQuda(" shift=%d, relative residual: iterated = %e, true = %e\n", 
		   i, sqrt(r2[i]/b2), param.true_res_offset[i]);
      }
    }      
  
    // reset the flops counters
    quda::blas_flops = 0;
    mat.flops();
    matSloppy.flops();

    profile.Stop(QUDA_PROFILE_EPILOGUE);
    profile.Start(QUDA_PROFILE_FREE);

    if (&tmp3 != &tmp1) delete tmp3_p;
    if (&tmp2 != &tmp1) delete tmp2_p;

    if (r_sloppy->Precision() != r->Precision()) delete r_sloppy;
    for (int i=0; i<num_offset; i++) 
       if (x_sloppy[i]->Precision() != x[i]->Precision()) delete x_sloppy[i];
    delete []x_sloppy;
  
    delete r;
    for (int i=0; i<num_offset; i++) delete p[i];
    delete []p;

    if (reliable) {
      for (int i=0; i<num_offset; i++) delete y[i];
      delete []y;
    }

    delete Ap;
  
    delete []zeta_old;
    delete []zeta;
    delete []alpha;
    delete []beta;

    profile.Stop(QUDA_PROFILE_FREE);

    return;
  }
Exemplo n.º 15
0
  /**
   * Conjugate-gradient solve of mat * x = b using reliable updates.
   *
   * The iteration runs on "sloppy" fields created with
   * param.precision_sloppy and applies matSloppy.  Whenever the reliable
   * update conditions fire, the partial solution is added into the
   * accumulator y and the residual is recomputed with mat.
   *
   * On return param.secs, param.gflops, param.iter (incremented by the
   * number of iterations), param.true_res and param.true_res_hq are set.
   *
   * @param x On entry the initial guess (mat is applied to it to form the
   *          starting residual); on exit the computed solution.
   * @param b Source field.  If norm2(b) is zero, x is set to b, the true
   *          residuals are set to zero and the routine returns at once.
   */
  void CG::operator()(cudaColorSpinorField &x, cudaColorSpinorField &b) 
  {
    profile.Start(QUDA_PROFILE_INIT);

    // Check to see that we're not trying to invert on a zero-field source    
    const double b2 = norm2(b);
    if (b2 == 0) {
      profile.Stop(QUDA_PROFILE_INIT);
      printfQuda("Warning: inverting on zero-field source\n");
      x=b;
      param.true_res = 0.0;
      param.true_res_hq = 0.0;
      return;
    }

    cudaColorSpinorField r(b);

    ColorSpinorParam csParam(x);
    csParam.create = QUDA_ZERO_FIELD_CREATE;
    cudaColorSpinorField y(b, csParam); 
  
    // initial residual from the initial guess; y only serves as a temporary here
    mat(r, x, y);

    double r2 = xmyNormCuda(b, r);
  
    csParam.setPrecision(param.precision_sloppy);
    cudaColorSpinorField Ap(x, csParam);
    cudaColorSpinorField tmp(x, csParam);

    cudaColorSpinorField *tmp2_p = &tmp;
    // tmp only needed for multi-gpu Wilson-like kernels
    if (mat.Type() != typeid(DiracStaggeredPC).name() && 
        mat.Type() != typeid(DiracStaggered).name()) {
      tmp2_p = new cudaColorSpinorField(x, csParam);
    }
    cudaColorSpinorField &tmp2 = *tmp2_p;

    // when the sloppy precision equals the precision of x the sloppy
    // fields simply alias x and r; otherwise separate copies are created
    cudaColorSpinorField *x_sloppy, *r_sloppy;
    if (param.precision_sloppy == x.Precision()) {
      csParam.create = QUDA_REFERENCE_FIELD_CREATE;
      x_sloppy = &x;
      r_sloppy = &r;
    } else {
      csParam.create = QUDA_COPY_FIELD_CREATE;
      x_sloppy = new cudaColorSpinorField(x, csParam);
      r_sloppy = new cudaColorSpinorField(r, csParam);
    }

    cudaColorSpinorField &xSloppy = *x_sloppy;
    cudaColorSpinorField &rSloppy = *r_sloppy;
    cudaColorSpinorField p(rSloppy);

    if (&x != &xSloppy) {
      copyCuda(y,x);
      zeroCuda(xSloppy);
    } else {
      zeroCuda(y);
    }
    
    const bool use_heavy_quark_res = 
      (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) ? true : false;
    
    profile.Stop(QUDA_PROFILE_INIT);
    profile.Start(QUDA_PROFILE_PREAMBLE);

    double r2_old;
    double stop = b2*param.tol*param.tol; // stopping condition of solver

    double heavy_quark_res = 0.0; // heavy quark residual
    if (use_heavy_quark_res) heavy_quark_res = sqrt(HeavyQuarkResidualNormCuda(x,r).z);
    int heavy_quark_check = 10; // how often to check the heavy quark residual

    double alpha=0.0, beta=0.0;
    double pAp;
    int rUpdate = 0;

    double rNorm = sqrt(r2);
    double r0Norm = rNorm;
    double maxrx = rNorm;
    double maxrr = rNorm;
    double delta = param.delta;

    // this parameter determines how many consecutive reliable update
    // residual increases we tolerate before terminating the solver,
    // i.e., how long do we want to keep trying to converge
    const int maxResIncrease = 0; // 0 means we have no tolerance 

    // number of consecutive reliable updates that increased the residual;
    // deliberately a plain local so that every solve starts counting from zero
    int resIncrease = 0;

    profile.Stop(QUDA_PROFILE_PREAMBLE);
    profile.Start(QUDA_PROFILE_COMPUTE);
    blas_flops = 0;

    int k=0;
    
    PrintStats("CG", k, r2, b2, heavy_quark_res);

    int steps_since_reliable = 1;

    while ( !convergence(r2, heavy_quark_res, stop, param.tol_hq) && 
            k < param.maxiter) {
      matSloppy(Ap, p, tmp, tmp2); // tmp as tmp
    
      double sigma;

      bool breakdown = false;

      if (param.pipeline) {
        double3 triplet = tripleCGReductionCuda(rSloppy, Ap, p);
        r2 = triplet.x; double Ap2 = triplet.y; pAp = triplet.z;
        r2_old = r2;

        alpha = r2 / pAp;        
        sigma = alpha*(alpha * Ap2 - pAp);
        if (sigma < 0.0 || steps_since_reliable==0) { // sigma condition has broken down
          r2 = axpyNormCuda(-alpha, Ap, rSloppy);
          sigma = r2;
          breakdown = true;
        }

        r2 = sigma;
      } else {
        r2_old = r2;
        pAp = reDotProductCuda(p, Ap);
        alpha = r2 / pAp;        

        // here we are deploying the alternative beta computation 
        Complex cg_norm = axpyCGNormCuda(-alpha, Ap, rSloppy);
        r2 = real(cg_norm); // (r_new, r_new)
        sigma = imag(cg_norm) >= 0.0 ? imag(cg_norm) : r2; // use r2 if (r_k+1, r_k+1-r_k) breaks
      }

      // reliable update conditions
      rNorm = sqrt(r2);
      if (rNorm > maxrx) maxrx = rNorm;
      if (rNorm > maxrr) maxrr = rNorm;
      int updateX = (rNorm < delta*r0Norm && r0Norm <= maxrx) ? 1 : 0;
      int updateR = ((rNorm < delta*maxrr && r0Norm <= maxrr) || updateX) ? 1 : 0;
    
      // force a reliable update if we are within target tolerance (only if doing reliable updates)
      if ( convergence(r2, heavy_quark_res, stop, param.tol_hq) && delta >= param.tol) updateX = 1;

      if ( !(updateR || updateX)) {
        // ordinary CG step
        beta = sigma / r2_old; // use the alternative beta computation

        if (param.pipeline && !breakdown) tripleCGUpdateCuda(alpha, beta, Ap, rSloppy, xSloppy, p);
        else axpyZpbxCuda(alpha, p, xSloppy, rSloppy, beta);

        if (use_heavy_quark_res && k%heavy_quark_check==0) { 
          copyCuda(tmp,y);
          heavy_quark_res = sqrt(xpyHeavyQuarkResidualNormCuda(xSloppy, tmp, rSloppy).z);
        }

        steps_since_reliable++;
      } else {
        // reliable update: fold the sloppy partial solution into y and
        // recompute the residual with the full operator
        axpyCuda(alpha, p, xSloppy);
        if (x.Precision() != xSloppy.Precision()) copyCuda(x, xSloppy);
      
        xpyCuda(x, y); // swap these around?
        mat(r, y, x); // here we can use x as tmp
        r2 = xmyNormCuda(b, r);

        if (x.Precision() != rSloppy.Precision()) copyCuda(rSloppy, r);            
        zeroCuda(xSloppy);

        // break-out check if we have reached the limit of the precision
        if (sqrt(r2) > r0Norm && updateX) { // reuse r0Norm for this
          warningQuda("CG: new reliable residual norm %e is greater than previous reliable residual norm %e", sqrt(r2), r0Norm);
          if (++resIncrease > maxResIncrease) {
            // breaking skips the bookkeeping at the end of the loop body, so
            // count the iteration and the reliable update just performed here
            k++;
            rUpdate++;
            break;
          }
        } else {
          resIncrease = 0;
        }

        rNorm = sqrt(r2);
        maxrr = rNorm;
        maxrx = rNorm;
        r0Norm = rNorm;      
        rUpdate++;

        // explicitly restore the orthogonality of the gradient vector
        double rp = reDotProductCuda(rSloppy, p) / (r2);
        axpyCuda(-rp, rSloppy, p);

        beta = r2 / r2_old; 
        xpayCuda(rSloppy, beta, p);

        if (use_heavy_quark_res) heavy_quark_res = sqrt(HeavyQuarkResidualNormCuda(y,r).z);
        
        steps_since_reliable = 0;
      }

      k++;

      PrintStats("CG", k, r2, b2, heavy_quark_res);
    }

    // assemble the final solution from the sloppy part and the accumulator y
    if (x.Precision() != xSloppy.Precision()) copyCuda(x, xSloppy);
    xpyCuda(y, x);

    profile.Stop(QUDA_PROFILE_COMPUTE);
    profile.Start(QUDA_PROFILE_EPILOGUE);

    param.secs = profile.Last(QUDA_PROFILE_COMPUTE);
    double gflops = (quda::blas_flops + mat.flops() + matSloppy.flops())*1e-9;
    reduceDouble(gflops);
    param.gflops = gflops;
    param.iter += k;

    if (k==param.maxiter) 
      warningQuda("Exceeded maximum iterations %d", param.maxiter);

    if (getVerbosity() >= QUDA_VERBOSE)
      printfQuda("CG: Reliable updates = %d\n", rUpdate);

    // compute the true residuals
    mat(r, x, y);
    param.true_res = sqrt(xmyNormCuda(b, r) / b2);
#if (__COMPUTE_CAPABILITY__ >= 200)
    param.true_res_hq = sqrt(HeavyQuarkResidualNormCuda(x,r).z);
#else
    param.true_res_hq = 0.0;
#endif      

    PrintSummary("CG", k, r2, b2);

    // reset the flops counters
    quda::blas_flops = 0;
    mat.flops();
    matSloppy.flops();

    profile.Stop(QUDA_PROFILE_EPILOGUE);
    profile.Start(QUDA_PROFILE_FREE);

    // release only what was allocated above; aliases of x, r and tmp are not owned
    if (&tmp2 != &tmp) delete tmp2_p;

    if (param.precision_sloppy != x.Precision()) {
      delete r_sloppy;
      delete x_sloppy;
    }

    profile.Stop(QUDA_PROFILE_FREE);

    return;
  }
Exemplo n.º 16
0
  /**
   * Preconditioned conjugate-gradient solve of mat * x = b using reliable
   * updates.
   *
   * If a preconditioner K is set, it is applied to the residual on fields
   * created with param.precision_precondition, with globalReduce switched
   * off around each call.  Without K the routine performs plain CG.  The
   * iteration itself uses matSloppy on fields created with
   * param.precision_sloppy; on a reliable update the partial solution is
   * added into the accumulator y and the residual is recomputed with mat.
   *
   * On return param.secs, param.gflops, param.iter (incremented by the
   * number of iterations) and param.true_res are set.
   *
   * @param x On entry the initial guess (mat is applied to it to form the
   *          starting residual); on exit the computed solution.
   * @param b Source field.  If norm2(b) is zero, x is set to b, the true
   *          residuals are set to zero and the routine returns at once.
   */
  void PreconCG::operator()(cudaColorSpinorField &x, cudaColorSpinorField &b)
  {

    profile.Start(QUDA_PROFILE_INIT);
    // Check to see that we're not trying to invert on a zero-field source
    const double b2 = norm2(b);
    if(b2 == 0){
      profile.Stop(QUDA_PROFILE_INIT);
      printfQuda("Warning: inverting on zero-field source\n");
      x=b;
      param.true_res = 0.0;
      param.true_res_hq = 0.0;
      // nothing to solve; continuing would also divide by b2 == 0 below
      return;
    }

    int k=0;
    int rUpdate=0;

    // zero-initialised so that the pointers that are only set when K is
    // present are never read or copied while indeterminate
    cudaColorSpinorField* minvrPre = 0;
    cudaColorSpinorField* rPre = 0;
    cudaColorSpinorField* minvr = 0;
    cudaColorSpinorField* minvrSloppy = 0;
    cudaColorSpinorField* p = 0;


    ColorSpinorParam csParam(b);
    cudaColorSpinorField r(b);
    if(K) minvr = new cudaColorSpinorField(b);
    csParam.create = QUDA_ZERO_FIELD_CREATE;
    cudaColorSpinorField y(b,csParam);

    mat(r, x, y); // => r = A*x;
    double r2 = xmyNormCuda(b,r);

    csParam.setPrecision(param.precision_sloppy);
    cudaColorSpinorField tmpSloppy(x,csParam);
    cudaColorSpinorField Ap(x,csParam);

    // r_sloppy / minvrSloppy alias the full-precision fields when the
    // sloppy precision equals the precision of x
    cudaColorSpinorField *r_sloppy;
    if(param.precision_sloppy == x.Precision())
    {
      r_sloppy = &r;
      minvrSloppy = minvr;
    }else{
      csParam.create = QUDA_COPY_FIELD_CREATE;
      r_sloppy = new cudaColorSpinorField(r,csParam);
      if(K) minvrSloppy = new cudaColorSpinorField(*minvr,csParam);
    }
  

    // x_sloppy aliases x either when the precisions agree or when the
    // sloppy partial accumulator is disabled
    cudaColorSpinorField *x_sloppy;
    if(param.precision_sloppy == x.Precision() ||
        !param.use_sloppy_partial_accumulator) {
      csParam.create = QUDA_REFERENCE_FIELD_CREATE;
      x_sloppy = &x;
    }else{
      csParam.create = QUDA_COPY_FIELD_CREATE;
      x_sloppy = new cudaColorSpinorField(x,csParam);
    }


    cudaColorSpinorField &xSloppy = *x_sloppy;
    cudaColorSpinorField &rSloppy = *r_sloppy;

    if(&x != &xSloppy){
      copyCuda(y, x); // copy x to y
      zeroCuda(xSloppy);
    }else{
      zeroCuda(y); // no reliable updates // NB: check this
    }

    const bool use_heavy_quark_res = (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) ? true : false;

    if(K){
      csParam.create = QUDA_COPY_FIELD_CREATE;
      csParam.setPrecision(param.precision_precondition);
      rPre = new cudaColorSpinorField(rSloppy,csParam);
      // Create minvrPre 
      minvrPre = new cudaColorSpinorField(*rPre);
      globalReduce = false;
      (*K)(*minvrPre, *rPre);  
      globalReduce = true;
      *minvrSloppy = *minvrPre;
      p = new cudaColorSpinorField(*minvrSloppy);
    }else{
      p = new cudaColorSpinorField(rSloppy);
    }

  
    profile.Stop(QUDA_PROFILE_INIT);


    profile.Start(QUDA_PROFILE_PREAMBLE);



    double stop = stopping(param.tol, b2, param.residual_type); // stopping condition of solver
    double heavy_quark_res = 0.0; // heavy quark residual 
    if(use_heavy_quark_res) heavy_quark_res = sqrt(HeavyQuarkResidualNormCuda(x,r).z);


    double alpha = 0.0, beta=0.0;
    double pAp;
    double rMinvr  = 0;
    double rMinvr_old = 0.0;
    double r_new_Minvr_old = 0.0;
    double r2_old = 0;
    r2 = norm2(r);

    double rNorm = sqrt(r2);
    double r0Norm = rNorm;
    double maxrx = rNorm;
    double maxrr = rNorm;
    double delta = param.delta;


    if(K) rMinvr = reDotProductCuda(rSloppy,*minvrSloppy);

    profile.Stop(QUDA_PROFILE_PREAMBLE);
    profile.Start(QUDA_PROFILE_COMPUTE);


    quda::blas_flops = 0;

    // number of consecutive residual increases tolerated on reliable
    // updates before the solver gives up; 0 means no tolerance
    const int maxResIncrease = 0;

    // number of consecutive reliable updates that increased the residual;
    // deliberately a plain local so that every solve starts counting from zero
    int resIncrease = 0;

    while(!convergence(r2, heavy_quark_res, stop, param.tol_hq) && k < param.maxiter){

      matSloppy(Ap, *p, tmpSloppy);

      double sigma;
      pAp   = reDotProductCuda(*p,Ap);

      alpha = (K) ? rMinvr/pAp : r2/pAp;
      Complex cg_norm = axpyCGNormCuda(-alpha, Ap, rSloppy); 
      // r --> r - alpha*A*p
      r2_old = r2;
      r2 = real(cg_norm);
  
      sigma = imag(cg_norm) >= 0.0 ? imag(cg_norm) : r2; // use r2 if (r_k+1, r_k-1 - r_k) breaks

      if(K) rMinvr_old = rMinvr;

      // reliable update conditions
      rNorm = sqrt(r2);
      if(rNorm > maxrx) maxrx = rNorm;
      if(rNorm > maxrr) maxrr = rNorm;


      int updateX = (rNorm < delta*r0Norm && r0Norm <= maxrx) ? 1 : 0;
      int updateR = ((rNorm < delta*maxrr && r0Norm <= maxrr) || updateX) ? 1 : 0;

  
      // force a reliable update if we are within target tolerance (only if doing reliable updates)
      if( convergence(r2, heavy_quark_res, stop, param.tol_hq) && delta >= param.tol) updateX = 1;
    

      if( !(updateR || updateX) ){

        if(K){
          r_new_Minvr_old = reDotProductCuda(rSloppy,*minvrSloppy);
          *rPre = rSloppy;
          globalReduce = false;
          (*K)(*minvrPre, *rPre);
          globalReduce = true;
      

          *minvrSloppy = *minvrPre;

          rMinvr = reDotProductCuda(rSloppy,*minvrSloppy);
          beta = (rMinvr - r_new_Minvr_old)/rMinvr_old; 
          axpyZpbxCuda(alpha, *p, xSloppy, *minvrSloppy, beta);
        }else{
          beta = sigma/r2_old; // use the alternative beta computation
          axpyZpbxCuda(alpha, *p, xSloppy, rSloppy, beta);
        }
      } else { // reliable update

        axpyCuda(alpha, *p, xSloppy); // xSloppy += alpha*p
        copyCuda(x, xSloppy);
        xpyCuda(x, y); // y += x
        // Now compute r 
        mat(r, y, x); // x is just a temporary here
        r2 = xmyNormCuda(b, r);
        copyCuda(rSloppy, r); // copy r to rSloppy
        zeroCuda(xSloppy);


        // break-out check if we have reached the limit of the precision
        if(sqrt(r2) > r0Norm && updateX) { // reuse r0Norm for this 
          warningQuda("PCG: new reliable residual norm %e is greater than previous reliable residual norm %e", sqrt(r2), r0Norm);

          if(++resIncrease > maxResIncrease) {
            // breaking skips the bookkeeping at the end of the loop body, so
            // count the iteration and the reliable update just performed here
            k++;
            rUpdate++;
            break;
          }
        }else{
          resIncrease = 0;
        }

        rNorm = sqrt(r2);
        maxrr = rNorm;
        maxrx = rNorm;
        r0Norm = rNorm;
        ++rUpdate;

        if(K){
          *rPre = rSloppy;
          globalReduce = false;
          (*K)(*minvrPre, *rPre);
          globalReduce = true;

          *minvrSloppy = *minvrPre;

          rMinvr = reDotProductCuda(rSloppy,*minvrSloppy);
          beta = rMinvr/rMinvr_old;        

          xpayCuda(*minvrSloppy, beta, *p); // p = minvrSloppy + beta*p
        }else{ // standard CG - no preconditioning

          // explicitly restore the orthogonality of the gradient vector
          double rp = reDotProductCuda(rSloppy, *p)/(r2);
          axpyCuda(-rp, rSloppy, *p);

          beta = r2/r2_old;
          xpayCuda(rSloppy, beta, *p);
        }
      }      
      ++k;
      PrintStats("PCG", k, r2, b2, heavy_quark_res);
    }


    profile.Stop(QUDA_PROFILE_COMPUTE);

    profile.Start(QUDA_PROFILE_EPILOGUE);

    // copy back only when a separate sloppy solution field exists; when
    // xSloppy aliases x there is nothing to copy
    if(&x != &xSloppy) copyCuda(x, xSloppy);
    xpyCuda(y, x); // x += y


    param.secs = profile.Last(QUDA_PROFILE_COMPUTE);
    double gflops = (quda::blas_flops + mat.flops() + matSloppy.flops() + matPrecon.flops())*1e-9;
    reduceDouble(gflops);
    param.gflops = gflops;
    param.iter += k;

    if (k==param.maxiter)
      warningQuda("Exceeded maximum iterations %d", param.maxiter);

    if (getVerbosity() >= QUDA_VERBOSE)
      printfQuda("CG: Reliable updates = %d\n", rUpdate);





    // compute the true residual 
    mat(r, x, y);
    double true_res = xmyNormCuda(b, r);
    param.true_res = sqrt(true_res / b2);

    // reset the flops counters
    quda::blas_flops = 0;
    mat.flops();
    matSloppy.flops();
    matPrecon.flops();

    profile.Stop(QUDA_PROFILE_EPILOGUE);
    profile.Start(QUDA_PROFILE_FREE);

    // Free only what this routine allocated.  Ownership is decided by
    // pointer identity: a sloppy pointer that aliases x, r or minvr must
    // not be deleted through the alias.
    if(K){ // These are only needed if preconditioning is used
      delete minvrPre;
      delete rPre;
      if(minvrSloppy != minvr) delete minvrSloppy;
      delete minvr;
    }
    delete p;

    if(x_sloppy != &x) delete x_sloppy;
    if(r_sloppy != &r) delete r_sloppy;

    profile.Stop(QUDA_PROFILE_FREE);
    return;
  }