Ejemplo n.º 1
0
  void DeflatedSolver::PrintSummary(const char *name, int k, const double &r2, const double &b2) {
    if (getVerbosity() >= QUDA_SUMMARIZE) {
      if (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) {
	printfQuda("%s: Convergence at %d iterations, L2 relative residual: iterated = %e, true = %e, heavy-quark residual = %e\n", name, k, sqrt(r2/b2), param.true_res, param.true_res_hq);    
      } else {
	printfQuda("%s: Convergence at %d iterations, L2 relative residual: iterated = %e, true = %e\n", 
		   name, k, sqrt(r2/b2), param.true_res);
      }

    }
  }
Ejemplo n.º 2
0
void
display_test_info()
{
  printfQuda("running the following test:\n");
    
  printfQuda("prec    sloppy_prec    link_recon  sloppy_link_recon test_type  S_dimension T_dimension\n");
  printfQuda("%s   %s             %s            %s            %s         %d          %d \n",
	 get_prec_str(prec),get_prec_str(prec_sloppy),
	 get_recon_str(link_recon), 
	 get_recon_str(link_recon_sloppy), get_test_type(testtype), sdim, tdim);     
  return ;
  
}
Ejemplo n.º 3
0
  void DeflatedSolver::PrintStats(const char* name, int k, const double &r2, 
			  const double &b2, const double &hq2) {
    if (getVerbosity() >= QUDA_VERBOSE) {
      if (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) {
	printfQuda("%s: %d iterations, <r,r> = %e, |r|/|b| = %e, heavy-quark residual = %e\n", 
		   name, k, r2, sqrt(r2/b2), hq2);
      } else {
	printfQuda("%s: %d iterations, <r,r> = %e, |r|/|b| = %e\n", 
		   name, k, r2, sqrt(r2/b2));
      }
    }

    if (std::isnan(r2)) errorQuda("Solver appears to have diverged");
  }
Ejemplo n.º 4
0
int main(int argc, char **argv)
{
  init();

  float spinorGiB = (float)Vh*Ls*spinorSiteSize*sizeof(inv_param.cpu_prec) / (1 << 30);
  printf("\nSpinor mem: %.3f GiB\n", spinorGiB);
  printf("Gauge mem: %.3f GiB\n", gauge_param.gaugeGiB);
  
  int attempts = 1;
  dslashRef();

  for (int i=0; i<attempts; i++) {
    
    if (tune) { // warm-up run
      printfQuda("Tuning...\n");
      setDslashTuning(QUDA_TUNE_YES, QUDA_VERBOSE);      
      dslashCUDA(1);
    }

    double secs = dslashCUDA();

    if (!transfer) *spinorOut = *cudaSpinorOut;

    // print timing information
    printf("%fms per loop\n", 1000*secs);
    
    unsigned long long flops = 0;
    if (!transfer) flops = dirac->Flops();

    int spinor_floats = test_type ? 2*(9*24+24)+24 : 9*24+24;
    if (inv_param.cuda_prec == QUDA_HALF_PRECISION) 
      spinor_floats += test_type ? 2*(9*2 + 2) + 2 : 9*2 + 2; // relative size of norm is twice a short
    int gauge_floats = (test_type ? 2 : 1) * (gauge_param.gauge_fix ? 6 : 8) * gauge_param.reconstruct;

    printfQuda("GFLOPS = %f\n", 1.0e-9*flops/secs);
    printfQuda("GB/s = %f\n\n", 
	       (float)Vh*Ls*(spinor_floats+gauge_floats)*inv_param.cuda_prec/((secs/loops)*1e+9));


    if (!transfer) {
      std::cout << "Results: CPU = " << norm2(*spinorRef) << ", CUDA = " << norm2(*cudaSpinorOut) << 
	", CPU-CUDA = " << norm2(*spinorOut) << std::endl;
    } else {
      std::cout << "Result: CPU = " << norm2(*spinorRef) << ", CPU-CUDA = " << norm2(*spinorOut) << std::endl;
    }
    
    cpuColorSpinorField::Compare(*spinorRef, *spinorOut);
  }    
  end();
}
Ejemplo n.º 5
0
  // Resets the attributes of this field if param disagrees (and is defined)
  void ColorSpinorField::reset(const ColorSpinorParam &param) {

    if (param.nColor != 0) nColor = param.nColor;
    if (param.nSpin != 0) nSpin = param.nSpin;
    if (param.twistFlavor != QUDA_TWIST_INVALID) twistFlavor = param.twistFlavor;

    if (param.precision != QUDA_INVALID_PRECISION)  precision = param.precision;
    if (param.nDim != 0) nDim = param.nDim;

    volume = 1;
    for (int d=0; d<nDim; d++) {
      if (param.x[0] != 0) x[d] = param.x[d];
      volume *= x[d];
    }
  
    if (param.pad != 0) pad = param.pad;

    if (param.siteSubset == QUDA_FULL_SITE_SUBSET){
      stride = volume/2 + pad;
      length = 2*stride*nColor*nSpin*2;
    } else if (param.siteSubset == QUDA_PARITY_SITE_SUBSET){
      stride = volume + pad;
      length = stride*nColor*nSpin*2;  
    } else {
      //errorQuda("SiteSubset not defined %d", param.siteSubset);
      //do nothing, not an error (can't remember why - need to document this sometime! )
    }

    if (param.siteSubset != QUDA_INVALID_SITE_SUBSET) siteSubset = param.siteSubset;
    if (param.siteOrder != QUDA_INVALID_SITE_ORDER) siteOrder = param.siteOrder;
    if (param.fieldOrder != QUDA_INVALID_FIELD_ORDER) fieldOrder = param.fieldOrder;
    if (param.gammaBasis != QUDA_INVALID_GAMMA_BASIS) gammaBasis = param.gammaBasis;

    createGhostZone();

    real_length = volume*nColor*nSpin*2;

    bytes = total_length * precision;
    bytes = ALIGNMENT_ADJUST(bytes);
    norm_bytes = total_norm_length * sizeof(float);
    norm_bytes = ALIGNMENT_ADJUST(norm_bytes);
    if (!init) errorQuda("Shouldn't be resetting a non-inited field\n");

    if (verbose >= QUDA_DEBUG_VERBOSE) {
      printfQuda("\nPrinting out reset field\n");
      std::cout << *this << std::endl;
      printfQuda("\n");
    }
  }
Ejemplo n.º 6
0
int main(int argc, char **argv) 
{

  int i;
  for (i =1;i < argc; i++){
    
    if(process_command_line_option(argc, argv, &i) == 0){
      continue;
    }    
    
    fprintf(stderr, "ERROR: Invalid option:%s\n", argv[i]);
    usage(argv);
  }
  
  initCommsQuda(argc, argv, gridsize_from_cmdline, 4);
  
  display_test_info();

  int ret =1;
  int accuracy_level = dslashTest();

  printfQuda("accuracy_level =%d\n", accuracy_level);

  if (accuracy_level >= 1) ret = 0;    //probably no error, -1 means no matching  
  endCommsQuda();
  return ret;
}
Ejemplo n.º 7
0
void staggeredDslashRef()
{
#ifndef MULTI_GPU
  int cpu_parity = 0;
#endif

  // compare to dslash reference implementation
  printfQuda("Calculating reference implementation...");
  fflush(stdout);
  switch (test_type) {
  case 0:    
#ifdef MULTI_GPU

    staggered_dslash_mg4dir(spinorRef, fatlink, longlink, (void**)ghost_fatlink, (void**)ghost_longlink, 
			    spinor, parity, dagger, inv_param.cpu_prec, gaugeParam.cpu_prec);
#else
    cpu_parity = 0; //EVEN
    staggered_dslash(spinorRef->V(), fatlink, longlink, spinor->V(), cpu_parity, dagger, 
		     inv_param.cpu_prec, gaugeParam.cpu_prec);
    
#endif    


    break;
  case 1: 
#ifdef MULTI_GPU
    staggered_dslash_mg4dir(spinorRef, fatlink, longlink, (void**)ghost_fatlink, (void**)ghost_longlink, 
			    spinor, parity, dagger, inv_param.cpu_prec, gaugeParam.cpu_prec);    
    
#else
    cpu_parity=1; //ODD
    staggered_dslash(spinorRef->V(), fatlink, longlink, spinor->V(), cpu_parity, dagger, 
		     inv_param.cpu_prec, gaugeParam.cpu_prec);
#endif
    break;
  case 2:
    //mat(spinorRef->V(), fatlink, longlink, spinor->V(), kappa, dagger, 
    //inv_param.cpu_prec, gaugeParam.cpu_prec);
    break;
  default:
    errorQuda("Test type not defined");
  }
    
  printfQuda("done.\n");
    
}
Ejemplo n.º 8
0
static int dslashTest() 
{
  int accuracy_level = 0;
  
  init();
    
  int attempts = 1;
    
  for (int i=0; i<attempts; i++) {

    if (tune) { // warm-up run
      printfQuda("Tuning...\n");
      setDslashTuning(QUDA_TUNE_YES, QUDA_VERBOSE);      
      dslashCUDA(1);
    }
    printfQuda("Executing %d kernel loops...", loops);	
    double secs = dslashCUDA(loops);

#ifdef DSLASH_PROFILING
    printDslashProfile();
#endif
    
    if (!transfer) *spinorOut = *cudaSpinorOut;
      
    printfQuda("\n%fms per loop\n", 1000*secs);
    staggeredDslashRef();
	
    unsigned long long flops = dirac->Flops();
    int link_floats = 8*gaugeParam.reconstruct+8*18;
    int spinor_floats = 8*6*2 + 6;
    int link_float_size = prec;
    int spinor_float_size = 0;
    
    link_floats = test_type ? (2*link_floats) : link_floats;
    spinor_floats = test_type ? (2*spinor_floats) : spinor_floats;

    int bytes_for_one_site = link_floats * link_float_size + spinor_floats * spinor_float_size;
    if (prec == QUDA_HALF_PRECISION) bytes_for_one_site += (8*2 + 1)*4;	

    printfQuda("GFLOPS = %f\n", 1.0e-9*flops/secs);
    printfQuda("GB/s = %f\n\n", 1.0*Vh*bytes_for_one_site/((secs/loops)*1e+9));
	
    if (!transfer) {
      double spinor_ref_norm2 = norm2(*spinorRef);
      double cuda_spinor_out_norm2 =  norm2(*cudaSpinorOut);
      double spinor_out_norm2 =  norm2(*spinorOut);
      printfQuda("Results: CPU=%f, CUDA=%f, CPU-CUDA=%f\n",  spinor_ref_norm2, cuda_spinor_out_norm2,
		 spinor_out_norm2);
    } else {
      double spinor_ref_norm2 = norm2(*spinorRef);
      double spinor_out_norm2 =  norm2(*spinorOut);
      printfQuda("Result: CPU=%f , CPU-CUDA=%f", spinor_ref_norm2, spinor_out_norm2);
    }
    
    accuracy_level = cpuColorSpinorField::Compare(*spinorRef, *spinorOut);	
  }
  end();
  
  return accuracy_level;
}
Ejemplo n.º 9
0
void display_test_info()
{
  printfQuda("running the following test:\n");
 
  printfQuda("prec recon   test_type     dagger   S_dim         T_dimension\n");
  printfQuda("%s   %s       %d           %d       %d/%d/%d        %d \n", 
	     get_prec_str(prec), get_recon_str(link_recon), 
	     test_type, dagger, xdim, ydim, zdim, tdim);
  printfQuda("Grid partition info:     X  Y  Z  T\n"); 
  printfQuda("                         %d  %d  %d  %d\n", 
	     commDimPartitioned(0),
	     commDimPartitioned(1),
	     commDimPartitioned(2),
	     commDimPartitioned(3));

  return ;
    
}
Ejemplo n.º 10
0
  /*
   * Read tunecache from disk.
   */
  void loadTuneCache(QudaVerbosity verbosity)
  {
    char *path;
    struct stat pstat;
    std::string cache_path, line, token;
    std::ifstream cache_file;
    std::stringstream ls;

    path = getenv("QUDA_RESOURCE_PATH");
    if (!path) {
      warningQuda("Environment variable QUDA_RESOURCE_PATH is not set.");
      warningQuda("Caching of tuned parameters will be disabled.");
      return;
    } else if (stat(path, &pstat) || !S_ISDIR(pstat.st_mode)) {
      warningQuda("The path \"%s\" specified by QUDA_RESOURCE_PATH does not exist or is not a directory.", path); 
      warningQuda("Caching of tuned parameters will be disabled.");
      return;
    } else {
      resource_path = path;
    }

#ifdef MULTI_GPU
    if (comm_rank() == 0) {
#endif

      cache_path = resource_path;
      cache_path += "/tunecache.tsv";
      cache_file.open(cache_path.c_str());

      if (cache_file) {

	if (!cache_file.good()) errorQuda("Bad format in %s", cache_path.c_str());
	getline(cache_file, line);
	ls.str(line);
	ls >> token;
	if (token.compare("tunecache")) errorQuda("Bad format in %s", cache_path.c_str());
	ls >> token;
	if (token.compare(quda_version)) errorQuda("Cache file %s does not match current QUDA version", cache_path.c_str());
	ls >> token;
	if (token.compare(quda_hash)) warningQuda("Cache file %s does not match current QUDA build", cache_path.c_str());
      
	if (!cache_file.good()) errorQuda("Bad format in %s", cache_path.c_str());
	getline(cache_file, line); // eat the blank line
      
	if (!cache_file.good()) errorQuda("Bad format in %s", cache_path.c_str());
	getline(cache_file, line); // eat the description line
      
	deserializeTuneCache(cache_file);
	cache_file.close();      
	initial_cache_size = tunecache.size();

	if (verbosity >= QUDA_SUMMARIZE) {
	  printfQuda("Loaded %d sets of cached parameters from %s\n", static_cast<int>(initial_cache_size), cache_path.c_str());
	}
      
      } else {
Ejemplo n.º 11
0
void
usage_extra(char** argv )
{
    printfQuda("Extra options:\n");
    printfQuda("    --tol  <resid_tol>                       # Set residual tolerance\n");
    printfQuda("    --test <0/1>                             # Test method\n");
    printfQuda("                                                0: Even even spinor CG inverter\n");
    printfQuda("                                                1: Odd odd spinor CG inverter\n");
    printfQuda("                                                3: Even even spinor multishift CG inverter\n");
    printfQuda("                                                4: Odd odd spinor multishift CG inverter\n");
    printfQuda("                                                6: Even even spinor mixed precision multishift CG inverter\n");
    printfQuda("    --cpu_prec <double/single/half>          # Set CPU precision\n");

    return ;
}
Ejemplo n.º 12
0
int compareSpinor(const U &u, const V &v, const int tol) {
  int fail_check = 16*tol;
  int *fail = new int[fail_check];
  for (int f=0; f<fail_check; f++) fail[f] = 0;

  int N = u.Nspin()*u.Ncolor();
  int *iter = new int[N];
  for (int i=0; i<N; i++) iter[i] = 0;

  for (int x=0; x<u.Volume(); x++) {
    for (int s=0; s<u.Nspin(); s++) {
      for (int c=0; c<u.Ncolor(); c++) {
	for (int z=0; z<2; z++) {
	  double diff = fabs(u(x,s,c,z) - v(x,s,c,z));

	  for (int f=0; f<fail_check; f++)
	    if (diff > pow(10.0,-(f+1)/(double)tol)) fail[f]++;

	  int j = s*u.Nspin() + c;
	  if (diff > 1e-3) iter[j]++;
	}
      }
    }
  }

  for (int i=0; i<N; i++) printfQuda("%d fails = %d\n", i, iter[i]);
    
  int accuracy_level =0;
  for (int f=0; f<fail_check; f++) {
    if (fail[f] == 0) accuracy_level = f+1;
  }

  for (int f=0; f<fail_check; f++) {
    printfQuda("%e Failures: %d / %d  = %e\n", pow(10.0,-(f+1)/(double)tol), 
	       fail[f], u.Volume()*N, fail[f] / (double)(u.Volume()*N));
  }
  
  delete []iter;
  delete []fail;
  
  return accuracy_level;
}
Ejemplo n.º 13
0
// execute kernel
double dslashCUDA() {

  printfQuda("Executing %d kernel loops...\n", loops);
  fflush(stdout);

  if (test_type < 2)
    dirac->Tune(*cudaSpinorOut, *cudaSpinor, *tmp);
  else
    dirac->Tune(cudaSpinorOut->Even(), cudaSpinor->Even(), *tmp);

  cudaEvent_t start, end;
  cudaEventCreate(&start);
  cudaEventRecord(start, 0);
  cudaEventSynchronize(start);

  for (int i = 0; i < loops; i++) {
    switch (test_type) {
    case 0:
      if (transfer) {
	dslashQuda(spinorOut->V(), spinor->V(), &inv_param, parity);
      } else {
	dirac->Dslash(*cudaSpinorOut, *cudaSpinor, parity);
      }
      break;
    case 1:
    case 2:
      if (transfer) {
	MatQuda(spinorOut->V(), spinor->V(), &inv_param);
      } else {
	dirac->M(*cudaSpinorOut, *cudaSpinor);
      }
      break;
    }
  }
    
  cudaEventCreate(&end);
  cudaEventRecord(end, 0);
  cudaEventSynchronize(end);
  float runTime;
  cudaEventElapsedTime(&runTime, start, end);
  cudaEventDestroy(start);
  cudaEventDestroy(end);

  double secs = runTime / 1000; //stopwatchReadSeconds();

  // check for errors
  cudaError_t stat = cudaGetLastError();
  if (stat != cudaSuccess)
    printf("with ERROR: %s\n", cudaGetErrorString(stat));

  printf("done.\n\n");

  return secs;
}
Ejemplo n.º 14
0
// execute kernel
double dslashCUDA(int niter) {

  cudaEvent_t start, end;
  cudaEventCreate(&start);
  cudaEventCreate(&end);
  cudaEventRecord(start, 0);

  for (int i = 0; i < niter; i++) {
    switch (test_type) {
    case 0:
      if (transfer) {
	dslashQuda(spinorOut->V(), spinor->V(), &inv_param, parity);
      } else {
	//inv_param.input_location = QUDA_CUDA_FIELD_LOCATION;
	//inv_param.output_location = QUDA_CUDA_FIELD_LOCATION;
	//dslashQuda(cudaSpinorOut->V(), cudaSpinor->V(), &inv_param, parity);
	dirac->Dslash(*cudaSpinorOut, *cudaSpinor, parity);
      }
      break;
    case 1:
    case 2:
      if (transfer) {
	MatQuda(spinorOut->V(), spinor->V(), &inv_param);
      } else {
	dirac->M(*cudaSpinorOut, *cudaSpinor);
      }
      break;
    case 3:
    case 4:
      if (transfer) {
	MatDagMatQuda(spinorOut->V(), spinor->V(), &inv_param);
      } else {
	dirac->MdagM(*cudaSpinorOut, *cudaSpinor);
      }
      break;
    }
  }
    
  cudaEventRecord(end, 0);
  cudaEventSynchronize(end);
  float runTime;
  cudaEventElapsedTime(&runTime, start, end);
  cudaEventDestroy(start);
  cudaEventDestroy(end);

  double secs = runTime / 1000; //stopwatchReadSeconds();

  // check for errors
  cudaError_t stat = cudaGetLastError();
  if (stat != cudaSuccess)
    printfQuda("with ERROR: %s\n", cudaGetErrorString(stat));

  return secs;
}
Ejemplo n.º 15
0
  void
display_test_info()
{
  printfQuda("running the following test:\n");

  printfQuda("prec    sloppy_prec    link_recon  sloppy_link_recon test_type  S_dimension T_dimension\n");
  printfQuda("%s   %s             %s            %s            %s         %d/%d/%d          %d \n",
      get_prec_str(prec),get_prec_str(prec_sloppy),
      get_recon_str(link_recon), 
      get_recon_str(link_recon_sloppy), get_test_type(test_type), xdim, ydim, zdim, tdim);     

  printfQuda("Grid partition info:     X  Y  Z  T\n"); 
  printfQuda("                         %d  %d  %d  %d\n", 
      dimPartitioned(0),
      dimPartitioned(1),
      dimPartitioned(2),
      dimPartitioned(3)); 

  return ;

}
Ejemplo n.º 16
0
static int  compareSpinor(const FloatA *u, const FloatB *v, const int volume, 
			  const int N, const int resolution) {
  int fail_check = 16*resolution;
  int *fail = new int[fail_check];
  for (int f=0; f<fail_check; f++) fail[f] = 0;

  int *iter = new int[N];

  for (int i=0; i<N; i++) iter[i] = 0;

  for (int i=0; i<volume; i++) {
    for (int j=0; j<N; j++) {
      int is = i*N+j;
      double diff = fabs(u[is]-v[is]);
      for (int f=0; f<fail_check; f++)
	if (diff > pow(10.0,-(f+1)/(double)resolution)) fail[f]++;
      if (diff > 1e-3) iter[j]++;
    }
  }
    
  for (int i=0; i<N; i++) printfQuda("%d fails = %d\n", i, iter[i]);
    
  int accuracy_level =0;
  for (int f=0; f<fail_check; f++) {
    if (fail[f] == 0){
      accuracy_level = f;
    }
  }

  for (int f=0; f<fail_check; f++) {
    printfQuda("%e Failures: %d / %d  = %e\n", pow(10.0,-(f+1)/(double)resolution), 
	       fail[f], volume*N, fail[f] / (double)(volume*N));
  }
  
  delete []iter;
  delete []fail;
  
  return accuracy_level;
}
Ejemplo n.º 17
0
void
usage(char** argv )
{
  printfQuda("Usage: %s <args>\n", argv[0]);
  printfQuda("--prec         <double/single/half>     Spinor/gauge precision\n"); 
  printfQuda("--prec_sloppy  <double/single/half>     Spinor/gauge sloppy precision\n"); 
  printfQuda("--recon        <8/12>                   Long link reconstruction type\n"); 
  printfQuda("--test         <0/1/2/3/4/5>            Testing type(0=even, 1=odd, 2=full, 3=multimass even,\n" 
	 "                                                     4=multimass odd, 5=multimass full)\n"); 
  printfQuda("--tdim                                  T dimension\n");
  printfQuda("--sdim                                  S dimension\n");
  printfQuda("--help                                  Print out this message\n"); 
  exit(1);
  return ;
}
Ejemplo n.º 18
0
 // Dirac operator factory
 Dirac* Dirac::create(const DiracParam &param)
 {
   if (param.type == QUDA_WILSON_DIRAC) {
     if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating a DiracWilson operator\n");
     return new DiracWilson(param);
   } else if (param.type == QUDA_WILSONPC_DIRAC) {
     if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating a DiracWilsonPC operator\n");
     return new DiracWilsonPC(param);
   } else if (param.type == QUDA_CLOVER_DIRAC) {
     if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating a DiracClover operator\n");
     return new DiracClover(param);
   } else if (param.type == QUDA_CLOVERPC_DIRAC) {
     if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating a DiracCloverPC operator\n");
     return new DiracCloverPC(param);
   } else if (param.type == QUDA_DOMAIN_WALL_DIRAC) {
     if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating a DiracDomainWall operator\n");
     return new DiracDomainWall(param);
   } else if (param.type == QUDA_DOMAIN_WALLPC_DIRAC) {
     if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating a DiracDomainWallPC operator\n");
     return new DiracDomainWallPC(param);
   } else if (param.type == QUDA_ASQTAD_DIRAC) {
     if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating a DiracStaggered operator\n");
     return new DiracStaggered(param);
   } else if (param.type == QUDA_ASQTADPC_DIRAC) {
     if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating a DiracStaggeredPC operator\n");
     return new DiracStaggeredPC(param);    
   } else if (param.type == QUDA_TWISTED_MASS_DIRAC) {
     if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating a DiracTwistedMass operator (%d flavor(s))\n", param.Ls);
       if (param.Ls == 1) return new DiracTwistedMass(param, 4);
       else return new DiracTwistedMass(param, 5);
     } else if (param.type == QUDA_TWISTED_MASSPC_DIRAC) {
       if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Creating a DiracTwistedMassPC operator (%d flavor(s))\n", param.Ls);
       if (param.Ls == 1) return new DiracTwistedMassPC(param, 4);
       else return new DiracTwistedMassPC(param, 5);
   } else {
     return 0;
   }
 }
Ejemplo n.º 19
0
int main(int argc, char** argv)
{
  for (int i = 1; i < argc; i++) {

    if(process_command_line_option(argc, argv, &i) == 0){
      continue;
    }   



    if( strcmp(argv[i], "--cpu_prec") == 0){
      if (i+1 >= argc){
        usage(argv);
      }
      cpu_prec= get_prec(argv[i+1]);
      i++;
      continue;
    }

    printf("ERROR: Invalid option:%s\n", argv[i]);
    usage(argv);
  }

  if (prec_sloppy == QUDA_INVALID_PRECISION){
    prec_sloppy = prec;
  }
  if (link_recon_sloppy == QUDA_RECONSTRUCT_INVALID){
    link_recon_sloppy = link_recon;
  }

  if(inv_type != QUDA_CG_INVERTER){
    if(test_type != 0 && test_type != 1) errorQuda("Preconditioning is currently not supported in multi-shift solver solvers");
  }


  // initialize QMP/MPI, QUDA comms grid and RNG (test_util.cpp)
  initComms(argc, argv, gridsize_from_cmdline);

  display_test_info();
  
  printfQuda("dslash_type = %d\n", dslash_type);

  int ret = invert_test();

  // finalize the communications layer
  finalizeComms();

  return ret;
}
Ejemplo n.º 20
0
  static void
display_test_info(int test)
{
  printfQuda("running the following test:\n");

  printfQuda("link_precision           link_reconstruct           space_dimension        T_dimension       Test          Ordering\n");
  printfQuda("%s                       %s                         %d/%d/%d/                  %d             %d              %s \n", 
      get_prec_str(prec),
      get_recon_str(link_recon), 
      xdim, ydim, zdim, tdim, test, 
      get_gauge_order_str(gauge_order));

#ifdef MULTI_GPU
  printfQuda("Grid partition info:     X  Y  Z  T\n");
  printfQuda("                         %d  %d  %d  %d\n",
      dimPartitioned(0),
      dimPartitioned(1),
      dimPartitioned(2),
      dimPartitioned(3));
#endif

  return ;

}
Ejemplo n.º 21
0
void
display_test_info()
{
  printfQuda("running the following test:\n");
    
  printfQuda("prec    prec_sloppy   multishift  matpc_type  recon  recon_sloppy S_dimension T_dimension Ls_dimension   dslash_type  normalization\n");
  printfQuda("%6s   %6s          %d     %12s     %2s     %2s         %3d/%3d/%3d     %3d         %2d       %14s  %8s\n",
	     get_prec_str(prec),get_prec_str(prec_sloppy), multishift, get_matpc_str(matpc_type), 
	     get_recon_str(link_recon), 
	     get_recon_str(link_recon_sloppy),  
	     xdim, ydim, zdim, tdim, Lsdim, 
	     get_dslash_str(dslash_type), 
	     get_mass_normalization_str(normalization));     

  printfQuda("Grid partition info:     X  Y  Z  T\n"); 
  printfQuda("                         %d  %d  %d  %d\n", 
	     dimPartitioned(0),
	     dimPartitioned(1),
	     dimPartitioned(2),
	     dimPartitioned(3)); 
  
  return ;
  
}
Ejemplo n.º 22
0
static void massRescaleCoeff(QudaDslashType dslash_type, double &kappa, QudaSolutionType solution_type, 
			     QudaMassNormalization mass_normalization, double &coeff)
{    
  if (dslash_type == QUDA_ASQTAD_DSLASH) {
    if (mass_normalization != QUDA_MASS_NORMALIZATION) {
      errorQuda("Staggered code only supports QUDA_MASS_NORMALIZATION");
    }
    return;
  }

  // multiply the source to compensate for normalization of the Dirac operator, if necessary
  switch (solution_type) {
  case QUDA_MAT_SOLUTION:
    if (mass_normalization == QUDA_MASS_NORMALIZATION ||
	mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
      coeff *= 2.0*kappa;
    }
    break;
  case QUDA_MATDAG_MAT_SOLUTION:
    if (mass_normalization == QUDA_MASS_NORMALIZATION ||
	mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
      coeff *= 4.0*kappa*kappa;
    }
    break;
  case QUDA_MATPC_SOLUTION:
    if (mass_normalization == QUDA_MASS_NORMALIZATION) {
      coeff *= 4.0*kappa*kappa;
    } else if (mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
      coeff *= 2.0*kappa;
    }
    break;
  case QUDA_MATPCDAG_MATPC_SOLUTION:
    if (mass_normalization == QUDA_MASS_NORMALIZATION) {
	coeff*=16.0*pow(kappa,4);
    } else if (mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) {
	coeff*=4.0*kappa*kappa;
    }
    break;
  default:
    errorQuda("Solution type %d not supported", solution_type);
  }

  if (verbosity >= QUDA_DEBUG_VERBOSE) printfQuda("Mass rescale done\n");   
}
Ejemplo n.º 23
0
int 
setNumaAffinity(int devid)
{
  int cpu_cores[128];
  int ncores=128;
  int rc = getNumaAffinity(devid, cpu_cores, &ncores);
  if(rc != 0){
    warningQuda("Failed to determine NUMA affinity for device %d (possibly not applicable)", devid);
    return 1;
  }
  int which = devid % ncores;
  printfQuda("Setting NUMA affinity for device %d to CPU core %d\n", devid, cpu_cores[which]);
/*
  for(int i=0;i < ncores;i++){
   if (i != which ) continue;
    printfQuda("%d", cpu_cores[i]);
    if((i+1) < ncores){
      printfQuda(",");
    }
  }
  printfQuda("\n");
  */

  cpu_set_t cpu_set;
  CPU_ZERO(&cpu_set);
  
  for(int i=0;i < ncores;i++){
    if( i != which) continue;
    CPU_SET(cpu_cores[i], &cpu_set);
  }
  
  rc = sched_setaffinity(0, sizeof(cpu_set_t), &cpu_set);
  if (rc != 0){
    warningQuda("Failed to enforce NUMA affinity (probably due to lack of kernel support)");
    return -1;
  }
  
  
  return 0;
}
Ejemplo n.º 24
0
void CG::operator()(cudaColorSpinorField &x, cudaColorSpinorField &b) 
{
  int k=0;
  int rUpdate = 0;
    
  cudaColorSpinorField r(b);

  ColorSpinorParam param(x);
  param.create = QUDA_ZERO_FIELD_CREATE;
  cudaColorSpinorField y(b, param); 
  
  mat(r, x, y);
  zeroCuda(y);

  double r2 = xmyNormCuda(b, r);
  rUpdate ++;
  
  param.precision = invParam.cuda_prec_sloppy;
  cudaColorSpinorField Ap(x, param);
  cudaColorSpinorField tmp(x, param);
  cudaColorSpinorField tmp2(x, param); // only needed for clover and twisted mass

  cudaColorSpinorField *x_sloppy, *r_sloppy;
  if (invParam.cuda_prec_sloppy == x.Precision()) {
    param.create = QUDA_REFERENCE_FIELD_CREATE;
    x_sloppy = &x;
    r_sloppy = &r;
  } else {
    param.create = QUDA_COPY_FIELD_CREATE;
    x_sloppy = new cudaColorSpinorField(x, param);
    r_sloppy = new cudaColorSpinorField(r, param);
  }

  cudaColorSpinorField &xSloppy = *x_sloppy;
  cudaColorSpinorField &rSloppy = *r_sloppy;

  cudaColorSpinorField p(rSloppy);

  double r2_old;
  double src_norm = norm2(b);
  double stop = src_norm*invParam.tol*invParam.tol; // stopping condition of solver

  double alpha, beta;
  double pAp;

  double rNorm = sqrt(r2);
  double r0Norm = rNorm;
  double maxrx = rNorm;
  double maxrr = rNorm;
  double delta = invParam.reliable_delta;

  if (invParam.verbosity >= QUDA_VERBOSE) printfQuda("CG: %d iterations, r2 = %e\n", k, r2);

  quda::blas_flops = 0;

  stopwatchStart();
  while (r2 > stop && k<invParam.maxiter) {

    matSloppy(Ap, p, tmp, tmp2); // tmp as tmp
    
    pAp = reDotProductCuda(p, Ap);
    alpha = r2 / pAp;        
    r2_old = r2;
    r2 = axpyNormCuda(-alpha, Ap, rSloppy);

    // reliable update conditions
    rNorm = sqrt(r2);
    if (rNorm > maxrx) maxrx = rNorm;
    if (rNorm > maxrr) maxrr = rNorm;
    int updateX = (rNorm < delta*r0Norm && r0Norm <= maxrx) ? 1 : 0;
    int updateR = ((rNorm < delta*maxrr && r0Norm <= maxrr) || updateX) ? 1 : 0;
    
    if (!(updateR || updateX)) {
      beta = r2 / r2_old;
      axpyZpbxCuda(alpha, p, xSloppy, rSloppy, beta);
    } else {
      axpyCuda(alpha, p, xSloppy);
      if (x.Precision() != xSloppy.Precision()) copyCuda(x, xSloppy);
      
      xpyCuda(x, y); // swap these around?
      mat(r, y, x); // here we can use x as tmp
      r2 = xmyNormCuda(b, r);
      if (x.Precision() != rSloppy.Precision()) copyCuda(rSloppy, r);            
      zeroCuda(xSloppy);

      rNorm = sqrt(r2);
      maxrr = rNorm;
      maxrx = rNorm;
      r0Norm = rNorm;      
      rUpdate++;

      beta = r2 / r2_old; 
      xpayCuda(rSloppy, beta, p);
    }

    k++;
    if (invParam.verbosity >= QUDA_VERBOSE)
      printfQuda("CG: %d iterations, r2 = %e\n", k, r2);
  }

  if (x.Precision() != xSloppy.Precision()) copyCuda(x, xSloppy);
  xpyCuda(y, x);

  invParam.secs = stopwatchReadSeconds();

  
  if (k==invParam.maxiter) 
    warningQuda("Exceeded maximum iterations %d", invParam.maxiter);

  if (invParam.verbosity >= QUDA_SUMMARIZE)
    printfQuda("CG: Reliable updates = %d\n", rUpdate);

  double gflops = (quda::blas_flops + mat.flops() + matSloppy.flops())*1e-9;
  reduceDouble(gflops);

  //  printfQuda("%f gflops\n", gflops / stopwatchReadSeconds());
  invParam.gflops = gflops;
  invParam.iter = k;

  quda::blas_flops = 0;

  if (invParam.verbosity >= QUDA_SUMMARIZE){
    mat(r, x, y);
    double true_res = xmyNormCuda(b, r);
    printfQuda("CG: Converged after %d iterations, relative residua: iterated = %e, true = %e\n", 
	       k, sqrt(r2/src_norm), sqrt(true_res / src_norm));    
  }

  if (invParam.cuda_prec_sloppy != x.Precision()) {
    delete r_sloppy;
    delete x_sloppy;
  }

  return;
}
Ejemplo n.º 25
0
  int
invert_test(void)
{
  QudaGaugeParam gaugeParam = newQudaGaugeParam();
  QudaInvertParam inv_param = newQudaInvertParam();

  set_params(&gaugeParam, &inv_param,
      xdim, ydim, zdim, tdim,
      cpu_prec, prec, prec_sloppy,
      link_recon, link_recon_sloppy, mass, tol, 500, 1e-3,
      0.8);

  // this must be before the FaceBuffer is created (this is because it allocates pinned memory - FIXME)
  initQuda(device);

  setDims(gaugeParam.X);
  setSpinorSiteSize(6);

  size_t gSize = (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
  for (int dir = 0; dir < 4; dir++) {
    qdp_fatlink[dir] = malloc(V*gaugeSiteSize*gSize);
    qdp_longlink[dir] = malloc(V*gaugeSiteSize*gSize);
  }
  fatlink = malloc(4*V*gaugeSiteSize*gSize);
  longlink = malloc(4*V*gaugeSiteSize*gSize);

  construct_fat_long_gauge_field(qdp_fatlink, qdp_longlink, 1, gaugeParam.cpu_prec, 
				 &gaugeParam, dslash_type);

  const double cos_pi_3 = 0.5; // Cos(pi/3)
  const double sin_pi_3 = sqrt(0.75); // Sin(pi/3)

  for(int dir=0; dir<4; ++dir){
    for(int i=0; i<V; ++i){
      for(int j=0; j<gaugeSiteSize; ++j){
        if(gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION){
          ((double*)qdp_fatlink[dir])[i*gaugeSiteSize + j] = 0.5*rand()/RAND_MAX;
          if(link_recon != QUDA_RECONSTRUCT_8 && link_recon != QUDA_RECONSTRUCT_12){ // incorporate non-trivial phase into long links
            if(j%2 == 0){
              const double real = ((double*)qdp_longlink[dir])[i*gaugeSiteSize + j];
              const double imag = ((double*)qdp_longlink[dir])[i*gaugeSiteSize + j + 1];
              ((double*)qdp_longlink[dir])[i*gaugeSiteSize + j]     = real*cos_pi_3 - imag*sin_pi_3;
              ((double*)qdp_longlink[dir])[i*gaugeSiteSize + j + 1] = real*sin_pi_3 + imag*cos_pi_3;
            }
          }
          ((double*)fatlink)[(i*4 + dir)*gaugeSiteSize + j] = ((double*)qdp_fatlink[dir])[i*gaugeSiteSize + j];
          ((double*)longlink)[(i*4 + dir)*gaugeSiteSize + j] = ((double*)qdp_longlink[dir])[i*gaugeSiteSize + j];
        }else{
          ((float*)qdp_fatlink[dir])[i] = 0.5*rand()/RAND_MAX;
          if(link_recon != QUDA_RECONSTRUCT_8 && link_recon != QUDA_RECONSTRUCT_12){ // incorporate non-trivial phase into long links
            if(j%2 == 0){
              const float real = ((float*)qdp_longlink[dir])[i*gaugeSiteSize + j];
              const float imag = ((float*)qdp_longlink[dir])[i*gaugeSiteSize + j + 1];
              ((float*)qdp_longlink[dir])[i*gaugeSiteSize + j]     = real*cos_pi_3 - imag*sin_pi_3;
              ((float*)qdp_longlink[dir])[i*gaugeSiteSize + j + 1] = real*sin_pi_3 + imag*cos_pi_3;
            }
          }
          ((double*)fatlink)[(i*4 + dir)*gaugeSiteSize + j] = ((double*)qdp_fatlink[dir])[i*gaugeSiteSize + j];
          ((float*)fatlink)[(i*4 + dir)*gaugeSiteSize + j] = ((float*)qdp_fatlink[dir])[i*gaugeSiteSize + j];
          ((float*)longlink)[(i*4 + dir)*gaugeSiteSize + j] = ((float*)qdp_longlink[dir])[i*gaugeSiteSize + j];
        }
      }
    }
  }


  ColorSpinorParam csParam;
  csParam.nColor=3;
  csParam.nSpin=1;
  csParam.nDim=4;
  for(int d = 0; d < 4; d++) {
    csParam.x[d] = gaugeParam.X[d];
  }
  csParam.x[0] /= 2;

  csParam.precision = inv_param.cpu_prec;
  csParam.pad = 0;
  csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
  csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
  csParam.fieldOrder  = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
  csParam.gammaBasis = inv_param.gamma_basis;
  csParam.create = QUDA_ZERO_FIELD_CREATE;  
  in = new cpuColorSpinorField(csParam);  
  out = new cpuColorSpinorField(csParam);  
  ref = new cpuColorSpinorField(csParam);  
  tmp = new cpuColorSpinorField(csParam);  

  if (inv_param.cpu_prec == QUDA_SINGLE_PRECISION){
    constructSpinorField((float*)in->V());    
  }else{
    constructSpinorField((double*)in->V());
  }

#ifdef MULTI_GPU
  int tmp_value = MAX(ydim*zdim*tdim/2, xdim*zdim*tdim/2);
  tmp_value = MAX(tmp_value, xdim*ydim*tdim/2);
  tmp_value = MAX(tmp_value, xdim*ydim*zdim/2);

  int fat_pad = tmp_value;
  int link_pad =  3*tmp_value;

  // FIXME: currently assume staggered is SU(3)
  gaugeParam.type = dslash_type == QUDA_STAGGERED_DSLASH ? 
    QUDA_SU3_LINKS : QUDA_ASQTAD_FAT_LINKS;
  gaugeParam.reconstruct = QUDA_RECONSTRUCT_NO;
  GaugeFieldParam cpuFatParam(fatlink, gaugeParam);
  cpuFat = new cpuGaugeField(cpuFatParam);
  ghost_fatlink = (void**)cpuFat->Ghost();

  gaugeParam.type = QUDA_ASQTAD_LONG_LINKS;
  GaugeFieldParam cpuLongParam(longlink, gaugeParam);
  cpuLong = new cpuGaugeField(cpuLongParam);
  ghost_longlink = (void**)cpuLong->Ghost();

  gaugeParam.type = dslash_type == QUDA_STAGGERED_DSLASH ? 
    QUDA_SU3_LINKS : QUDA_ASQTAD_FAT_LINKS;
  gaugeParam.ga_pad = fat_pad;
  gaugeParam.reconstruct= gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO;
  gaugeParam.cuda_prec_precondition = QUDA_HALF_PRECISION;
  loadGaugeQuda(fatlink, &gaugeParam);

  if (dslash_type == QUDA_ASQTAD_DSLASH) {
    gaugeParam.type = QUDA_ASQTAD_LONG_LINKS;
    gaugeParam.ga_pad = link_pad;
    gaugeParam.reconstruct= link_recon;
    gaugeParam.reconstruct_sloppy = link_recon_sloppy;
    loadGaugeQuda(longlink, &gaugeParam);
  }
#else
  gaugeParam.type = QUDA_ASQTAD_FAT_LINKS;
  gaugeParam.reconstruct = gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO;
  gaugeParam.cuda_prec_precondition = QUDA_HALF_PRECISION;
  loadGaugeQuda(fatlink, &gaugeParam);

  if (dslash_type == QUDA_ASQTAD_DSLASH) {
    gaugeParam.type = QUDA_ASQTAD_LONG_LINKS;
    gaugeParam.reconstruct = link_recon;
    gaugeParam.reconstruct_sloppy = link_recon_sloppy;
    loadGaugeQuda(longlink, &gaugeParam);
  }
#endif

  double time0 = -((double)clock()); // Start the timer

  double nrm2=0;
  double src2=0;
  int ret = 0;



  switch(test_type){
    case 0: //even
      if(inv_type == QUDA_GCR_INVERTER){
      	inv_param.inv_type = QUDA_GCR_INVERTER;
      	inv_param.gcrNkrylov = 50;
      }else if(inv_type == QUDA_PCG_INVERTER){
	inv_param.inv_type = QUDA_PCG_INVERTER;
      }
      inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;

      invertQuda(out->V(), in->V(), &inv_param);

      time0 += clock(); 
      time0 /= CLOCKS_PER_SEC;



#ifdef MULTI_GPU    
      matdagmat_mg4dir(ref, qdp_fatlink, qdp_longlink, ghost_fatlink, ghost_longlink, 
          out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, QUDA_EVEN_PARITY);
#else
      matdagmat(ref->V(), qdp_fatlink, qdp_longlink, out->V(), mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), QUDA_EVEN_PARITY);
#endif

      mxpy(in->V(), ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
      nrm2 = norm_2(ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
      src2 = norm_2(in->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);

      break;

    case 1: //odd
      if(inv_type == QUDA_GCR_INVERTER){
      	inv_param.inv_type = QUDA_GCR_INVERTER;
      	inv_param.gcrNkrylov = 50;
      }else if(inv_type == QUDA_PCG_INVERTER){
	inv_param.inv_type = QUDA_PCG_INVERTER;
      }

      inv_param.matpc_type = QUDA_MATPC_ODD_ODD;
      invertQuda(out->V(), in->V(), &inv_param);	
      time0 += clock(); // stop the timer
      time0 /= CLOCKS_PER_SEC;

#ifdef MULTI_GPU
      matdagmat_mg4dir(ref, qdp_fatlink, qdp_longlink, ghost_fatlink, ghost_longlink, 
          out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, QUDA_ODD_PARITY);
#else
      matdagmat(ref->V(), qdp_fatlink, qdp_longlink, out->V(), mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), QUDA_ODD_PARITY);	
#endif
      mxpy(in->V(), ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
      nrm2 = norm_2(ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
      src2 = norm_2(in->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);

      break;

    case 2: //full spinor

      errorQuda("full spinor not supported\n");
      break;

    case 3: //multi mass CG, even
    case 4:

#define NUM_OFFSETS 12

      {    
        double masses[NUM_OFFSETS] ={0.002, 0.0021, 0.0064, 0.070, 0.077, 0.081, 0.1, 0.11, 0.12, 0.13, 0.14, 0.205};
        inv_param.num_offset = NUM_OFFSETS;
        // these can be set independently
        for (int i=0; i<inv_param.num_offset; i++) {
          inv_param.tol_offset[i] = inv_param.tol;
          inv_param.tol_hq_offset[i] = inv_param.tol_hq;
        }
        void* outArray[NUM_OFFSETS];
        int len;

        cpuColorSpinorField* spinorOutArray[NUM_OFFSETS];
        spinorOutArray[0] = out;    
        for(int i=1;i < inv_param.num_offset; i++){
          spinorOutArray[i] = new cpuColorSpinorField(csParam);       
        }

        for(int i=0;i < inv_param.num_offset; i++){
          outArray[i] = spinorOutArray[i]->V();
          inv_param.offset[i] = 4*masses[i]*masses[i];
        }

        len=Vh;

        if (test_type == 3) {
          inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;      
        } else {
          inv_param.matpc_type = QUDA_MATPC_ODD_ODD;      
        }

        invertMultiShiftQuda(outArray, in->V(), &inv_param);	

        cudaDeviceSynchronize();
        time0 += clock(); // stop the timer
        time0 /= CLOCKS_PER_SEC;

        printfQuda("done: total time = %g secs, compute time = %g, %i iter / %g secs = %g gflops\n", 
            time0, inv_param.secs, inv_param.iter, inv_param.secs,
            inv_param.gflops/inv_param.secs);


        printfQuda("checking the solution\n");
        QudaParity parity = QUDA_INVALID_PARITY;
        if (inv_param.solve_type == QUDA_NORMOP_SOLVE){
          //parity = QUDA_EVENODD_PARITY;
          errorQuda("full parity not supported\n");
        }else if (inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN){
          parity = QUDA_EVEN_PARITY;
        }else if (inv_param.matpc_type == QUDA_MATPC_ODD_ODD){
          parity = QUDA_ODD_PARITY;
        }else{
          errorQuda("ERROR: invalid spinor parity \n");
          exit(1);
        }
        for(int i=0;i < inv_param.num_offset;i++){
          printfQuda("%dth solution: mass=%f, ", i, masses[i]);
#ifdef MULTI_GPU
          matdagmat_mg4dir(ref, qdp_fatlink, qdp_longlink, ghost_fatlink, ghost_longlink, 
              spinorOutArray[i], masses[i], 0, inv_param.cpu_prec, 
              gaugeParam.cpu_prec, tmp, parity);
#else
          matdagmat(ref->V(), qdp_fatlink, qdp_longlink, outArray[i], masses[i], 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), parity);
#endif
          mxpy(in->V(), ref->V(), len*mySpinorSiteSize, inv_param.cpu_prec);
          double nrm2 = norm_2(ref->V(), len*mySpinorSiteSize, inv_param.cpu_prec);
          double src2 = norm_2(in->V(), len*mySpinorSiteSize, inv_param.cpu_prec);
          double hqr = sqrt(HeavyQuarkResidualNormCpu(*spinorOutArray[i], *ref).z);
          double l2r = sqrt(nrm2/src2);

          printfQuda("Shift %d residuals: (L2 relative) tol %g, QUDA = %g, host = %g; (heavy-quark) tol %g, QUDA = %g, host = %g\n",
              i, inv_param.tol_offset[i], inv_param.true_res_offset[i], l2r, 
              inv_param.tol_hq_offset[i], inv_param.true_res_hq_offset[i], hqr);

          //emperical, if the cpu residue is more than 1 order the target accuracy, the it fails to converge
          if (sqrt(nrm2/src2) > 10*inv_param.tol_offset[i]){
            ret |=1;
          }
        }

        for(int i=1; i < inv_param.num_offset;i++) delete spinorOutArray[i];
      }
      break;

    default:
      errorQuda("Unsupported test type");

  }//switch

  if (test_type <=2){

    double hqr = sqrt(HeavyQuarkResidualNormCpu(*out, *ref).z);
    double l2r = sqrt(nrm2/src2);

    printfQuda("Residuals: (L2 relative) tol %g, QUDA = %g, host = %g; (heavy-quark) tol %g, QUDA = %g, host = %g\n",
        inv_param.tol, inv_param.true_res, l2r, inv_param.tol_hq, inv_param.true_res_hq, hqr);

    printfQuda("done: total time = %g secs, compute time = %g secs, %i iter / %g secs = %g gflops, \n", 
        time0, inv_param.secs, inv_param.iter, inv_param.secs,
        inv_param.gflops/inv_param.secs);
  }

  end();
  return ret;
}
Ejemplo n.º 26
0
void init() {

  gauge_param = newQudaGaugeParam();
  inv_param = newQudaInvertParam();

  gauge_param.X[0] = 12;
  gauge_param.X[1] = 12;
  gauge_param.X[2] = 12;
  gauge_param.X[3] = 12;
  
  setDims(gauge_param.X, Ls);

  gauge_param.anisotropy = 2.3;

  gauge_param.type = QUDA_WILSON_LINKS;
  gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER;
  gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T;

  gauge_param.cpu_prec = cpu_prec;
  gauge_param.cuda_prec = cuda_prec;
  gauge_param.reconstruct = QUDA_RECONSTRUCT_12;
  gauge_param.reconstruct_sloppy = gauge_param.reconstruct;
  gauge_param.cuda_prec_sloppy = gauge_param.cuda_prec;
  gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;
  gauge_param.type = QUDA_WILSON_LINKS;

  inv_param.inv_type = QUDA_CG_INVERTER;

  inv_param.mass = 0.01;
  inv_param.m5 = -1.5;
  kappa5 = 0.5/(5 + inv_param.m5);

  inv_param.Ls = Ls;
  
  inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
  inv_param.dagger = dagger;

  inv_param.cpu_prec = cpu_prec;
  inv_param.cuda_prec = cuda_prec;

  gauge_param.ga_pad = 0;
  inv_param.sp_pad = 0;
  inv_param.cl_pad = 0;

  inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
  inv_param.dirac_order = QUDA_DIRAC_ORDER;

  if (test_type == 2) {
    inv_param.solution_type = QUDA_MAT_SOLUTION;
  } else {
    inv_param.solution_type = QUDA_MATPC_SOLUTION;
  }

  inv_param.dslash_type = QUDA_DOMAIN_WALL_DSLASH;

  inv_param.verbosity = QUDA_VERBOSE;

  // construct input fields
  for (int dir = 0; dir < 4; dir++) hostGauge[dir] = malloc(V*gaugeSiteSize*gauge_param.cpu_prec);

  ColorSpinorParam csParam;
  
  csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION;
  csParam.nColor = 3;
  csParam.nSpin = 4;
  csParam.nDim = 5;
  for (int d=0; d<4; d++) csParam.x[d] = gauge_param.X[d];
  csParam.x[4] = Ls;
  csParam.precision = inv_param.cpu_prec;
  csParam.pad = 0;
  if (test_type < 2) {
    csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
    csParam.x[0] /= 2;
  } else {
    csParam.siteSubset = QUDA_FULL_SITE_SUBSET;
  }    
  csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
  csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
  csParam.gammaBasis = inv_param.gamma_basis;
  csParam.create = QUDA_ZERO_FIELD_CREATE;
  
  spinor = new cpuColorSpinorField(csParam);
  spinorOut = new cpuColorSpinorField(csParam);
  spinorRef = new cpuColorSpinorField(csParam);

  csParam.siteSubset = QUDA_FULL_SITE_SUBSET;
  csParam.x[0] = gauge_param.X[0];
  
  printfQuda("Randomizing fields... ");

  construct_gauge_field(hostGauge, 1, gauge_param.cpu_prec, &gauge_param);
  spinor->Source(QUDA_RANDOM_SOURCE);

  printfQuda("done.\n"); fflush(stdout);
  
  int dev = 0;
  initQuda(dev);

  printfQuda("Sending gauge field to GPU\n");

  loadGaugeQuda(hostGauge, &gauge_param);

  if (!transfer) {
    csParam.fieldLocation = QUDA_CUDA_FIELD_LOCATION;
    csParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;
    csParam.pad = inv_param.sp_pad;
    csParam.precision = inv_param.cuda_prec;
    if (csParam.precision == QUDA_DOUBLE_PRECISION ) {
      csParam.fieldOrder = QUDA_FLOAT2_FIELD_ORDER;
    } else {
      /* Single and half */
      csParam.fieldOrder = QUDA_FLOAT4_FIELD_ORDER;
    }
 
    if (test_type < 2) {
      csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
      csParam.x[0] /= 2;
    }

    printfQuda("Creating cudaSpinor\n");
    cudaSpinor = new cudaColorSpinorField(csParam);
    printfQuda("Creating cudaSpinorOut\n");
    cudaSpinorOut = new cudaColorSpinorField(csParam);

    if (test_type == 2) csParam.x[0] /= 2;

    csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
    tmp = new cudaColorSpinorField(csParam);

    printfQuda("Sending spinor field to GPU\n");
    *cudaSpinor = *spinor;

    std::cout << "Source: CPU = " << norm2(*spinor) << ", CUDA = " << 
      norm2(*cudaSpinor) << std::endl;

    bool pc = (test_type != 2);
    DiracParam diracParam;
    setDiracParam(diracParam, &inv_param, pc);
    diracParam.verbose = QUDA_DEBUG_VERBOSE;
    diracParam.tmp1 = tmp;
    diracParam.tmp2 = tmp2;
    
    dirac = Dirac::create(diracParam);

  } else {
    std::cout << "Source: CPU = " << norm2(*spinor) << std::endl;
  }
    
}
Ejemplo n.º 27
0
int
invert_test(void)
{
  QudaGaugeParam gaugeParam = newQudaGaugeParam();
  QudaInvertParam inv_param = newQudaInvertParam();

  double mass = 0.1;

  set_params(&gaugeParam, &inv_param,
	     xdim, ydim, zdim, tdim,
	     cpu_prec, prec, prec_sloppy,
	     link_recon, link_recon_sloppy, mass, tol, 500, 1e-3,
	     0.8);
  
  // this must be before the FaceBuffer is created (this is because it allocates pinned memory - FIXME)
  initQuda(device);

  setDims(gaugeParam.X);
  setDimConstants(gaugeParam.X);

  size_t gSize = (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float);
  for (int dir = 0; dir < 4; dir++) {
    fatlink[dir] = malloc(V*gaugeSiteSize*gSize);
    longlink[dir] = malloc(V*gaugeSiteSize*gSize);
  }
  
  construct_fat_long_gauge_field(fatlink, longlink, 1, gaugeParam.cpu_prec, &gaugeParam);
    
  for (int dir = 0; dir < 4; dir++) {
    for(int i = 0;i < V*gaugeSiteSize;i++){
      if (gaugeParam.cpu_prec == QUDA_DOUBLE_PRECISION){
	((double*)fatlink[dir])[i] = 0.5 *rand()/RAND_MAX;
      }else{
	((float*)fatlink[dir])[i] = 0.5* rand()/RAND_MAX;
      }
    }
  }  
 
  ColorSpinorParam csParam;
  csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION;
  csParam.nColor=3;
  csParam.nSpin=1;
  csParam.nDim=4;
  for(int d = 0; d < 4; d++) {
    csParam.x[d] = gaugeParam.X[d];
  }
  csParam.x[0] /= 2;
  
  csParam.precision = inv_param.cpu_prec;
  csParam.pad = 0;
  csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
  csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
  csParam.fieldOrder  = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
  csParam.gammaBasis = inv_param.gamma_basis;
  csParam.create = QUDA_ZERO_FIELD_CREATE;  
  in = new cpuColorSpinorField(csParam);  
  out = new cpuColorSpinorField(csParam);  
  ref = new cpuColorSpinorField(csParam);  
  tmp = new cpuColorSpinorField(csParam);  
  
  if (inv_param.cpu_prec == QUDA_SINGLE_PRECISION){
    constructSpinorField((float*)in->V());    
  }else{
    constructSpinorField((double*)in->V());
  }

  int tmp_value = MAX(ydim*zdim*tdim/2, xdim*zdim*tdim/2);
   tmp_value = MAX(tmp_value, xdim*ydim*tdim/2);
   tmp_value = MAX(tmp_value, xdim*ydim*zdim/2);

  int fat_pad = tmp_value;
  int link_pad =  3*tmp_value;

#ifdef MULTI_GPU
  gaugeParam.type = QUDA_ASQTAD_FAT_LINKS;
  gaugeParam.reconstruct = QUDA_RECONSTRUCT_NO;
  GaugeFieldParam cpuFatParam(fatlink, gaugeParam);
  cpuFat = new cpuGaugeField(cpuFatParam);
  cpuFat->exchangeGhost();
  ghost_fatlink = (void**)cpuFat->Ghost();
  
  gaugeParam.type = QUDA_ASQTAD_LONG_LINKS;
  GaugeFieldParam cpuLongParam(longlink, gaugeParam);
  cpuLong = new cpuGaugeField(cpuLongParam);
  cpuLong->exchangeGhost();
  ghost_longlink = (void**)cpuLong->Ghost();
#endif
  
  if(testtype == 6){    
    record_gauge(gaugeParam.X, fatlink, fat_pad,
		 longlink, link_pad,
		 link_recon, link_recon_sloppy,
		 &gaugeParam);        
   }else{ 
    
#ifdef MULTI_GPU
 

    gaugeParam.type = QUDA_ASQTAD_FAT_LINKS;
    gaugeParam.ga_pad = fat_pad;
    gaugeParam.reconstruct= gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO;
    loadGaugeQuda(fatlink, &gaugeParam);
    
    gaugeParam.type = QUDA_ASQTAD_LONG_LINKS;
    gaugeParam.ga_pad = link_pad;
    gaugeParam.reconstruct= link_recon;
    gaugeParam.reconstruct_sloppy = link_recon_sloppy;
    loadGaugeQuda(longlink, &gaugeParam);
#else
    gaugeParam.type = QUDA_ASQTAD_FAT_LINKS;
    gaugeParam.reconstruct = gaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO;
    loadGaugeQuda(fatlink, &gaugeParam);
    
    gaugeParam.type = QUDA_ASQTAD_LONG_LINKS;
    gaugeParam.reconstruct = link_recon;
    gaugeParam.reconstruct_sloppy = link_recon_sloppy;
    loadGaugeQuda(longlink, &gaugeParam);
#endif
  }

  
  double time0 = -((double)clock()); // Start the timer
  
  unsigned long volume = Vh;
  unsigned long nflops=2*1187; //from MILC's CG routine
  double nrm2=0;
  double src2=0;
  int ret = 0;

  switch(testtype){
  case 0: //even
    volume = Vh;
    inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
    
    invertQuda(out->V(), in->V(), &inv_param);
    
    time0 += clock(); 
    time0 /= CLOCKS_PER_SEC;

#ifdef MULTI_GPU    
    matdagmat_mg4dir(ref, fatlink, longlink, ghost_fatlink, ghost_longlink, 
		     out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, QUDA_EVEN_PARITY);
#else
    matdagmat(ref->V(), fatlink, longlink, out->V(), mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), QUDA_EVEN_PARITY);
#endif
    
    mxpy(in->V(), ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
    nrm2 = norm_2(ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
    src2 = norm_2(in->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);

    {
      double sol = norm_2(out->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
      double refe = norm_2(ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
    }
    break;

  case 1: //odd
	
    volume = Vh;    
    inv_param.matpc_type = QUDA_MATPC_ODD_ODD;
    invertQuda(out->V(), in->V(), &inv_param);	
    time0 += clock(); // stop the timer
    time0 /= CLOCKS_PER_SEC;
    
#ifdef MULTI_GPU
    matdagmat_mg4dir(ref, fatlink, longlink, ghost_fatlink, ghost_longlink, 
		     out, mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, QUDA_ODD_PARITY);
#else
    matdagmat(ref->V(), fatlink, longlink, out->V(), mass, 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), QUDA_ODD_PARITY);	
#endif
    mxpy(in->V(), ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
    nrm2 = norm_2(ref->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
    src2 = norm_2(in->V(), Vh*mySpinorSiteSize, inv_param.cpu_prec);
	
    break;
    
  case 2: //full spinor

    errorQuda("full spinor not supported\n");
    break;
    
  case 3: //multi mass CG, even
  case 4:
  case 5:
  case 6:

#define NUM_OFFSETS 7
        
    nflops = 2*(1205 + 15* NUM_OFFSETS); //from MILC's multimass CG routine
    double masses[NUM_OFFSETS] ={5.05, 1.23, 2.64, 2.33, 2.70, 2.77, 2.81};
    double offsets[NUM_OFFSETS];	
    int num_offsets =NUM_OFFSETS;
    void* outArray[NUM_OFFSETS];
    int len;
    
    cpuColorSpinorField* spinorOutArray[NUM_OFFSETS];
    spinorOutArray[0] = out;    
    for(int i=1;i < num_offsets; i++){
      spinorOutArray[i] = new cpuColorSpinorField(csParam);       
    }
    
    for(int i=0;i < num_offsets; i++){
      outArray[i] = spinorOutArray[i]->V();
    }

    for (int i=0; i< num_offsets;i++){
      offsets[i] = 4*masses[i]*masses[i];
    }
    
    len=Vh;
    volume = Vh;      

    if (testtype == 3 || testtype == 6){
      inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;      
    } else if (testtype == 4){
      inv_param.matpc_type = QUDA_MATPC_ODD_ODD;      
    }else { //testtype ==5
      errorQuda("test 5 not supported\n");
    }
    
    double residue_sq;
    if (testtype == 6){
      invertMultiShiftQudaMixed(outArray, in->V(), &inv_param, offsets, num_offsets, &residue_sq);
    }else{      
      invertMultiShiftQuda(outArray, in->V(), &inv_param, offsets, num_offsets, &residue_sq);	
    }
    cudaThreadSynchronize();
    printfQuda("Final residue squred =%g\n", residue_sq);
    time0 += clock(); // stop the timer
    time0 /= CLOCKS_PER_SEC;
    
    printfQuda("done: total time = %g secs, %i iter / %g secs = %g gflops, \n", 
	       time0, inv_param.iter, inv_param.secs,
	       inv_param.gflops/inv_param.secs);
    
    
    printfQuda("checking the solution\n");
    QudaParity parity;
    if (inv_param.solve_type == QUDA_NORMEQ_SOLVE){
      //parity = QUDA_EVENODD_PARITY;
      errorQuda("full parity not supported\n");
    }else if (inv_param.matpc_type == QUDA_MATPC_EVEN_EVEN){
      parity = QUDA_EVEN_PARITY;
    }else if (inv_param.matpc_type == QUDA_MATPC_ODD_ODD){
      parity = QUDA_ODD_PARITY;
    }else{
      errorQuda("ERROR: invalid spinor parity \n");
      exit(1);
    }
    for(int i=0;i < num_offsets;i++){
      printfQuda("%dth solution: mass=%f, ", i, masses[i]);
#ifdef MULTI_GPU
      matdagmat_mg4dir(ref, fatlink, longlink, ghost_fatlink, ghost_longlink, 
		       spinorOutArray[i], masses[i], 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp, parity);
#else
      matdagmat(ref->V(), fatlink, longlink, outArray[i], masses[i], 0, inv_param.cpu_prec, gaugeParam.cpu_prec, tmp->V(), parity);
#endif
      mxpy(in->V(), ref->V(), len*mySpinorSiteSize, inv_param.cpu_prec);
      double nrm2 = norm_2(ref->V(), len*mySpinorSiteSize, inv_param.cpu_prec);
      double src2 = norm_2(in->V(), len*mySpinorSiteSize, inv_param.cpu_prec);
      
      printfQuda("relative residual, requested = %g, actual = %g\n", inv_param.tol, sqrt(nrm2/src2));

      //emperical, if the cpu residue is more than 2 order the target accuracy, the it fails to converge
      if (sqrt(nrm2/src2) > 100*inv_param.tol){
	ret |=1;
      }
    }

    if (ret ==1){
      errorQuda("Converge failed!\n");
    }

    for(int i=1; i < num_offsets;i++){
      delete spinorOutArray[i];
    }

    
  }//switch
    

  if (testtype <=2){

    printfQuda("Relative residual, requested = %g, actual = %g\n", inv_param.tol, sqrt(nrm2/src2));
	
    printfQuda("done: total time = %g secs, %i iter / %g secs = %g gflops, \n", 
	       time0, inv_param.iter, inv_param.secs,
	       inv_param.gflops/inv_param.secs);
    
    //emperical, if the cpu residue is more than 2 order the target accuracy, the it fails to converge
    if (sqrt(nrm2/src2) > 100*inv_param.tol){
      ret = 1;
      errorQuda("Convergence failed!\n");
    }
  }

  end();
  return ret;
}
Ejemplo n.º 28
0
void init(int argc, char **argv) {


  kernelPackT = false; // Set true for kernel T face packing
  cuda_prec= prec;

  gauge_param = newQudaGaugeParam();
  inv_param = newQudaInvertParam();

  gauge_param.X[0] = xdim;
  gauge_param.X[1] = ydim;
  gauge_param.X[2] = zdim;
  gauge_param.X[3] = tdim;
  setDims(gauge_param.X);

  gauge_param.anisotropy = 1.0;

  gauge_param.type = QUDA_WILSON_LINKS;
  gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER;
  gauge_param.t_boundary = QUDA_PERIODIC_T;

  gauge_param.cpu_prec = cpu_prec;
  gauge_param.cuda_prec = cuda_prec;
  gauge_param.reconstruct = link_recon;
  gauge_param.reconstruct_sloppy = link_recon;
  gauge_param.cuda_prec_sloppy = cuda_prec;
  gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;

  inv_param.kappa = 0.1;

  if (dslash_type == QUDA_TWISTED_MASS_DSLASH) {
    inv_param.mu = 0.01;
    inv_param.twist_flavor = QUDA_TWIST_MINUS;
  }

  inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
  inv_param.dagger = dagger;

  inv_param.cpu_prec = cpu_prec;
  if (inv_param.cpu_prec != gauge_param.cpu_prec) 
    errorQuda("Gauge and spinor cpu precisions must match");

  inv_param.cuda_prec = cuda_prec;

#ifndef MULTI_GPU // free parameter for single GPU
  gauge_param.ga_pad = 0;
#else // must be this one c/b face for multi gpu
  int x_face_size = gauge_param.X[1]*gauge_param.X[2]*gauge_param.X[3]/2;
  int y_face_size = gauge_param.X[0]*gauge_param.X[2]*gauge_param.X[3]/2;
  int z_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[3]/2;
  int t_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[2]/2;
  int pad_size =MAX(x_face_size, y_face_size);
  pad_size = MAX(pad_size, z_face_size);
  pad_size = MAX(pad_size, t_face_size);
  gauge_param.ga_pad = pad_size;    
#endif
  inv_param.sp_pad = 0;
  inv_param.cl_pad = 0;

  //inv_param.sp_pad = 24*24*24;
  //inv_param.cl_pad = 24*24*24;

  inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // test code only supports DeGrand-Rossi Basis
  inv_param.dirac_order = QUDA_DIRAC_ORDER;

  if (test_type == 2) {
    inv_param.solution_type = QUDA_MAT_SOLUTION;
  } else {
    inv_param.solution_type = QUDA_MATPC_SOLUTION;
  }

  inv_param.dslash_type = dslash_type;

  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
    inv_param.clover_cpu_prec = cpu_prec;
    inv_param.clover_cuda_prec = cuda_prec;
    inv_param.clover_cuda_prec_sloppy = inv_param.clover_cuda_prec;
    inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER;
    //if (test_type > 0) {
      hostClover = malloc(V*cloverSiteSize*inv_param.clover_cpu_prec);
      hostCloverInv = hostClover; // fake it
      /*} else {
      hostClover = NULL;
      hostCloverInv = malloc(V*cloverSiteSize*inv_param.clover_cpu_prec);
      }*/
  } else if (dslash_type == QUDA_TWISTED_MASS_DSLASH) {

  }

  //inv_param.verbosity = QUDA_VERBOSE;

  // construct input fields
  for (int dir = 0; dir < 4; dir++) hostGauge[dir] = malloc(V*gaugeSiteSize*gauge_param.cpu_prec);

  ColorSpinorParam csParam;
  
  csParam.fieldLocation = QUDA_CPU_FIELD_LOCATION;
  csParam.nColor = 3;
  csParam.nSpin = 4;
  if (dslash_type == QUDA_TWISTED_MASS_DSLASH) {
    csParam.twistFlavor = inv_param.twist_flavor;
  }
  csParam.nDim = 4;
  for (int d=0; d<4; d++) csParam.x[d] = gauge_param.X[d];
  csParam.precision = inv_param.cpu_prec;
  csParam.pad = 0;
  if (test_type < 2) {
    csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
    csParam.x[0] /= 2;
  } else {
    csParam.siteSubset = QUDA_FULL_SITE_SUBSET;
  }    
  csParam.siteOrder = QUDA_EVEN_ODD_SITE_ORDER;
  csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER;
  csParam.gammaBasis = inv_param.gamma_basis; 
  csParam.create = QUDA_ZERO_FIELD_CREATE;

  //csParam.verbose = QUDA_DEBUG_VERBOSE;

  spinor = new cpuColorSpinorField(csParam);
  spinorOut = new cpuColorSpinorField(csParam);
  spinorRef = new cpuColorSpinorField(csParam);

  csParam.siteSubset = QUDA_FULL_SITE_SUBSET;
  csParam.x[0] = gauge_param.X[0];
  
  printfQuda("Randomizing fields... ");

  if (strcmp(latfile,"")) {  // load in the command line supplied gauge field
    read_gauge_field(latfile, hostGauge, gauge_param.cpu_prec, gauge_param.X, argc, argv);
    construct_gauge_field(hostGauge, 2, gauge_param.cpu_prec, &gauge_param);
  } else { // else generate a random SU(3) field
    construct_gauge_field(hostGauge, 1, gauge_param.cpu_prec, &gauge_param);
  }

  spinor->Source(QUDA_RANDOM_SOURCE);

  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
    double norm = 0.0; // clover components are random numbers in the range (-norm, norm)
    double diag = 1.0; // constant added to the diagonal

    if (test_type == 2) {
      construct_clover_field(hostClover, norm, diag, inv_param.clover_cpu_prec);
    } else {
      construct_clover_field(hostCloverInv, norm, diag, inv_param.clover_cpu_prec);
    }
  }
  printfQuda("done.\n"); fflush(stdout);
  
  initQuda(device);

  printfQuda("Sending gauge field to GPU\n");
  loadGaugeQuda(hostGauge, &gauge_param);

  if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
    printfQuda("Sending clover field to GPU\n");
    loadCloverQuda(hostClover, hostCloverInv, &inv_param);
    //clover = cudaCloverPrecise;
  }

  if (!transfer) {
    csParam.fieldLocation = QUDA_CUDA_FIELD_LOCATION;
    csParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS;
    csParam.pad = inv_param.sp_pad;
    csParam.precision = inv_param.cuda_prec;
    if (csParam.precision == QUDA_DOUBLE_PRECISION ) {
      csParam.fieldOrder = QUDA_FLOAT2_FIELD_ORDER;
    } else {
      /* Single and half */
      csParam.fieldOrder = QUDA_FLOAT4_FIELD_ORDER;
    }
 
    if (test_type < 2) {
      csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
      csParam.x[0] /= 2;
    }

    printfQuda("Creating cudaSpinor\n");
    cudaSpinor = new cudaColorSpinorField(csParam);
    printfQuda("Creating cudaSpinorOut\n");
    cudaSpinorOut = new cudaColorSpinorField(csParam);

    if (test_type == 2) csParam.x[0] /= 2;

    csParam.siteSubset = QUDA_PARITY_SITE_SUBSET;
    tmp1 = new cudaColorSpinorField(csParam);
    if (dslash_type == QUDA_CLOVER_WILSON_DSLASH ||
	dslash_type == QUDA_TWISTED_MASS_DSLASH) {
      tmp2 = new cudaColorSpinorField(csParam);
    }

    printfQuda("Sending spinor field to GPU\n");
    *cudaSpinor = *spinor;

    std::cout << "Source: CPU = " << norm2(*spinor) << ", CUDA = " << 
      norm2(*cudaSpinor) << std::endl;

    bool pc = (test_type != 2);
    DiracParam diracParam;
    setDiracParam(diracParam, &inv_param, pc);
    diracParam.verbose = QUDA_VERBOSE;
    diracParam.tmp1 = tmp1;
    diracParam.tmp2 = tmp2;
    
    dirac = Dirac::create(diracParam);
  } else {
    std::cout << "Source: CPU = " << norm2(*spinor) << std::endl;
  }
    
}
Ejemplo n.º 29
0
int main(int argc, char **argv)
{

  for (int i =1;i < argc; i++){    
    if(process_command_line_option(argc, argv, &i) == 0){
      continue;
    }  
    
    fprintf(stderr, "ERROR: Invalid option:%s\n", argv[i]);
    usage(argv);
  }


  initCommsQuda(argc, argv, gridsize_from_cmdline, 4);

  display_test_info();

  init(argc, argv);

  float spinorGiB = (float)Vh*spinorSiteSize*inv_param.cuda_prec / (1 << 30);
  printfQuda("\nSpinor mem: %.3f GiB\n", spinorGiB);
  printfQuda("Gauge mem: %.3f GiB\n", gauge_param.gaugeGiB);
  
  int attempts = 1;
  dslashRef();
  for (int i=0; i<attempts; i++) {

    if (tune) { // warm-up run
      printfQuda("Tuning...\n");
      setDslashTuning(QUDA_TUNE_YES, QUDA_VERBOSE);
      dslashCUDA(1);
    }
    printfQuda("Executing %d kernel loops...\n", loops);
    dirac->Flops();
    double secs = dslashCUDA(loops);
    printfQuda("done.\n\n");

#ifdef DSLASH_PROFILING
    printDslashProfile();
#endif

    if (!transfer) *spinorOut = *cudaSpinorOut;

    // print timing information
    printfQuda("%fms per loop\n", 1000*secs);
    
    unsigned long long flops = 0;
    if (!transfer) flops = dirac->Flops();
    int spinor_floats = test_type ? 2*(7*24+24)+24 : 7*24+24;
    if (inv_param.cuda_prec == QUDA_HALF_PRECISION) 
      spinor_floats += test_type ? 2*(7*2 + 2) + 2 : 7*2 + 2; // relative size of norm is twice a short
    int gauge_floats = (test_type ? 2 : 1) * (gauge_param.gauge_fix ? 6 : 8) * gauge_param.reconstruct;
    if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
      gauge_floats += test_type ? 72*2 : 72;
    }
    printfQuda("GFLOPS = %f\n", 1.0e-9*flops/secs);
    printfQuda("GB/s = %f\n\n", 
	       Vh*(spinor_floats+gauge_floats)*inv_param.cuda_prec/((secs/loops)*1e+9));
    
    if (!transfer) {
      double norm2_cpu = norm2(*spinorRef);
      double norm2_cuda= norm2(*cudaSpinorOut);
      double norm2_cpu_cuda= norm2(*spinorOut);
      printfQuda("Results: CPU = %f, CUDA=%f, CPU-CUDA = %f\n", norm2_cpu, norm2_cuda, norm2_cpu_cuda);
    } else {
      double norm2_cpu = norm2(*spinorRef);
      double norm2_cpu_cuda= norm2(*spinorOut);
      printfQuda("Result: CPU = %f, CPU-QUDA = %f\n",  norm2_cpu, norm2_cpu_cuda);
    }
    
    cpuColorSpinorField::Compare(*spinorRef, *spinorOut);
  }    
  end();

  endCommsQuda();
}
Ejemplo n.º 30
0
/*! 
 *
 * Generic version of the multi-shift solver. Should work for
 * most fermions. Note, offset[0] is not folded into the mass parameter 
 */
void invertMultiShiftQuda(void **_hp_x, void *_hp_b, QudaInvertParam *param,
			  double* offsets, int num_offsets, double* residue_sq)
{
  // check the gauge fields have been created
  cudaGaugeField *cudaGauge = checkGauge(param);
  checkInvertParam(param);

  param->num_offset = num_offsets;
  if (param->num_offset > QUDA_MAX_MULTI_SHIFT) 
    errorQuda("Number of shifts %d requested greater than QUDA_MAX_MULTI_SHIFT %d", 
	      param->num_offset, QUDA_MAX_MULTI_SHIFT);
  for (int i=0; i<param->num_offset; i++) {
    param->offset[i] = offsets[i];
    param->tol_offset[i] = residue_sq[i];
  }

  verbosity = param->verbosity;

  // Are we doing a preconditioned solve */
  /* What does NormEq solve mean in the shifted case? 
   */
  if (param->solve_type != QUDA_NORMEQ_PC_SOLVE &&
      param->solve_type != QUDA_NORMEQ_SOLVE) { 
    errorQuda("Direct solve_type is not supported in invertMultiShiftQuda()\n");
  }

  bool pc_solve = (param->solve_type == QUDA_NORMEQ_PC_SOLVE);

  // In principle one can do a MATPC Solution for a hermitian M_pc
  // In practice most of the time I guess one will do a M^\dagger_pc M_pc solution.
  bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION ||
		      param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION );

  // No of GiB in a checkerboard of a spinor
  param->spinorGiB = cudaGauge->VolumeCB() * spinorSiteSize;
  if( !pc_solve) param->spinorGiB *= 2; // Double volume for non PC solve
  
  // **** WARNING *** this may not match implementation... 
  if( param->inv_type == QUDA_CG_INVERTER ) { 
    // CG-M needs 5 vectors for the smallest shift + 2 for each additional shift
    param->spinorGiB *= (5 + 2*(param->num_offset-1))/(double)(1<<30);
  }
  else {
    // BiCGStab-M needs 7 for the original shift + 2 for each additional shift + 1 auxiliary
    // (Jegerlehner hep-lat/9612014 eq (3.13)
    param->spinorGiB *= (7 + 2*(param->num_offset-1))/(double)(1<<30);
  }

  // Timing and FLOP counters
  param->secs = 0;
  param->gflops = 0;
  param->iter = 0;
  
  // Find the smallest shift and its offset.
  double low_offset = param->offset[0];
  int low_index = 0;
  for (int i=1;i < param->num_offset;i++){
    if (param->offset[i] < low_offset){
      low_offset = param->offset[i];
      low_index = i;
    }
  }
  
  // Host pointers for x, take a copy of the input host pointers
  void** hp_x;
  hp_x = new void* [ param->num_offset ];

  void* hp_b = _hp_b;
  for(int i=0;i < param->num_offset;i++){
    hp_x[i] = _hp_x[i];
  }
  
  // Now shift things so that the vector with the smallest shift 
  // is in the first position of the array
  if (low_index != 0){
    void* tmp = hp_x[0];
    hp_x[0] = hp_x[low_index] ;
    hp_x[low_index] = tmp;
    
    double tmp1 = param->offset[0];
    param->offset[0]= param->offset[low_index];
    param->offset[low_index] =tmp1;
  }
    
  // Create the matrix.
  // The way this works is that createDirac will create 'd' and 'dSloppy'
  // which are global. We then grab these with references...
  //
  // Balint: Isn't there a  nice construction pattern we could use here? This is 
  // expedient but yucky.
  DiracParam diracParam; 
  if (param->dslash_type == QUDA_ASQTAD_DSLASH){
    param->mass = sqrt(param->offset[0]/4);  
  }
  createDirac(diracParam, *param, pc_solve);
  Dirac &dirac = *d;
  Dirac &diracSloppy = *dSloppy;

  cpuColorSpinorField *h_b = NULL; // Host RHS
  cpuColorSpinorField **h_x = NULL;
  cudaColorSpinorField *b = NULL;   // Cuda RHS
  cudaColorSpinorField **x = NULL;  // Cuda Solutions

  // Grab the dimension array of the input gauge field.
  const int *X = ( param->dslash_type == QUDA_ASQTAD_DSLASH ) ? 
    gaugeFatPrecise->X() : gaugeFatPrecise->X();

  // Wrap CPU host side pointers
  // 
  // Balint: This creates a ColorSpinorParam struct, from the host data pointer, 
  // the definitions in param, the dimensions X, and whether the solution is on 
  // a checkerboard instruction or not. These can then be used as 'instructions' 
  // to create the actual colorSpinorField
  ColorSpinorParam cpuParam(hp_b, *param, X, pc_solution);
  h_b = new cpuColorSpinorField(cpuParam);

  h_x = new cpuColorSpinorField* [ param->num_offset ]; // DYNAMIC ALLOCATION
  for(int i=0; i < param->num_offset; i++) { 
    cpuParam.v = hp_x[i];
    h_x[i] = new cpuColorSpinorField(cpuParam);
  }

  // Now I need a colorSpinorParam for the device
  ColorSpinorParam cudaParam(cpuParam, *param);
  // This setting will download a host vector
  cudaParam.create = QUDA_COPY_FIELD_CREATE;
  b = new cudaColorSpinorField(*h_b, cudaParam); // Creates b and downloads h_b to it

  // Create the solution fields filled with zero
  x = new cudaColorSpinorField* [ param->num_offset ];
  cudaParam.create = QUDA_ZERO_FIELD_CREATE;
  for(int i=0; i < param->num_offset; i++) { 
    x[i] = new cudaColorSpinorField(cudaParam);
  }

  // Check source norms
  if( param->verbosity >= QUDA_VERBOSE ) {
    double nh_b = norm2(*h_b);
    double nb = norm2(*b);
    printfQuda("Source: CPU= %f, CUDA copy = %f\n", nh_b,nb);
  }

  // tune the Dirac Kernel
  tuneDirac(*param, pc_solution ? *(x[0]) : (x[0])->Even());
  
  
  massRescale(param->dslash_type, diracParam.kappa, param->solution_type, param->mass_normalization, *b);
  double *rescaled_shifts = new double [param->num_offset];
  for(int i=0; i < param->num_offset; i++){ 
    rescaled_shifts[i] = param->offset[i];
    massRescaleCoeff(param->dslash_type, diracParam.kappa, param->solution_type, param->mass_normalization, rescaled_shifts[i]);
  }

  {
    DiracMdagM m(dirac), mSloppy(diracSloppy);
    MultiShiftCG cg_m(m, mSloppy, *param);
    cg_m(x, *b);  
  }

  delete [] rescaled_shifts;

  for(int i=0; i < param->num_offset; i++) { 
    x[i]->saveCPUSpinorField(*h_x[i]);
  }

  for(int i=0; i < param->num_offset; i++){ 
    delete h_x[i];
    delete x[i];
  }

  delete h_b;
  delete b;

  delete [] h_x;
  delete [] x;

  delete [] hp_x;

  if (!param->preserve_dirac) {
    delete d; d =NULL;
    delete dSloppy; dSloppy = NULL;
    delete dPre; dPre = NULL;
    diracCreation = false;
    diracTune = false;
  }  

  return;
}