Exemple #1
0
// Tune this operator: delegate to DiracWilson::Tune first, then benchmark the
// clover application and record the result in tuneClover.
void DiracClover::Tune(cudaColorSpinorField &out, const cudaColorSpinorField &in,
		       const cudaColorSpinorField &x) {

  // Let the Wilson implementation run its own tuning pass.
  DiracWilson::Tune(out, in, x);

  // Tuning mode is enabled only around the clover benchmark below.
  setDslashTuning(QUDA_TUNE_YES);

  // The tuner is a temporary, so it is destroyed at the end of this statement,
  // i.e. before tuning mode is switched back off.
  TuneDiracClover(*this, out, in).Benchmark(tuneClover);

  setDslashTuning(QUDA_TUNE_NO);
}
// Run the Dslash benchmark and compare the device result with the host
// reference.
//
// For every attempt this optionally performs a tuning warm-up run, times
// `loops` applications through dslashCUDA(), prints GFLOPS and an estimated
// GB/s figure, prints the squared norms of the reference and computed spinors,
// and finally compares the two host fields.
//
// Returns the value produced by cpuColorSpinorField::Compare() on the last
// attempt (0 if no attempt ran).
static int dslashTest()
{
  int accuracy_level = 0;

  init();

  const int attempts = 1;

  for (int i = 0; i < attempts; i++) {

    if (tune) { // warm-up run
      printfQuda("Tuning...\n");
      setDslashTuning(QUDA_TUNE_YES, QUDA_VERBOSE);
      dslashCUDA(1);
    }
    printfQuda("Executing %d kernel loops...", loops);
    double secs = dslashCUDA(loops);

#ifdef DSLASH_PROFILING
    printDslashProfile();
#endif

    // Without transfer mode the device result still has to be copied to the
    // host field that is compared against the reference below.
    if (!transfer) *spinorOut = *cudaSpinorOut;

    printfQuda("\n%fms per loop\n", 1000*secs);
    staggeredDslashRef();

    unsigned long long flops = dirac->Flops();

    // Per-site traffic estimate.  The precision value doubles as the size in
    // bytes of one float, which is how the link term has always been computed.
    int link_floats = 8*gaugeParam.reconstruct+8*18;
    int spinor_floats = 8*6*2 + 6;
    const int link_float_size = prec;
    // This used to be 0, which silently dropped all spinor traffic from the
    // bandwidth estimate even though spinor_floats was computed and scaled.
    const int spinor_float_size = prec;

    // A non-zero test_type doubles both counts.
    link_floats = test_type ? (2*link_floats) : link_floats;
    spinor_floats = test_type ? (2*spinor_floats) : spinor_floats;

    int bytes_for_one_site = link_floats * link_float_size + spinor_floats * spinor_float_size;
    // Extra bytes in half precision -- presumably the 4-byte norms that
    // accompany each half-precision spinor; TODO confirm.
    if (prec == QUDA_HALF_PRECISION) bytes_for_one_site += (8*2 + 1)*4;

    printfQuda("GFLOPS = %f\n", 1.0e-9*flops/secs);
    // 1.0* promotes the product to double before it can overflow an int.
    printfQuda("GB/s = %f\n\n", 1.0*Vh*bytes_for_one_site/((secs/loops)*1e+9));

    if (!transfer) {
      double spinor_ref_norm2 = norm2(*spinorRef);
      double cuda_spinor_out_norm2 =  norm2(*cudaSpinorOut);
      double spinor_out_norm2 =  norm2(*spinorOut);
      printfQuda("Results: CPU=%f, CUDA=%f, CPU-CUDA=%f\n",  spinor_ref_norm2, cuda_spinor_out_norm2,
		 spinor_out_norm2);
    } else {
      double spinor_ref_norm2 = norm2(*spinorRef);
      double spinor_out_norm2 =  norm2(*spinorOut);
      // Trailing newline added so later output does not run onto this line,
      // matching the non-transfer branch above.
      printfQuda("Result: CPU=%f , CPU-CUDA=%f\n", spinor_ref_norm2, spinor_out_norm2);
    }

    accuracy_level = cpuColorSpinorField::Compare(*spinorRef, *spinorOut);
  }
  end();

  return accuracy_level;
}
Exemple #3
0
// Benchmark the domain-wall Dslash and DslashXpay tuners.  For each of the two
// tuners, slot 0 of the tuning array is always benchmarked, followed by slot
// dim+1 for every dimension dim in [0,4) that commDimPartitioned() reports as
// partitioned.
void DiracDomainWall::Tune(cudaColorSpinorField &out, const cudaColorSpinorField &in,
			   const cudaColorSpinorField &x) {

  // Tuning mode stays enabled for both benchmark passes.
  setDslashTuning(QUDA_TUNE_YES);

  { // Dslash pass
    TuneDiracDomainWallDslash tuner(*this, out, in);
    tuner.Benchmark(tuneDslash[0]);
    for (int dim = 0; dim < 4; ++dim) {
      if (!commDimPartitioned(dim)) continue;
      tuner.Benchmark(tuneDslash[dim+1]);
    }
  }

  { // DslashXpay pass (additionally takes x)
    TuneDiracDomainWallDslashXpay tuner(*this, out, in, x);
    tuner.Benchmark(tuneDslashXpay[0]);
    for (int dim = 0; dim < 4; ++dim) {
      if (!commDimPartitioned(dim)) continue;
      tuner.Benchmark(tuneDslashXpay[dim+1]);
    }
  }

  setDslashTuning(QUDA_TUNE_NO);
}
// Test driver: computes the host reference once, then for each attempt
// optionally runs a tuning warm-up, times the device Dslash, reports
// performance figures and compares the result against the reference.
// argc/argv are not used by this driver.
int main(int argc, char **argv)
{
  init();

  // The precision value itself is the per-element size in bytes (it is used
  // that way in the GB/s estimate below).  Taking sizeof() of the field would
  // give the size of the enum type instead, independent of the precision
  // actually selected.
  float spinorGiB = (float)Vh*Ls*spinorSiteSize*inv_param.cpu_prec / (1 << 30);
  printf("\nSpinor mem: %.3f GiB\n", spinorGiB);
  printf("Gauge mem: %.3f GiB\n", gauge_param.gaugeGiB);

  int attempts = 1;
  dslashRef();

  for (int i=0; i<attempts; i++) {

    if (tune) { // warm-up run
      printfQuda("Tuning...\n");
      setDslashTuning(QUDA_TUNE_YES, QUDA_VERBOSE);
      dslashCUDA(1);
    }

    double secs = dslashCUDA();

    // Without transfer mode the device result still has to be copied into the
    // host field that is compared against the reference below.
    if (!transfer) *spinorOut = *cudaSpinorOut;

    // print timing information
    printf("%fms per loop\n", 1000*secs);

    // Flops are only queried when not in transfer mode; otherwise 0 is reported.
    unsigned long long flops = 0;
    if (!transfer) flops = dirac->Flops();

    // Per-site float counts for the bandwidth estimate; a non-zero test_type
    // selects the larger variant of each count.
    int spinor_floats = test_type ? 2*(9*24+24)+24 : 9*24+24;
    if (inv_param.cuda_prec == QUDA_HALF_PRECISION)
      spinor_floats += test_type ? 2*(9*2 + 2) + 2 : 9*2 + 2; // relative size of norm is twice a short
    int gauge_floats = (test_type ? 2 : 1) * (gauge_param.gauge_fix ? 6 : 8) * gauge_param.reconstruct;

    printfQuda("GFLOPS = %f\n", 1.0e-9*flops/secs);
    // The leading cast keeps the byte-count product in floating point.
    printfQuda("GB/s = %f\n\n",
	       (float)Vh*Ls*(spinor_floats+gauge_floats)*inv_param.cuda_prec/((secs/loops)*1e+9));


    if (!transfer) {
      std::cout << "Results: CPU = " << norm2(*spinorRef) << ", CUDA = " << norm2(*cudaSpinorOut) <<
	", CPU-CUDA = " << norm2(*spinorOut) << std::endl;
    } else {
      std::cout << "Result: CPU = " << norm2(*spinorRef) << ", CPU-CUDA = " << norm2(*spinorOut) << std::endl;
    }

    cpuColorSpinorField::Compare(*spinorRef, *spinorOut);
  }
  end();
}
Exemple #5
0
// Tune this operator: delegate to DiracClover::Tune first, then benchmark the
// Dslash and DslashXpay tuners.  When OVERLAP_COMMS is defined the "Face"
// block parameters are benchmarked as well.
void DiracCloverPC::Tune(cudaColorSpinorField &out, const cudaColorSpinorField &in,
		       const cudaColorSpinorField &x) {
  // Run the DiracClover tuning pass before the kernels handled here.
  DiracClover::Tune(out, in, x);

  // Tuning mode stays enabled for both benchmark passes.
  setDslashTuning(QUDA_TUNE_YES);

  { // Dslash pass
    TuneDiracCloverDslash tuner(*this, out, in);
    tuner.Benchmark(blockDslash);
#ifdef OVERLAP_COMMS
    tuner.Benchmark(blockDslashFace);
#endif
  }

  { // DslashXpay pass (additionally takes x)
    TuneDiracCloverDslashXpay tuner(*this, out, in, x);
    tuner.Benchmark(blockDslashXpay);
#ifdef OVERLAP_COMMS
    tuner.Benchmark(blockDslashXpayFace);
#endif
  }

  setDslashTuning(QUDA_TUNE_NO);
}
// Test driver: parses the command line, initialises communications and the
// test state, computes the host reference once, then for each attempt
// optionally runs a tuning warm-up, times `loops` device Dslash applications,
// reports performance figures and compares the result against the reference.
int main(int argc, char **argv)
{

  // Every argument must be accepted by process_command_line_option();
  // anything else is reported and usage() is invoked.
  for (int i =1;i < argc; i++){
    if(process_command_line_option(argc, argv, &i) == 0){
      continue;
    }

    fprintf(stderr, "ERROR: Invalid option:%s\n", argv[i]);
    usage(argv);
  }


  initCommsQuda(argc, argv, gridsize_from_cmdline, 4);

  display_test_info();

  init(argc, argv);

  // The precision value is used directly as the per-element size in bytes.
  float spinorGiB = (float)Vh*spinorSiteSize*inv_param.cuda_prec / (1 << 30);
  printfQuda("\nSpinor mem: %.3f GiB\n", spinorGiB);
  printfQuda("Gauge mem: %.3f GiB\n", gauge_param.gaugeGiB);

  int attempts = 1;
  dslashRef();
  for (int i=0; i<attempts; i++) {

    if (tune) { // warm-up run
      printfQuda("Tuning...\n");
      setDslashTuning(QUDA_TUNE_YES, QUDA_VERBOSE);
      dslashCUDA(1);
    }
    printfQuda("Executing %d kernel loops...\n", loops);
    // Return value deliberately discarded.  NOTE(review): presumably this
    // resets the flop counter so the warm-up run is not counted -- confirm
    // against Dirac::Flops().
    dirac->Flops();
    double secs = dslashCUDA(loops);
    printfQuda("done.\n\n");

#ifdef DSLASH_PROFILING
    printDslashProfile();
#endif

    // Without transfer mode the device result still has to be copied into the
    // host field that is compared against the reference below.
    if (!transfer) *spinorOut = *cudaSpinorOut;

    // print timing information
    printfQuda("%fms per loop\n", 1000*secs);

    // Flops are only queried when not in transfer mode; otherwise 0 is reported.
    unsigned long long flops = 0;
    if (!transfer) flops = dirac->Flops();

    // Per-site float counts for the bandwidth estimate; a non-zero test_type
    // selects the larger variant of each count.
    int spinor_floats = test_type ? 2*(7*24+24)+24 : 7*24+24;
    if (inv_param.cuda_prec == QUDA_HALF_PRECISION)
      spinor_floats += test_type ? 2*(7*2 + 2) + 2 : 7*2 + 2; // relative size of norm is twice a short
    int gauge_floats = (test_type ? 2 : 1) * (gauge_param.gauge_fix ? 6 : 8) * gauge_param.reconstruct;
    if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) {
      gauge_floats += test_type ? 72*2 : 72;
    }
    printfQuda("GFLOPS = %f\n", 1.0e-9*flops/secs);
    // Promote to double before multiplying: as a pure integer product the
    // byte count can exceed the range of int on large local volumes.
    printfQuda("GB/s = %f\n\n",
	       1.0*Vh*(spinor_floats+gauge_floats)*inv_param.cuda_prec/((secs/loops)*1e+9));

    if (!transfer) {
      double norm2_cpu = norm2(*spinorRef);
      double norm2_cuda= norm2(*cudaSpinorOut);
      double norm2_cpu_cuda= norm2(*spinorOut);
      printfQuda("Results: CPU = %f, CUDA=%f, CPU-CUDA = %f\n", norm2_cpu, norm2_cuda, norm2_cpu_cuda);
    } else {
      double norm2_cpu = norm2(*spinorRef);
      double norm2_cpu_cuda= norm2(*spinorOut);
      printfQuda("Result: CPU = %f, CPU-QUDA = %f\n",  norm2_cpu, norm2_cpu_cuda);
    }

    cpuColorSpinorField::Compare(*spinorRef, *spinorOut);
  }
  end();

  endCommsQuda();
}