// Find the best block size parameters for the Dslash and DslashXpay kernels void DiracClover::Tune(cudaColorSpinorField &out, const cudaColorSpinorField &in, const cudaColorSpinorField &x) { DiracWilson::Tune(out, in, x); setDslashTuning(QUDA_TUNE_YES); { // Tune clover application TuneDiracClover cloverTune(*this, out, in); cloverTune.Benchmark(tuneClover); } setDslashTuning(QUDA_TUNE_NO); }
static int dslashTest() { int accuracy_level = 0; init(); int attempts = 1; for (int i=0; i<attempts; i++) { if (tune) { // warm-up run printfQuda("Tuning...\n"); setDslashTuning(QUDA_TUNE_YES, QUDA_VERBOSE); dslashCUDA(1); } printfQuda("Executing %d kernel loops...", loops); double secs = dslashCUDA(loops); #ifdef DSLASH_PROFILING printDslashProfile(); #endif if (!transfer) *spinorOut = *cudaSpinorOut; printfQuda("\n%fms per loop\n", 1000*secs); staggeredDslashRef(); unsigned long long flops = dirac->Flops(); int link_floats = 8*gaugeParam.reconstruct+8*18; int spinor_floats = 8*6*2 + 6; int link_float_size = prec; int spinor_float_size = 0; link_floats = test_type ? (2*link_floats) : link_floats; spinor_floats = test_type ? (2*spinor_floats) : spinor_floats; int bytes_for_one_site = link_floats * link_float_size + spinor_floats * spinor_float_size; if (prec == QUDA_HALF_PRECISION) bytes_for_one_site += (8*2 + 1)*4; printfQuda("GFLOPS = %f\n", 1.0e-9*flops/secs); printfQuda("GB/s = %f\n\n", 1.0*Vh*bytes_for_one_site/((secs/loops)*1e+9)); if (!transfer) { double spinor_ref_norm2 = norm2(*spinorRef); double cuda_spinor_out_norm2 = norm2(*cudaSpinorOut); double spinor_out_norm2 = norm2(*spinorOut); printfQuda("Results: CPU=%f, CUDA=%f, CPU-CUDA=%f\n", spinor_ref_norm2, cuda_spinor_out_norm2, spinor_out_norm2); } else { double spinor_ref_norm2 = norm2(*spinorRef); double spinor_out_norm2 = norm2(*spinorOut); printfQuda("Result: CPU=%f , CPU-CUDA=%f", spinor_ref_norm2, spinor_out_norm2); } accuracy_level = cpuColorSpinorField::Compare(*spinorRef, *spinorOut); } end(); return accuracy_level; }
// Find the best block size parameters for the Dslash and DslashXpay kernels void DiracDomainWall::Tune(cudaColorSpinorField &out, const cudaColorSpinorField &in, const cudaColorSpinorField &x) { setDslashTuning(QUDA_TUNE_YES); { // Tune Dslash TuneDiracDomainWallDslash dslashTune(*this, out, in); dslashTune.Benchmark(tuneDslash[0]); for (int i=0; i<4; i++) if (commDimPartitioned(i)) dslashTune.Benchmark(tuneDslash[i+1]); } { // Tune DslashXpay TuneDiracDomainWallDslashXpay dslashXpayTune(*this, out, in, x); dslashXpayTune.Benchmark(tuneDslashXpay[0]); for (int i=0; i<4; i++) if (commDimPartitioned(i)) dslashXpayTune.Benchmark(tuneDslashXpay[i+1]); } setDslashTuning(QUDA_TUNE_NO); }
int main(int argc, char **argv) { init(); float spinorGiB = (float)Vh*Ls*spinorSiteSize*sizeof(inv_param.cpu_prec) / (1 << 30); printf("\nSpinor mem: %.3f GiB\n", spinorGiB); printf("Gauge mem: %.3f GiB\n", gauge_param.gaugeGiB); int attempts = 1; dslashRef(); for (int i=0; i<attempts; i++) { if (tune) { // warm-up run printfQuda("Tuning...\n"); setDslashTuning(QUDA_TUNE_YES, QUDA_VERBOSE); dslashCUDA(1); } double secs = dslashCUDA(); if (!transfer) *spinorOut = *cudaSpinorOut; // print timing information printf("%fms per loop\n", 1000*secs); unsigned long long flops = 0; if (!transfer) flops = dirac->Flops(); int spinor_floats = test_type ? 2*(9*24+24)+24 : 9*24+24; if (inv_param.cuda_prec == QUDA_HALF_PRECISION) spinor_floats += test_type ? 2*(9*2 + 2) + 2 : 9*2 + 2; // relative size of norm is twice a short int gauge_floats = (test_type ? 2 : 1) * (gauge_param.gauge_fix ? 6 : 8) * gauge_param.reconstruct; printfQuda("GFLOPS = %f\n", 1.0e-9*flops/secs); printfQuda("GB/s = %f\n\n", (float)Vh*Ls*(spinor_floats+gauge_floats)*inv_param.cuda_prec/((secs/loops)*1e+9)); if (!transfer) { std::cout << "Results: CPU = " << norm2(*spinorRef) << ", CUDA = " << norm2(*cudaSpinorOut) << ", CPU-CUDA = " << norm2(*spinorOut) << std::endl; } else { std::cout << "Result: CPU = " << norm2(*spinorRef) << ", CPU-CUDA = " << norm2(*spinorOut) << std::endl; } cpuColorSpinorField::Compare(*spinorRef, *spinorOut); } end(); }
// Find the best block size parameters for the Dslash and DslashXpay kernels void DiracCloverPC::Tune(cudaColorSpinorField &out, const cudaColorSpinorField &in, const cudaColorSpinorField &x) { DiracClover::Tune(out, in, x); setDslashTuning(QUDA_TUNE_YES); { // Tune Dslash TuneDiracCloverDslash dslashTune(*this, out, in); dslashTune.Benchmark(blockDslash); #ifdef OVERLAP_COMMS dslashTune.Benchmark(blockDslashFace); #endif } { // Tune DslashXpay TuneDiracCloverDslashXpay dslashXpayTune(*this, out, in, x); dslashXpayTune.Benchmark(blockDslashXpay); #ifdef OVERLAP_COMMS dslashXpayTune.Benchmark(blockDslashXpayFace); #endif } setDslashTuning(QUDA_TUNE_NO); }
int main(int argc, char **argv) { for (int i =1;i < argc; i++){ if(process_command_line_option(argc, argv, &i) == 0){ continue; } fprintf(stderr, "ERROR: Invalid option:%s\n", argv[i]); usage(argv); } initCommsQuda(argc, argv, gridsize_from_cmdline, 4); display_test_info(); init(argc, argv); float spinorGiB = (float)Vh*spinorSiteSize*inv_param.cuda_prec / (1 << 30); printfQuda("\nSpinor mem: %.3f GiB\n", spinorGiB); printfQuda("Gauge mem: %.3f GiB\n", gauge_param.gaugeGiB); int attempts = 1; dslashRef(); for (int i=0; i<attempts; i++) { if (tune) { // warm-up run printfQuda("Tuning...\n"); setDslashTuning(QUDA_TUNE_YES, QUDA_VERBOSE); dslashCUDA(1); } printfQuda("Executing %d kernel loops...\n", loops); dirac->Flops(); double secs = dslashCUDA(loops); printfQuda("done.\n\n"); #ifdef DSLASH_PROFILING printDslashProfile(); #endif if (!transfer) *spinorOut = *cudaSpinorOut; // print timing information printfQuda("%fms per loop\n", 1000*secs); unsigned long long flops = 0; if (!transfer) flops = dirac->Flops(); int spinor_floats = test_type ? 2*(7*24+24)+24 : 7*24+24; if (inv_param.cuda_prec == QUDA_HALF_PRECISION) spinor_floats += test_type ? 2*(7*2 + 2) + 2 : 7*2 + 2; // relative size of norm is twice a short int gauge_floats = (test_type ? 2 : 1) * (gauge_param.gauge_fix ? 6 : 8) * gauge_param.reconstruct; if (dslash_type == QUDA_CLOVER_WILSON_DSLASH) { gauge_floats += test_type ? 72*2 : 72; } printfQuda("GFLOPS = %f\n", 1.0e-9*flops/secs); printfQuda("GB/s = %f\n\n", Vh*(spinor_floats+gauge_floats)*inv_param.cuda_prec/((secs/loops)*1e+9)); if (!transfer) { double norm2_cpu = norm2(*spinorRef); double norm2_cuda= norm2(*cudaSpinorOut); double norm2_cpu_cuda= norm2(*spinorOut); printfQuda("Results: CPU = %f, CUDA=%f, CPU-CUDA = %f\n", norm2_cpu, norm2_cuda, norm2_cpu_cuda); } else { double norm2_cpu = norm2(*spinorRef); double norm2_cpu_cuda= norm2(*spinorOut); printfQuda("Result: CPU = %f, CPU-QUDA = %f\n", norm2_cpu, norm2_cpu_cuda); } cpuColorSpinorField::Compare(*spinorRef, *spinorOut); } end(); endCommsQuda(); }