Esempio n. 1
0
void Tuner(int argc, char* argv[]) {
  constexpr auto kSeed = 42; // fixed seed for reproducibility

  // Sets the parameters and platform/device for which to tune (command-line options)
  auto command_line_args = RetrieveCommandLineArguments(argc, argv);
  auto help = std::string{"* Options given/available:\n"};
  auto args = Arguments<T>{};
  args.platform_id = GetArgument(command_line_args, help, kArgPlatform, ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0}));
  args.device_id   = GetArgument(command_line_args, help, kArgDevice, ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0}));
  args.precision   = GetArgument(command_line_args, help, kArgPrecision, Precision::kSingle);
  for (auto &o: C::GetOptions()) {
    if (o == kArgM)        { args.m        = GetArgument(command_line_args, help, kArgM, C::DefaultM()); }
    if (o == kArgN)        { args.n        = GetArgument(command_line_args, help, kArgN, C::DefaultN()); }
    if (o == kArgK)        { args.k        = GetArgument(command_line_args, help, kArgK, C::DefaultK()); }
    if (o == kArgAlpha)    { args.alpha    = GetArgument(command_line_args, help, kArgAlpha, GetScalar<T>()); }
    if (o == kArgBeta)     { args.beta     = GetArgument(command_line_args, help, kArgBeta, GetScalar<T>()); }
    if (o == kArgFraction) { args.fraction = GetArgument(command_line_args, help, kArgFraction, C::DefaultFraction()); }
    if (o == kArgBatchCount) { args.batch_count = GetArgument(command_line_args, help, kArgBatchCount, C::DefaultBatchCount()); }
    if (o == tStrategy)   {args.tStrategy   = GetArgument(command_line_args, help, tStrategy, DEFAULT_STRATEGY);  }
    if (o == psoSwarmSize)   {args.psoSwarmSize   = GetArgument(command_line_args, help, psoSwarmSize, DEFAULT_PSO_SWARM);  }
    if (o == psoInfG)   {args.psoInfG   = GetArgument(command_line_args, help, psoInfG, DEFAULT_PSO_G);  }
    if (o == psoInfL)   {args.psoInfL   = GetArgument(command_line_args, help, psoInfL, DEFAULT_PSO_L);  }
    if (o == psoInfR)   {args.psoInfR   = GetArgument(command_line_args, help, psoInfR, DEFAULT_PSO_R);  }
  }
  const auto num_runs = GetArgument(command_line_args, help, kArgNumRuns, C::DefaultNumRuns());

  fprintf(stdout, "%s\n", help.c_str());

  // Tests validity of the given arguments
  C::TestValidArguments(args);

  // Tests for validity of the precision and retrieves properties
  auto isAMD = false;
  auto isARM = false;
  auto isGPU = false;
  {
    const auto platform = Platform(args.platform_id);
    const auto device = Device(platform, args.device_id);
    if (!PrecisionSupported<T>(device)) {
      printf("* Unsupported precision, skipping this tuning run\n\n");
      return;
    }
    isAMD = device.IsAMD();
    isARM = device.IsARM();
    isGPU = device.IsGPU();
  }

  // Creates input buffers with random data
  auto x_vec = std::vector<T>(C::GetSizeX(args));
  auto y_vec = std::vector<T>(C::GetSizeY(args));
  auto a_mat = std::vector<T>(C::GetSizeA(args));
  auto b_mat = std::vector<T>(C::GetSizeB(args));
  auto c_mat = std::vector<T>(C::GetSizeC(args));
  auto temp = std::vector<T>(C::GetSizeTemp(args));
  std::mt19937 mt(kSeed);
  std::uniform_real_distribution<double> dist(kTestDataLowerLimit, kTestDataUpperLimit);
  PopulateVector(x_vec, mt, dist);
  PopulateVector(y_vec, mt, dist);
  PopulateVector(a_mat, mt, dist);
  PopulateVector(b_mat, mt, dist);
  PopulateVector(c_mat, mt, dist);
  PopulateVector(temp, mt, dist);

  // Initializes the tuner for the chosen device
  cltune::Tuner tuner(args.platform_id, args.device_id);

  // Use full-search to explore all parameter combinations or random-search to search only a part of
  // the parameter values. The fraction is set as a command-line argument.
  #ifdef XGEMM_EXEC
  
  if(tStrategyFlag)
  {
   auto localtStrategy = args.tStrategy;  

    if (args.fraction == 1.0 || args.fraction == 0.0) 
    { 
     localtStrategy = FULL_SEARCH_STRATEGY; 
    }
    switch (localtStrategy)
    {
      case FULL_SEARCH_STRATEGY: 
        tuner.UseFullSearch();
        break;

      case RANDOM_SEARCH_STRATEGY: 
          tuner.UseRandomSearch(1.0/args.fraction);
        break;
      case PSO_STRATEGY: 
          tuner.UsePSO(1.0/args.fraction, args.psoSwarmSize, args.psoInfG, args.psoInfL, args.psoInfR);
        break;
      case DVDT_STRATEGY:
      default: 
        tuner.UseFullSearch();
    }
  }

  #else

  if (args.fraction == 1.0 || args.fraction == 0.0) 
  {
    tuner.UseFullSearch();
  }
  else 
  {
    tuner.UseRandomSearch(1.0/args.fraction);
  }

  #endif
  // Set extra settings for specific defines. This mimics src/routine.cc.
  auto defines = std::string{""};
  if (isAMD && isGPU) {
    defines += "#define USE_CL_MAD 1\n";
    defines += "#define USE_STAGGERED_INDICES 1\n";
  }
  if (isARM && isGPU) {
    defines += "#define GLOBAL_MEM_FENCE 1\n";
  }

  // Loads the kernel sources and defines the kernel to tune
  auto sources = defines + C::GetSources();
  auto id = tuner.AddKernelFromString(sources, C::KernelName(), C::GlobalSize(args), C::LocalSize());
  tuner.SetReferenceFromString(sources, C::KernelName(), C::GlobalSizeRef(args), C::LocalSizeRef());

  // Sets the tunable parameters and their possible values
  C::SetParameters(tuner, id);
  C::SetConstraints(tuner, id);
  C::SetLocalMemorySize(tuner, id, args);

  // Tests for a specific precision
  tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)});
  tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision));

  // Modifies the thread-sizes (both global and local) based on the parameters
  for (auto &parameters: C::MulLocal()) { tuner.MulLocalSize(id, parameters); }
  for (auto &parameters: C::DivLocal()) { tuner.DivLocalSize(id, parameters); }
  for (auto &parameters: C::MulGlobal()) { tuner.MulGlobalSize(id, parameters); }
  for (auto &parameters: C::DivGlobal()) { tuner.DivGlobalSize(id, parameters); }

  // Sets the function's arguments
  C::SetArguments(tuner, args, x_vec, y_vec, a_mat, b_mat, c_mat, temp);

  // Starts the tuning process
  tuner.SetNumRuns(num_runs);
  tuner.Tune();

  // Prints the results to screen
  auto time_ms = tuner.PrintToScreen();
  tuner.PrintFormatted();

  // Also prints the performance of the best-case in terms of GB/s or GFLOPS
  if (time_ms != 0.0) {
    printf("[ -------> ] %.2lf ms", time_ms);
    printf(" or %.1lf %s\n", C::GetMetric(args)/(time_ms*1.0e6), C::PerformanceUnit().c_str());
  }

  // Outputs the results as JSON to disk, including some meta-data
  auto precision_string = std::to_string(static_cast<size_t>(args.precision));
  auto metadata = std::vector<std::pair<std::string,std::string>>{
    {"kernel_family", C::KernelFamily()},
    {"precision", precision_string}
  };
  for (auto &o: C::GetOptions()) {
    if (o == kArgM)     { metadata.push_back({"arg_m", std::to_string(args.m)}); }
    if (o == kArgN)     { metadata.push_back({"arg_n", std::to_string(args.n)}); }
    if (o == kArgK)     { metadata.push_back({"arg_k", std::to_string(args.k)}); }
    if (o == kArgAlpha) { metadata.push_back({"arg_alpha", ToString(args.alpha)}); }
    if (o == kArgBeta)  { metadata.push_back({"arg_beta", ToString(args.beta)}); }
    if (o == kArgBatchCount) { metadata.push_back({"arg_batch_count", ToString(args.batch_count)}); }
  }
  tuner.PrintJSON("clblast_"+C::KernelFamily()+"_"+precision_string+".json", metadata);
 

}
Esempio n. 2
0
size_t RunOverrideTests(int argc, char *argv[], const bool silent, const std::string &routine_name) {
  auto arguments = RetrieveCommandLineArguments(argc, argv);
  auto errors = size_t{0};
  auto passed = size_t{0};
  auto example_routine = TestXgemm<0, T>();
  constexpr auto kSeed = 42; // fixed seed for reproducibility

  // Determines the test settings
  const auto kernel_name = std::string{"Xgemm"};
  const auto precision = PrecisionValue<T>();
  const auto valid_settings = std::vector<std::unordered_map<std::string,size_t>>{
    { {"GEMMK",0}, {"KREG",1}, {"KWG",16}, {"KWI",2}, {"MDIMA",4}, {"MDIMC",4}, {"MWG",16}, {"NDIMB",4}, {"NDIMC",4}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} },
    { {"GEMMK",0}, {"KREG",1}, {"KWG",32}, {"KWI",2}, {"MDIMA",4}, {"MDIMC",4}, {"MWG",32}, {"NDIMB",4}, {"NDIMC",4}, {"NWG",32}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} },
    { {"GEMMK",0}, {"KREG",1}, {"KWG",16}, {"KWI",2}, {"MDIMA",4}, {"MDIMC",4}, {"MWG",16}, {"NDIMB",4}, {"NDIMC",4}, {"NWG",16}, {"SA",0}, {"SB",0}, {"STRM",0}, {"STRN",0}, {"VWM",1}, {"VWN",1} },
  };
  const auto invalid_settings = std::vector<std::unordered_map<std::string,size_t>>{
    { {"GEMMK",0}, {"KREG",1}, {"KWI",2}, {"MDIMA",4}, {"MDIMC",4}, {"MWG",16}, {"NDIMB",4}, {"NDIMC",4}, {"NWG",16}, {"SA",0} },
  };

  // Retrieves the arguments
  auto help = std::string{"Options given/available:\n"};
  const auto platform_id = GetArgument(arguments, help, kArgPlatform, ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0}));
  const auto device_id = GetArgument(arguments, help, kArgDevice, ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0}));
  auto args = Arguments<T>{};
  args.m = GetArgument(arguments, help, kArgM, size_t{256});
  args.n = GetArgument(arguments, help, kArgN, size_t{256});
  args.k = GetArgument(arguments, help, kArgK, size_t{256});
  args.a_ld = GetArgument(arguments, help, kArgALeadDim, args.k);
  args.b_ld = GetArgument(arguments, help, kArgBLeadDim, args.n);
  args.c_ld = GetArgument(arguments, help, kArgCLeadDim, args.n);
  args.a_offset = GetArgument(arguments, help, kArgAOffset, size_t{0});
  args.b_offset = GetArgument(arguments, help, kArgBOffset, size_t{0});
  args.c_offset = GetArgument(arguments, help, kArgCOffset, size_t{0});
  args.layout = GetArgument(arguments, help, kArgLayout, Layout::kRowMajor);
  args.a_transpose = GetArgument(arguments, help, kArgATransp, Transpose::kNo);
  args.b_transpose = GetArgument(arguments, help, kArgBTransp, Transpose::kNo);
  args.kernel_mode = GetArgument(arguments, help, kArgKernelMode, KernelMode::kCrossCorrelation);
  args.alpha = GetArgument(arguments, help, kArgAlpha, GetScalar<T>());
  args.beta  = GetArgument(arguments, help, kArgBeta, GetScalar<T>());

  // Prints the help message (command-line arguments)
  if (!silent) { fprintf(stdout, "\n* %s\n", help.c_str()); }

  // Initializes OpenCL
  const auto platform = Platform(platform_id);
  const auto device = Device(platform, device_id);
  const auto context = Context(device);
  auto queue = Queue(context, device);

  // Populate host matrices with some example data
  auto host_a = std::vector<T>(args.m * args.k);
  auto host_b = std::vector<T>(args.n * args.k);
  auto host_c = std::vector<T>(args.m * args.n);
  std::mt19937 mt(kSeed);
  std::uniform_real_distribution<double> dist(kTestDataLowerLimit, kTestDataUpperLimit);
  PopulateVector(host_a, mt, dist);
  PopulateVector(host_b, mt, dist);
  PopulateVector(host_c, mt, dist);

  // Copy the matrices to the device
  auto device_a = Buffer<T>(context, host_a.size());
  auto device_b = Buffer<T>(context, host_b.size());
  auto device_c = Buffer<T>(context, host_c.size());
  auto device_temp = Buffer<T>(context, args.m * args.n * args.k); // just to be safe
  device_a.Write(queue, host_a.size(), host_a);
  device_b.Write(queue, host_b.size(), host_b);
  device_c.Write(queue, host_c.size(), host_c);
  auto dummy = Buffer<T>(context, 1);
  auto buffers = Buffers<T>{dummy, dummy, device_a, device_b, device_c, device_temp, dummy};

  // Loops over the valid combinations: run before and run afterwards
  fprintf(stdout, "* Testing OverrideParameters for '%s'\n", routine_name.c_str());
  for (const auto &override_setting : valid_settings) {
    const auto status_before = example_routine.RunRoutine(args, buffers, queue);
    if (status_before != StatusCode::kSuccess) { errors++; continue; }

    // Overrides the parameters
    const auto status = OverrideParameters(device(), kernel_name, precision, override_setting);
    if (status != StatusCode::kSuccess) { errors++; continue; } // error shouldn't occur

    const auto status_after = example_routine.RunRoutine(args, buffers, queue);
    if (status_after != StatusCode::kSuccess) { errors++; continue; }
    passed++;
  }

  // Loops over the invalid combinations: run before and run afterwards
  for (const auto &override_setting : invalid_settings) {
    const auto status_before = example_routine.RunRoutine(args, buffers, queue);
    if (status_before != StatusCode::kSuccess) { errors++; continue; }

    // Overrides the parameters
    const auto status = OverrideParameters(device(), kernel_name, precision, override_setting);
    if (status == StatusCode::kSuccess) { errors++; continue; } // error should occur

    const auto status_after = example_routine.RunRoutine(args, buffers, queue);
    if (status_after != StatusCode::kSuccess) { errors++; continue; }
    passed++;
  }

  // Prints and returns the statistics
  std::cout << "    " << passed << " test(s) passed" << std::endl;
  std::cout << "    " << errors << " test(s) failed" << std::endl;
  std::cout << std::endl;
  return errors;
}