Example #1
template<typename GlobalOrdinal>
void
compute_imbalance(const int global_box[][2],
                  const int local_box[][2],
                  float& largest_imbalance,
                  float& std_dev,
                  YAML_Doc& doc,
                  bool record_in_doc)
{
  int numprocs = 1, myproc = 0;
#ifdef HAVE_MPI
  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
#endif

  GlobalOrdinal local_nrows = get_num_ids<GlobalOrdinal>(local_box);
  GlobalOrdinal min_nrows = 0, max_nrows = 0, global_nrows = 0;
  int min_proc = myproc, max_proc = myproc;
  get_global_min_max(local_nrows, global_nrows, min_nrows, min_proc,
                     max_nrows, max_proc);

  float avg_nrows = global_nrows;
  avg_nrows /= numprocs;

  //largest_imbalance will be the difference between the min (or max)
  //rows-per-processor and avg_nrows, represented as a percentage:
  largest_imbalance = percentage_difference<float>(min_nrows, avg_nrows);

  float tmp = percentage_difference<float>(max_nrows, avg_nrows);
  if (tmp > largest_imbalance) largest_imbalance = tmp;

  std_dev = compute_std_dev_as_percentage<float>(local_nrows, avg_nrows);

  if (myproc == 0 && record_in_doc) {
    doc.add("Rows-per-proc Load Imbalance","");
    doc.get("Rows-per-proc Load Imbalance")->add("Largest (from avg, %)",largest_imbalance);
    doc.get("Rows-per-proc Load Imbalance")->add("Std Dev (%)",std_dev);
  }
}
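
The helpers percentage_difference and compute_std_dev_as_percentage are not
shown in these examples. Below is a minimal sketch of what they presumably
compute, under assumed signatures (not the miniFE originals); the std-dev
variant is shown for the single-process case only, since the real one would
reduce across MPI ranks:

#include <cmath>

//Assumed: difference of 'value' from 'avg', as a percentage of avg.
template<typename Scalar>
Scalar percentage_difference(Scalar value, Scalar avg)
{
  if (std::abs(avg) < Scalar(1.e-5)) return 0; //guard against dividing by ~0
  return std::abs(value - avg)/avg*Scalar(100);
}

//Assumed: deviation of the local row count from the average, expressed as
//a percentage of the average. With one process this is just |dev|/avg.
template<typename Scalar>
Scalar compute_std_dev_as_percentage(Scalar local_nrows, Scalar avg_nrows)
{
  Scalar dev = local_nrows - avg_nrows;
  return std::sqrt(dev*dev)/avg_nrows*Scalar(100);
}
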
Example #2
void add_configuration_to_yaml(YAML_Doc& doc, int numprocs)
{
  doc.get("Global Run Parameters")->add("number of processors", numprocs);

  doc.add("Platform","");
  doc.get("Platform")->add("hostname",MINIFE_HOSTNAME);
  doc.get("Platform")->add("kernel name",MINIFE_KERNEL_NAME);
  doc.get("Platform")->add("kernel release",MINIFE_KERNEL_RELEASE);
  doc.get("Platform")->add("processor",MINIFE_PROCESSOR);

  doc.add("Build","");
  doc.get("Build")->add("CXX",MINIFE_CXX);
#if MINIFE_INFO != 0
  doc.get("Build")->add("compiler version",MINIFE_CXX_VERSION);
#endif
  doc.get("Build")->add("CXXFLAGS",MINIFE_CXXFLAGS);
  std::string using_mpi("no");
#ifdef HAVE_MPI
  using_mpi = "yes";
#endif
  doc.get("Build")->add("using MPI",using_mpi);
}
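
For context, a sketch of how such a document is typically created and
emitted. The YAML_Doc constructor arguments and the generateYAML() call are
assumptions based on the Mantevo YAML_Doc interface; note that the parent
node "Global Run Parameters" must exist before add_configuration_to_yaml
dereferences it:

#include <iostream>

int main(int argc, char** argv)
{
  //Assumed constructor: (app name, version); the real class may also take
  //a destination directory and file name.
  YAML_Doc doc("miniFE", "2.0");
  doc.add("Global Run Parameters","");
  add_configuration_to_yaml(doc, 1 /*numprocs*/);
  std::cout << doc.generateYAML() << std::endl; //assumed emit method
  return 0;
}
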
Example #3
void add_params_to_yaml(YAML_Doc& doc, miniFE::Parameters& params)
{
  doc.add("Global Run Parameters","");
  doc.get("Global Run Parameters")->add("dimensions","");
  doc.get("Global Run Parameters")->get("dimensions")->add("nx",params.nx);
  doc.get("Global Run Parameters")->get("dimensions")->add("ny",params.ny);
  doc.get("Global Run Parameters")->get("dimensions")->add("nz",params.nz);
  doc.get("Global Run Parameters")->add("load_imbalance", params.load_imbalance);
  if (params.mv_overlap_comm_comp == 1) {
    std::string val("1 (yes)");
    doc.get("Global Run Parameters")->add("mv_overlap_comm_comp", val);
  }
  else {
    std::string val("0 (no)");
    doc.get("Global Run Parameters")->add("mv_overlap_comm_comp", val);
  }
#ifdef _OPENMP
  doc.get("Global Run Parameters")->add("OpenMP Max Threads:", omp_get_max_threads());
#endif
}
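
Each nested field above costs two or three string lookups. If, as in the
Mantevo YAML_Doc, add() returns the newly created YAML_Element*, the chains
can be collapsed by caching that pointer; a sketch of an equivalent body
under that assumption:

  //Assumed: add() returns the new child element, so it can be cached.
  YAML_Element* run = doc.add("Global Run Parameters","");
  YAML_Element* dims = run->add("dimensions","");
  dims->add("nx",params.nx);
  dims->add("ny",params.ny);
  dims->add("nz",params.nz);
  run->add("load_imbalance", params.load_imbalance);
  run->add("mv_overlap_comm_comp",
           params.mv_overlap_comm_comp == 1 ? std::string("1 (yes)")
                                            : std::string("0 (no)"));
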
Example #4
void add_params_to_yaml(YAML_Doc& doc, miniFE::Parameters& params)
{
  doc.add("Global Run Parameters","");
  doc.get("Global Run Parameters")->add("dimensions","");
  doc.get("Global Run Parameters")->get("dimensions")->add("nx",params.nx);
  doc.get("Global Run Parameters")->get("dimensions")->add("ny",params.ny);
  doc.get("Global Run Parameters")->get("dimensions")->add("nz",params.nz);
  doc.get("Global Run Parameters")->add("load_imbalance", params.load_imbalance);
  if (params.mv_overlap_comm_comp == 1) {
    std::string val("1 (yes)");
    doc.get("Global Run Parameters")->add("mv_overlap_comm_comp", val);
  }
  else {
    std::string val("0 (no)");
    doc.get("Global Run Parameters")->add("mv_overlap_comm_comp", val);
  }
}
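
The miniFE::Parameters fields referenced across these examples suggest a
struct of roughly the following shape; the member types here are
assumptions, and the real struct carries additional members:

namespace miniFE {
struct Parameters {
  int nx, ny, nz;            //global mesh dimensions
  float load_imbalance;      //requested rows-per-proc imbalance (0 = none)
  int mv_overlap_comm_comp;  //1 = overlap matvec communication/computation
  int verify_solution;       //>0 = verify the solution after the CG solve
};
}//namespace miniFE
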
Example #5
template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal>
int
driver(const Box& global_box, Box& my_box,
       Parameters& params, YAML_Doc& ydoc)
{
  int global_nx = global_box[0][1];
  int global_ny = global_box[1][1];
  int global_nz = global_box[2][1];

  int numprocs = 1, myproc = 0;
#ifdef HAVE_MPI
  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
#endif

  if (params.load_imbalance > 0) {
    add_imbalance<GlobalOrdinal>(global_box, my_box, params.load_imbalance, ydoc);
  }

  float largest_imbalance = 0, std_dev = 0;
  compute_imbalance<GlobalOrdinal>(global_box, my_box, largest_imbalance,
                                   std_dev, ydoc, true);


  //Create a representation of the mesh:
  //Note that 'simple_mesh_description' is a virtual or conceptual
  //mesh that doesn't actually store mesh data.
#ifdef TIME_IT
  if (myproc==0) {
    std::cout.width(30);
    std::cout << "creating/filling mesh...";
    std::cout.flush();
  }
#endif

  timer_type t_start = mytimer();
  timer_type t0 = mytimer();

  simple_mesh_description<GlobalOrdinal> mesh(global_box, my_box);

  timer_type mesh_fill = mytimer() - t0;
  timer_type t_total = mytimer() - t_start;

#ifdef TIME_IT
  if (myproc==0) {
    std::cout << mesh_fill << "s, total time: " << t_total << std::endl;
  }
#endif

  //next we will generate the matrix structure.

  //Declare matrix object:

#if defined(MINIFE_ELL_MATRIX)
  typedef ELLMatrix<Scalar,LocalOrdinal,GlobalOrdinal> MatrixType;
#else
  typedef CSRMatrix<Scalar,LocalOrdinal,GlobalOrdinal> MatrixType;
#endif

  MatrixType A;

  timer_type gen_structure;
  RUN_TIMED_FUNCTION("generating matrix structure...",
                     generate_matrix_structure(mesh, A),
                     gen_structure, t_total);

  GlobalOrdinal local_nrows = A.rows.size();
  GlobalOrdinal my_first_row = local_nrows > 0 ? A.rows[0] : -1;

  Vector<Scalar,LocalOrdinal,GlobalOrdinal> b(my_first_row, local_nrows);
  Vector<Scalar,LocalOrdinal,GlobalOrdinal> x(my_first_row, local_nrows);

  //Assemble finite-element sub-matrices and sub-vectors into the global
  //linear system:

  timer_type fe_assembly;
  RUN_TIMED_FUNCTION("assembling FE data...",
                     assemble_FE_data(mesh, A, b, params),
                     fe_assembly, t_total);

  if (myproc == 0) {
    ydoc.add("Matrix structure generation","");
    ydoc.get("Matrix structure generation")->add("Mat-struc-gen Time",gen_structure);
    ydoc.add("FE assembly","");
    ydoc.get("FE assembly")->add("FE assembly Time",fe_assembly);
  }

#ifdef MINIFE_DEBUG
  write_matrix("A_prebc.mtx", A);
  write_vector("b_prebc.vec", b);
#endif

  //Now apply Dirichlet boundary-conditions
  //(Apply the 0-valued surfaces first, then the 1-valued surface last.)

  timer_type dirbc_time;
  RUN_TIMED_FUNCTION("imposing Dirichlet BC...",
                     impose_dirichlet(0.0, A, b, global_nx+1, global_ny+1,
                                      global_nz+1, mesh.bc_rows_0),
                     dirbc_time, t_total);
  RUN_TIMED_FUNCTION("imposing Dirichlet BC...",
                     impose_dirichlet(1.0, A, b, global_nx+1, global_ny+1,
                                      global_nz+1, mesh.bc_rows_1),
                     dirbc_time, t_total);

#ifdef MINIFE_DEBUG
  write_matrix("A.mtx", A);
  write_vector("b.vec", b);
#endif

  //Transform global indices to local, set up communication information:

  timer_type make_local_time;
  RUN_TIMED_FUNCTION("making matrix indices local...",
                     make_local_matrix(A),
                     make_local_time, t_total);

#ifdef MINIFE_DEBUG
  write_matrix("A_local.mtx", A);
  write_vector("b_local.vec", b);
#endif

  size_t global_nnz = compute_matrix_stats(A, myproc, numprocs, ydoc);

  //Prepare to perform conjugate gradient solve:

  LocalOrdinal max_iters = 200;
  LocalOrdinal num_iters = 0;
  typedef typename TypeTraits<Scalar>::magnitude_type magnitude;
  magnitude rnorm = 0;
  magnitude tol = std::numeric_limits<magnitude>::epsilon();

  timer_type cg_times[NUM_TIMERS];

  typedef Vector<Scalar,LocalOrdinal,GlobalOrdinal> VectorType;

  t_total = mytimer() - t_start;

  bool matvec_with_comm_overlap = params.mv_overlap_comm_comp==1;

  int verify_result = 0;

#if MINIFE_KERNELS != 0
  if (myproc==0) {
    std::cout.width(30);
    std::cout << "Starting kernel timing loops ..." << std::endl;
  }

  max_iters = 500;
  x.coefs[0] = 0.9;
  if (matvec_with_comm_overlap) {
    time_kernels(A, b, x, matvec_overlap<MatrixType,VectorType>(), max_iters, rnorm, cg_times);
  }
  else {
    time_kernels(A, b, x, matvec_std<MatrixType,VectorType>(), max_iters, rnorm, cg_times);
  }
  num_iters = max_iters;
  std::string title("Kernel timings");
#else
  if (myproc==0) {
    std::cout << "Starting CG solver ... " << std::endl;
  }

  if (matvec_with_comm_overlap) {
#ifdef MINIFE_CSR_MATRIX
    rearrange_matrix_local_external(A);
    cg_solve(A, b, x, matvec_overlap<MatrixType,VectorType>(), max_iters, tol,
           num_iters, rnorm, cg_times);
#else
    std::cout << "ERROR, matvec with overlapping comm/comp only works with CSR matrix."<<std::endl;
#endif
  }
  else {
    cg_solve(A, b, x, matvec_std<MatrixType,VectorType>(), max_iters, tol,
           num_iters, rnorm, cg_times);
    if (myproc == 0) {
      std::cout << "Final Resid Norm: " << rnorm << std::endl;
    }

    if (params.verify_solution > 0) {
      double tolerance = 0.06;
      bool verify_whole_domain = false;
  #ifdef MINIFE_DEBUG
      verify_whole_domain = true;
  #endif
      if (myproc == 0) {
        if (verify_whole_domain) std::cout << "verifying solution..." << std::endl;
        else std::cout << "verifying solution at ~ (0.5, 0.5, 0.5) ..." << std::endl;
      }
      verify_result = verify_solution(mesh, x, tolerance, verify_whole_domain);
    }
  }

#ifdef MINIFE_DEBUG
  write_vector("x.vec", x);
#endif
  std::string title("CG solve");
#endif

  if (myproc == 0) {
    ydoc.get("Global Run Parameters")->add("ScalarType",TypeTraits<Scalar>::name());
    ydoc.get("Global Run Parameters")->add("GlobalOrdinalType",TypeTraits<GlobalOrdinal>::name());
    ydoc.get("Global Run Parameters")->add("LocalOrdinalType",TypeTraits<LocalOrdinal>::name());
    ydoc.add(title,"");
    ydoc.get(title)->add("Iterations",num_iters);
    ydoc.get(title)->add("Final Resid Norm",rnorm);

    GlobalOrdinal global_nrows = global_nx;
    global_nrows *= global_ny*global_nz;

    //flops-per-mv, flops-per-dot, flops-per-waxpy:
    double mv_flops = global_nnz*2.0;
    double dot_flops = global_nrows*2.0;
    double waxpy_flops = global_nrows*3.0;

#if MINIFE_KERNELS == 0
//if MINIFE_KERNELS == 0 then we did a CG solve, and in that case
//there were num_iters+1 matvecs, num_iters*2 dots, and num_iters*3+2 waxpys.
    mv_flops *= (num_iters+1);
    dot_flops *= (2*num_iters);
    waxpy_flops *= (3*num_iters+2);
#else
//if MINIFE_KERNELS then we did one of each operation per iteration.
    mv_flops *= num_iters;
    dot_flops *= num_iters;
    waxpy_flops *= num_iters;
#endif

    double total_flops = mv_flops + dot_flops + waxpy_flops;

    double mv_mflops = -1;
    if (cg_times[MATVEC] > 1.e-4)
      mv_mflops = 1.e-6 * (mv_flops/cg_times[MATVEC]);

    double dot_mflops = -1;
    if (cg_times[DOT] > 1.e-4)
      dot_mflops = 1.e-6 * (dot_flops/cg_times[DOT]);

    double waxpy_mflops = -1;
    if (cg_times[WAXPY] > 1.e-4)
      waxpy_mflops = 1.e-6 *  (waxpy_flops/cg_times[WAXPY]);

    double total_mflops = -1;
    if (cg_times[TOTAL] > 1.e-4)
      total_mflops = 1.e-6 * (total_flops/cg_times[TOTAL]);

    ydoc.get(title)->add("WAXPY Time",cg_times[WAXPY]);
    ydoc.get(title)->add("WAXPY Flops",waxpy_flops);
    if (waxpy_mflops >= 0)
      ydoc.get(title)->add("WAXPY Mflops",waxpy_mflops);
    else
      ydoc.get(title)->add("WAXPY Mflops","inf");

    ydoc.get(title)->add("DOT Time",cg_times[DOT]);
    ydoc.get(title)->add("DOT Flops",dot_flops);
    if (dot_mflops >= 0)
      ydoc.get(title)->add("DOT Mflops",dot_mflops);
    else
      ydoc.get(title)->add("DOT Mflops","inf");

    ydoc.get(title)->add("MATVEC Time",cg_times[MATVEC]);
    ydoc.get(title)->add("MATVEC Flops",mv_flops);
    if (mv_mflops >= 0)
      ydoc.get(title)->add("MATVEC Mflops",mv_mflops);
    else
      ydoc.get(title)->add("MATVEC Mflops","inf");

#ifdef MINIFE_FUSED
    ydoc.get(title)->add("MATVECDOT Time",cg_times[MATVECDOT]);
    ydoc.get(title)->add("MATVECDOT Flops",mv_flops);
    if (mv_mflops >= 0)
      ydoc.get(title)->add("MATVECDOT Mflops",mv_mflops);
    else
      ydoc.get(title)->add("MATVECDOT Mflops","inf");
#endif

#if MINIFE_KERNELS == 0
    ydoc.get(title)->add("Total","");
    ydoc.get(title)->get("Total")->add("Total CG Time",cg_times[TOTAL]);
    ydoc.get(title)->get("Total")->add("Total CG Flops",total_flops);
    if (total_mflops >= 0)
      ydoc.get(title)->get("Total")->add("Total CG Mflops",total_mflops);
    else
      ydoc.get(title)->get("Total")->add("Total CG Mflops","inf");
    ydoc.get(title)->add("Time per iteration",cg_times[TOTAL]/num_iters);
#endif
  }

  return verify_result;
}
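
driver() relies on a mytimer() clock and a RUN_TIMED_FUNCTION macro that are
defined elsewhere in miniFE. A plausible sketch of both follows, assuming
timer_type is a double holding seconds; the real macro presumably also
prints the message and elapsed time, guarded by TIME_IT and rank 0:

#include <chrono>

typedef double timer_type;

//Assumed: monotonic wall-clock seconds (miniFE may use MPI_Wtime or
//gettimeofday instead).
inline timer_type mytimer()
{
  return std::chrono::duration<double>(
      std::chrono::steady_clock::now().time_since_epoch()).count();
}

//Assumed shape: run 'fn', store its elapsed time in 'time_inc', and
//accumulate into 'time_total'. Message printing is omitted here.
#define RUN_TIMED_FUNCTION(msg, fn, time_inc, time_total) \
  { \
    (void)(msg); \
    timer_type rtf_t0 = mytimer(); \
    fn; \
    time_inc = mytimer() - rtf_t0; \
    time_total += time_inc; \
  }
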
template<typename MatrixType>
size_t
compute_matrix_stats(const MatrixType& A, int myproc, int numprocs, YAML_Doc& ydoc)
{
  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
  typedef typename MatrixType::LocalOrdinalType LocalOrdinal;
  typedef typename MatrixType::ScalarType Scalar;

  GlobalOrdinal min_nrows = 0, max_nrows = 0, global_nrows = 0;
  int min_proc = 0, max_proc = 0;

  GlobalOrdinal local_nrows = A.rows.size();

  get_global_min_max(local_nrows, global_nrows, min_nrows, min_proc,
                     max_nrows, max_proc);

  //Gather stats on global, min/max matrix num-nonzeros:

  double local_nnz = A.num_nonzeros();
  double dglobal_nnz = 0, dmin_nnz = 0, dmax_nnz = 0;

  get_global_min_max(local_nnz, dglobal_nnz, dmin_nnz, min_proc,
                     dmax_nnz, max_proc);

  double avg_nrows = global_nrows;
  avg_nrows /= numprocs;
  double avg_nnz = dglobal_nnz;
  avg_nnz /= numprocs;

  double mem_overhead_MB = parallel_memory_overhead_MB(A);

  size_t global_nnz = static_cast<size_t>(std::ceil(dglobal_nnz));
  size_t min_nnz = static_cast<size_t>(std::ceil(dmin_nnz));
  size_t max_nnz = static_cast<size_t>(std::ceil(dmax_nnz));
  size_t global_num_rows = global_nrows;

  if (myproc == 0) {
    ydoc.add("Matrix attributes","");
    ydoc.get("Matrix attributes")->add("Global Nrows",global_num_rows);
    ydoc.get("Matrix attributes")->add("Global NNZ",global_nnz);

    //compute how much memory the matrix occupies:
    //num-bytes = sizeof(GlobalOrdinal)*global_nrows   for A.rows
    //          + sizeof(LocalOrdinal)*global_nrows    for A.rows_offsets
    //          + sizeof(GlobalOrdinal)*global_nnz     for A.packed_cols
    //          + sizeof(Scalar)*global_nnz            for A.packed_coefs

    double invGB = 1.0/(1024*1024*1024);
    double memGB = invGB*global_nrows*sizeof(GlobalOrdinal);
    memGB += invGB*global_nrows*sizeof(LocalOrdinal);
    memGB += invGB*global_nnz*sizeof(GlobalOrdinal);
    memGB += invGB*global_nnz*sizeof(Scalar);
    ydoc.get("Matrix attributes")->add("Global Memory (GB)",memGB);

    ydoc.get("Matrix attributes")->add("Pll Memory Overhead (MB)",mem_overhead_MB);

    size_t min_num_rows = min_nrows;
    size_t max_num_rows = max_nrows;
    ydoc.get("Matrix attributes")->add("Rows per proc MIN",min_num_rows);
    ydoc.get("Matrix attributes")->add("Rows per proc MAX",max_num_rows);
    ydoc.get("Matrix attributes")->add("Rows per proc AVG",avg_nrows);
    ydoc.get("Matrix attributes")->add("NNZ per proc MIN",min_nnz);
    ydoc.get("Matrix attributes")->add("NNZ per proc MAX",max_nnz);
    ydoc.get("Matrix attributes")->add("NNZ per proc AVG",avg_nnz);
  }

  return global_nnz;
}
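
compute_matrix_stats and compute_imbalance both depend on
get_global_min_max, which is not shown. A sketch of its assumed semantics
for the double overload, using MPI's MINLOC/MAXLOC reductions to recover
which rank owns each extreme value (not the miniFE original):

#include <mpi.h>

inline void get_global_min_max(double local, double& global_sum,
                               double& global_min, int& min_proc,
                               double& global_max, int& max_proc)
{
  int myproc = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);

  //Sum across all ranks:
  MPI_Allreduce(&local, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

  //MINLOC/MAXLOC reduce a (value, rank) pair, returning the extreme value
  //together with the rank that owns it:
  struct ValRank { double val; int rank; };
  ValRank in = { local, myproc }, out;

  MPI_Allreduce(&in, &out, 1, MPI_DOUBLE_INT, MPI_MINLOC, MPI_COMM_WORLD);
  global_min = out.val; min_proc = out.rank;

  MPI_Allreduce(&in, &out, 1, MPI_DOUBLE_INT, MPI_MAXLOC, MPI_COMM_WORLD);
  global_max = out.val; max_proc = out.rank;
}
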