template<typename GlobalOrdinal>
void compute_imbalance(const int global_box[][2],
                       const int local_box[][2],
                       float& largest_imbalance,
                       float& std_dev,
                       YAML_Doc& doc,
                       bool record_in_doc)
{
  int numprocs = 1, myproc = 0;
#ifdef HAVE_MPI
  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
#endif

  GlobalOrdinal local_nrows = get_num_ids<GlobalOrdinal>(local_box);
  GlobalOrdinal min_nrows = 0, max_nrows = 0, global_nrows = 0;
  int min_proc = myproc, max_proc = myproc;
  get_global_min_max(local_nrows, global_nrows, min_nrows, min_proc,
                     max_nrows, max_proc);

  float avg_nrows = global_nrows;
  avg_nrows /= numprocs;

  //largest_imbalance will be the difference between the min (or max)
  //rows-per-processor and avg_nrows, represented as a percentage:
  largest_imbalance = percentage_difference<float>(min_nrows, avg_nrows);

  float tmp = percentage_difference<float>(max_nrows, avg_nrows);
  if (tmp > largest_imbalance) largest_imbalance = tmp;

  std_dev = compute_std_dev_as_percentage<float>(local_nrows, avg_nrows);

  if (myproc == 0 && record_in_doc) {
    doc.add("Rows-per-proc Load Imbalance","");
    doc.get("Rows-per-proc Load Imbalance")->add("Largest (from avg, %)",
                                                 largest_imbalance);
    doc.get("Rows-per-proc Load Imbalance")->add("Std Dev (%)",std_dev);
  }
}
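//compute_imbalance delegates to a percentage_difference helper that is not
//shown in this section. A minimal sketch consistent with its use above --
//a guarded "percent of average" -- though the project's actual
//implementation may differ:

#include <cmath>

template<typename T>
T percentage_difference(T value, T average)
{
  //return |value - average| as a percentage of average, guarding
  //against division by a near-zero average:
  T diff = std::abs(value - average);
  if (std::abs(average) > 1.e-5) {
    return (diff/average)*100;
  }
  return diff;
}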
void add_timestring_to_yaml(YAML_Doc& doc)
{
  std::time_t rawtime;
  struct tm* timeinfo;
  std::time(&rawtime);
  timeinfo = std::localtime(&rawtime);
  std::ostringstream osstr;
  osstr.fill('0');
  osstr << timeinfo->tm_year+1900 << "-";
  osstr.width(2); osstr << timeinfo->tm_mon+1 << "-";
  osstr.width(2); osstr << timeinfo->tm_mday << ", ";
  osstr.width(2); osstr << timeinfo->tm_hour << "-";
  osstr.width(2); osstr << timeinfo->tm_min << "-";
  osstr.width(2); osstr << timeinfo->tm_sec;
  std::string timestring = osstr.str();
  doc.add("Run Date/Time",timestring);
}
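//As written, add_timestring_to_yaml produces zero-padded strings such as
//"2024-03-07, 14-05-09": the time-of-day fields are hyphen-separated
//(rather than the conventional colons) because that is what the stream
//code above emits.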
void add_params_to_yaml(YAML_Doc& doc, miniFE::Parameters& params)
{
  doc.add("Global Run Parameters","");
  doc.get("Global Run Parameters")->add("dimensions","");
  doc.get("Global Run Parameters")->get("dimensions")->add("nx",params.nx);
  doc.get("Global Run Parameters")->get("dimensions")->add("ny",params.ny);
  doc.get("Global Run Parameters")->get("dimensions")->add("nz",params.nz);
  doc.get("Global Run Parameters")->add("load_imbalance", params.load_imbalance);
  if (params.mv_overlap_comm_comp == 1) {
    std::string val("1 (yes)");
    doc.get("Global Run Parameters")->add("mv_overlap_comm_comp", val);
  }
  else {
    std::string val("0 (no)");
    doc.get("Global Run Parameters")->add("mv_overlap_comm_comp", val);
  }
#ifdef _OPENMP
  doc.get("Global Run Parameters")->add("OpenMP Max Threads:", omp_get_max_threads());
#endif
}
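//The chained get(...)->add(...) calls above imply that YAML_Doc nodes expose
//roughly the following interface. This is a hypothetical minimal sketch
//inferred from usage, not the actual class definition (the real YAML_Doc
//classes ship with miniFE's common sources):

#include <string>

class YAML_Element {
public:
  //append a child key/value node and return a pointer to it:
  template<typename T>
  YAML_Element* add(const std::string& key, const T& value);

  //look up a previously added child node by key:
  YAML_Element* get(const std::string& key);
};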
void add_configuration_to_yaml(YAML_Doc& doc, int numprocs)
{
  doc.get("Global Run Parameters")->add("number of processors", numprocs);

  doc.add("Platform","");
  doc.get("Platform")->add("hostname",MINIFE_HOSTNAME);
  doc.get("Platform")->add("kernel name",MINIFE_KERNEL_NAME);
  doc.get("Platform")->add("kernel release",MINIFE_KERNEL_RELEASE);
  doc.get("Platform")->add("processor",MINIFE_PROCESSOR);

  doc.add("Build","");
  doc.get("Build")->add("CXX",MINIFE_CXX);
#if MINIFE_INFO != 0
  doc.get("Build")->add("compiler version",MINIFE_CXX_VERSION);
#endif
  doc.get("Build")->add("CXXFLAGS",MINIFE_CXXFLAGS);
  std::string using_mpi("no");
#ifdef HAVE_MPI
  using_mpi = "yes";
#endif
  doc.get("Build")->add("using MPI",using_mpi);
}
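//The MINIFE_* identifiers above are preprocessor defines, typically captured
//into a generated info header at build time. A hypothetical example of such
//a header -- macro names taken from the code above, all values illustrative:

#define MINIFE_HOSTNAME       "node042"
#define MINIFE_KERNEL_NAME    "Linux"
#define MINIFE_KERNEL_RELEASE "5.15.0-generic"
#define MINIFE_PROCESSOR      "x86_64"
#define MINIFE_CXX            "mpicxx"
#define MINIFE_CXX_VERSION    "g++ (GCC) 12.2.0"
#define MINIFE_CXXFLAGS       "-O3"
#define MINIFE_INFO           1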
template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal>
int driver(const Box& global_box, Box& my_box,
           Parameters& params, YAML_Doc& ydoc)
{
  int global_nx = global_box[0][1];
  int global_ny = global_box[1][1];
  int global_nz = global_box[2][1];

  int numprocs = 1, myproc = 0;
#ifdef HAVE_MPI
  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
#endif

  if (params.load_imbalance > 0) {
    add_imbalance<GlobalOrdinal>(global_box, my_box, params.load_imbalance, ydoc);
  }

  float largest_imbalance = 0, std_dev = 0;
  compute_imbalance<GlobalOrdinal>(global_box, my_box, largest_imbalance,
                                   std_dev, ydoc, true);

  //Create a representation of the mesh:
  //Note that 'simple_mesh_description' is a virtual or conceptual
  //mesh that doesn't actually store mesh data.

#ifdef TIME_IT
  if (myproc==0) {
    std::cout.width(30);
    std::cout << "creating/filling mesh...";
    std::cout.flush();
  }
#endif

  timer_type t_start = mytimer();
  timer_type t0 = mytimer();

  simple_mesh_description<GlobalOrdinal> mesh(global_box, my_box);

  timer_type mesh_fill = mytimer() - t0;
  timer_type t_total = mytimer() - t_start;

#ifdef TIME_IT
  if (myproc==0) {
    std::cout << mesh_fill << "s, total time: " << t_total << std::endl;
  }
#endif

  //next we will generate the matrix structure.

  //Declare matrix object:
#if defined(MINIFE_ELL_MATRIX)
  typedef ELLMatrix<Scalar,LocalOrdinal,GlobalOrdinal> MatrixType;
#else
  typedef CSRMatrix<Scalar,LocalOrdinal,GlobalOrdinal> MatrixType;
#endif
  MatrixType A;

  timer_type gen_structure;
  RUN_TIMED_FUNCTION("generating matrix structure...",
                     generate_matrix_structure(mesh, A),
                     gen_structure, t_total);

  GlobalOrdinal local_nrows = A.rows.size();
  GlobalOrdinal my_first_row = local_nrows > 0 ? A.rows[0] : -1;

  Vector<Scalar,LocalOrdinal,GlobalOrdinal> b(my_first_row, local_nrows);
  Vector<Scalar,LocalOrdinal,GlobalOrdinal> x(my_first_row, local_nrows);

  //Assemble finite-element sub-matrices and sub-vectors into the global
  //linear system:

  timer_type fe_assembly;
  RUN_TIMED_FUNCTION("assembling FE data...",
                     assemble_FE_data(mesh, A, b, params),
                     fe_assembly, t_total);

  if (myproc == 0) {
    ydoc.add("Matrix structure generation","");
    ydoc.get("Matrix structure generation")->add("Mat-struc-gen Time",gen_structure);
    ydoc.add("FE assembly","");
    ydoc.get("FE assembly")->add("FE assembly Time",fe_assembly);
  }

#ifdef MINIFE_DEBUG
  write_matrix("A_prebc.mtx", A);
  write_vector("b_prebc.vec", b);
#endif

  //Now apply dirichlet boundary-conditions
  //(Apply the 0-valued surfaces first, then the 1-valued surface last.)
  timer_type dirbc_time;
  RUN_TIMED_FUNCTION("imposing Dirichlet BC...",
                     impose_dirichlet(0.0, A, b, global_nx+1, global_ny+1,
                                      global_nz+1, mesh.bc_rows_0),
                     dirbc_time, t_total);
  RUN_TIMED_FUNCTION("imposing Dirichlet BC...",
                     impose_dirichlet(1.0, A, b, global_nx+1, global_ny+1,
                                      global_nz+1, mesh.bc_rows_1),
                     dirbc_time, t_total);

#ifdef MINIFE_DEBUG
  write_matrix("A.mtx", A);
  write_vector("b.vec", b);
#endif

  //Transform global indices to local, set up communication information:

  timer_type make_local_time;
  RUN_TIMED_FUNCTION("making matrix indices local...",
                     make_local_matrix(A),
                     make_local_time, t_total);

#ifdef MINIFE_DEBUG
  write_matrix("A_local.mtx", A);
  write_vector("b_local.vec", b);
#endif

  size_t global_nnz = compute_matrix_stats(A, myproc, numprocs, ydoc);

  //Prepare to perform conjugate gradient solve:

  LocalOrdinal max_iters = 200;
  LocalOrdinal num_iters = 0;
  typedef typename TypeTraits<Scalar>::magnitude_type magnitude;
  magnitude rnorm = 0;
  magnitude tol = std::numeric_limits<magnitude>::epsilon();

  timer_type cg_times[NUM_TIMERS];

  typedef Vector<Scalar,LocalOrdinal,GlobalOrdinal> VectorType;

  t_total = mytimer() - t_start;

  bool matvec_with_comm_overlap = params.mv_overlap_comm_comp==1;

  int verify_result = 0;

#if MINIFE_KERNELS != 0
  if (myproc==0) {
    std::cout.width(30);
    std::cout << "Starting kernel timing loops ..." << std::endl;
  }

  max_iters = 500;
  x.coefs[0] = 0.9;
  if (matvec_with_comm_overlap) {
    time_kernels(A, b, x, matvec_overlap<MatrixType,VectorType>(),
                 max_iters, rnorm, cg_times);
  }
  else {
    time_kernels(A, b, x, matvec_std<MatrixType,VectorType>(),
                 max_iters, rnorm, cg_times);
  }
  num_iters = max_iters;
  std::string title("Kernel timings");
#else
  if (myproc==0) {
    std::cout << "Starting CG solver ... " << std::endl;
  }

  if (matvec_with_comm_overlap) {
#ifdef MINIFE_CSR_MATRIX
    rearrange_matrix_local_external(A);
    cg_solve(A, b, x, matvec_overlap<MatrixType,VectorType>(),
             max_iters, tol, num_iters, rnorm, cg_times);
#else
    std::cout << "ERROR, matvec with overlapping comm/comp only works with CSR matrix."<<std::endl;
#endif
  }
  else {
    cg_solve(A, b, x, matvec_std<MatrixType,VectorType>(),
             max_iters, tol, num_iters, rnorm, cg_times);
    if (myproc == 0) {
      std::cout << "Final Resid Norm: " << rnorm << std::endl;
    }

    if (params.verify_solution > 0) {
      double tolerance = 0.06;
      bool verify_whole_domain = false;
#ifdef MINIFE_DEBUG
      verify_whole_domain = true;
#endif
      if (myproc == 0) {
        if (verify_whole_domain) std::cout << "verifying solution..." << std::endl;
        else std::cout << "verifying solution at ~ (0.5, 0.5, 0.5) ..." << std::endl;
      }
      verify_result = verify_solution(mesh, x, tolerance, verify_whole_domain);
    }
  }

#ifdef MINIFE_DEBUG
  write_vector("x.vec", x);
#endif
  std::string title("CG solve");
#endif

  if (myproc == 0) {
    ydoc.get("Global Run Parameters")->add("ScalarType",TypeTraits<Scalar>::name());
    ydoc.get("Global Run Parameters")->add("GlobalOrdinalType",TypeTraits<GlobalOrdinal>::name());
    ydoc.get("Global Run Parameters")->add("LocalOrdinalType",TypeTraits<LocalOrdinal>::name());
    ydoc.add(title,"");
    ydoc.get(title)->add("Iterations",num_iters);
    ydoc.get(title)->add("Final Resid Norm",rnorm);

    GlobalOrdinal global_nrows = global_nx;
    global_nrows *= global_ny*global_nz;

    //flops-per-mv, flops-per-dot, flops-per-waxpy:
    double mv_flops = global_nnz*2.0;
    double dot_flops = global_nrows*2.0;
    double waxpy_flops = global_nrows*3.0;

#if MINIFE_KERNELS == 0
    //if MINIFE_KERNELS == 0 then we did a CG solve, and in that case
    //there were num_iters+1 matvecs, num_iters*2 dots, and num_iters*3+2 waxpys.
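    //(These counts follow the standard CG recurrence: one matvec per
    //iteration plus one to form the initial residual; two dot products per
    //iteration (r.r and p.Ap); and three axpy-style vector updates per
    //iteration (x, r, and p) plus two more during setup.)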
    mv_flops *= (num_iters+1);
    dot_flops *= (2*num_iters);
    waxpy_flops *= (3*num_iters+2);
#else
    //if MINIFE_KERNELS then we did one of each operation per iteration.
    mv_flops *= num_iters;
    dot_flops *= num_iters;
    waxpy_flops *= num_iters;
#endif

    double total_flops = mv_flops + dot_flops + waxpy_flops;

    double mv_mflops = -1;
    if (cg_times[MATVEC] > 1.e-4)
      mv_mflops = 1.e-6 * (mv_flops/cg_times[MATVEC]);

    double dot_mflops = -1;
    if (cg_times[DOT] > 1.e-4)
      dot_mflops = 1.e-6 * (dot_flops/cg_times[DOT]);

    double waxpy_mflops = -1;
    if (cg_times[WAXPY] > 1.e-4)
      waxpy_mflops = 1.e-6 * (waxpy_flops/cg_times[WAXPY]);

    double total_mflops = -1;
    if (cg_times[TOTAL] > 1.e-4)
      total_mflops = 1.e-6 * (total_flops/cg_times[TOTAL]);

    ydoc.get(title)->add("WAXPY Time",cg_times[WAXPY]);
    ydoc.get(title)->add("WAXPY Flops",waxpy_flops);
    if (waxpy_mflops >= 0)
      ydoc.get(title)->add("WAXPY Mflops",waxpy_mflops);
    else
      ydoc.get(title)->add("WAXPY Mflops","inf");

    ydoc.get(title)->add("DOT Time",cg_times[DOT]);
    ydoc.get(title)->add("DOT Flops",dot_flops);
    if (dot_mflops >= 0)
      ydoc.get(title)->add("DOT Mflops",dot_mflops);
    else
      ydoc.get(title)->add("DOT Mflops","inf");

    ydoc.get(title)->add("MATVEC Time",cg_times[MATVEC]);
    ydoc.get(title)->add("MATVEC Flops",mv_flops);
    if (mv_mflops >= 0)
      ydoc.get(title)->add("MATVEC Mflops",mv_mflops);
    else
      ydoc.get(title)->add("MATVEC Mflops","inf");

#ifdef MINIFE_FUSED
    ydoc.get(title)->add("MATVECDOT Time",cg_times[MATVECDOT]);
    ydoc.get(title)->add("MATVECDOT Flops",mv_flops);
    if (mv_mflops >= 0)
      ydoc.get(title)->add("MATVECDOT Mflops",mv_mflops);
    else
      ydoc.get(title)->add("MATVECDOT Mflops","inf");
#endif

#if MINIFE_KERNELS == 0
    ydoc.get(title)->add("Total","");
    ydoc.get(title)->get("Total")->add("Total CG Time",cg_times[TOTAL]);
    ydoc.get(title)->get("Total")->add("Total CG Flops",total_flops);
    if (total_mflops >= 0)
      ydoc.get(title)->get("Total")->add("Total CG Mflops",total_mflops);
    else
      ydoc.get(title)->get("Total")->add("Total CG Mflops","inf");
    ydoc.get(title)->add("Time per iteration",cg_times[TOTAL]/num_iters);
#endif
  }

  return verify_result;
}
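//The RUN_TIMED_FUNCTION macro used throughout driver() is defined elsewhere.
//A plausible expansion, mirroring the inline TIME_IT pattern near the top of
//driver() -- an assumption for illustration, not necessarily the real macro:

#define RUN_TIMED_FUNCTION(msg, fn, time_inc, time_total) \
  { \
    if (myproc==0) { \
      std::cout.width(30); \
      std::cout << msg; \
      std::cout.flush(); \
    } \
    timer_type rtf_t0 = mytimer(); \
    fn; \
    time_inc = mytimer() - rtf_t0; \
    time_total += time_inc; \
    if (myproc==0) { \
      std::cout << time_inc << "s, total time: " << time_total << std::endl; \
    } \
  }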
template<typename MatrixType>
size_t compute_matrix_stats(const MatrixType& A, int myproc, int numprocs,
                            YAML_Doc& ydoc)
{
  typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal;
  typedef typename MatrixType::LocalOrdinalType LocalOrdinal;
  typedef typename MatrixType::ScalarType Scalar;

  GlobalOrdinal min_nrows = 0, max_nrows = 0, global_nrows = 0;
  int min_proc = 0, max_proc = 0;

  GlobalOrdinal local_nrows = A.rows.size();
  get_global_min_max(local_nrows, global_nrows, min_nrows, min_proc,
                     max_nrows, max_proc);

  //Gather stats on global, min/max matrix num-nonzeros:

  double local_nnz = A.num_nonzeros();
  double dglobal_nnz = 0, dmin_nnz = 0, dmax_nnz = 0;
  get_global_min_max(local_nnz, dglobal_nnz, dmin_nnz, min_proc,
                     dmax_nnz, max_proc);

  double avg_nrows = global_nrows;
  avg_nrows /= numprocs;
  double avg_nnz = dglobal_nnz;
  avg_nnz /= numprocs;

  double mem_overhead_MB = parallel_memory_overhead_MB(A);

  size_t global_nnz = static_cast<size_t>(std::ceil(dglobal_nnz));
  size_t min_nnz = static_cast<size_t>(std::ceil(dmin_nnz));
  size_t max_nnz = static_cast<size_t>(std::ceil(dmax_nnz));
  size_t global_num_rows = global_nrows;

  if (myproc == 0) {
    ydoc.add("Matrix attributes","");
    ydoc.get("Matrix attributes")->add("Global Nrows",global_num_rows);
    ydoc.get("Matrix attributes")->add("Global NNZ",global_nnz);

    //compute how much memory the matrix occupies:
    //num-bytes = sizeof(GlobalOrdinal)*global_nrows   for A.rows
    //          + sizeof(LocalOrdinal)*global_nrows    for A.row_offsets
    //          + sizeof(GlobalOrdinal)*global_nnz     for A.packed_cols
    //          + sizeof(Scalar)*global_nnz            for A.packed_coefs

    double invGB = 1.0/(1024*1024*1024);
    double memGB = invGB*global_nrows*sizeof(GlobalOrdinal);
    memGB += invGB*global_nrows*sizeof(LocalOrdinal);
    memGB += invGB*global_nnz*sizeof(GlobalOrdinal);
    memGB += invGB*global_nnz*sizeof(Scalar);
    ydoc.get("Matrix attributes")->add("Global Memory (GB)",memGB);
    ydoc.get("Matrix attributes")->add("Pll Memory Overhead (MB)",mem_overhead_MB);

    size_t min_num_rows = min_nrows;
    size_t max_num_rows = max_nrows;
    ydoc.get("Matrix attributes")->add("Rows per proc MIN",min_num_rows);
    ydoc.get("Matrix attributes")->add("Rows per proc MAX",max_num_rows);
    ydoc.get("Matrix attributes")->add("Rows per proc AVG",avg_nrows);
    ydoc.get("Matrix attributes")->add("NNZ per proc MIN",min_nnz);
    ydoc.get("Matrix attributes")->add("NNZ per proc MAX",max_nnz);
    ydoc.get("Matrix attributes")->add("NNZ per proc AVG",avg_nnz);
  }

  return global_nnz;
}
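//A worked example of the memory formula in compute_matrix_stats, under
//assumed sizes: 4-byte ordinals, 8-byte scalars, 1e6 global rows, and
//roughly 27 nonzeros per row (the stencil of a hex8 discretization at an
//interior node); all numbers are illustrative.

#include <cstdio>

int main()
{
  double global_nrows = 1.0e6;
  double global_nnz   = 27.0 * global_nrows;
  double invGB = 1.0/(1024.0*1024.0*1024.0);
  double memGB = invGB*global_nrows*4   //A.rows         (GlobalOrdinal)
               + invGB*global_nrows*4   //A.row_offsets  (LocalOrdinal)
               + invGB*global_nnz*4     //A.packed_cols  (GlobalOrdinal)
               + invGB*global_nnz*8;    //A.packed_coefs (Scalar)
  std::printf("approx. matrix memory: %.3f GB\n", memGB); //prints ~0.309
  return 0;
}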