int driver(const Box& global_box, Box& my_box, Parameters& params, YAML_Doc& ydoc) { int global_nx = global_box[0][1]; int global_ny = global_box[1][1]; int global_nz = global_box[2][1]; int numprocs = 1, myproc = 0; #ifdef HAVE_MPI MPI_Comm_size(MPI_COMM_WORLD, &numprocs); MPI_Comm_rank(MPI_COMM_WORLD, &myproc); #endif if (params.load_imbalance > 0) { add_imbalance<GlobalOrdinal>(global_box, my_box, params.load_imbalance, ydoc); } float largest_imbalance = 0, std_dev = 0; compute_imbalance<GlobalOrdinal>(global_box, my_box, largest_imbalance, std_dev, ydoc, true); //Create a representation of the mesh: //Note that 'simple_mesh_description' is a virtual or conceptual //mesh that doesn't actually store mesh data. #ifdef TIME_IT if (myproc==0) { std::cout.width(30); std::cout << "creating/filling mesh..."; std::cout.flush(); } #endif timer_type t_start = mytimer(); timer_type t0 = mytimer(); simple_mesh_description<GlobalOrdinal> mesh(global_box, my_box); timer_type mesh_fill = mytimer() - t0; timer_type t_total = mytimer() - t_start; #ifdef TIME_IT if (myproc==0) { std::cout << mesh_fill << "s, total time: " << t_total << std::endl; } #endif //next we will generate the matrix structure. //Declare matrix object: #if defined(MINIFE_ELL_MATRIX) typedef ELLMatrix<Scalar,LocalOrdinal,GlobalOrdinal> MatrixType; #else typedef CSRMatrix<Scalar,LocalOrdinal,GlobalOrdinal> MatrixType; #endif MatrixType A; timer_type gen_structure; RUN_TIMED_FUNCTION("generating matrix structure...", generate_matrix_structure(mesh, A), gen_structure, t_total); GlobalOrdinal local_nrows = A.rows.size(); GlobalOrdinal my_first_row = local_nrows > 0 ? A.rows[0] : -1; Vector<Scalar,LocalOrdinal,GlobalOrdinal> b(my_first_row, local_nrows); Vector<Scalar,LocalOrdinal,GlobalOrdinal> x(my_first_row, local_nrows); //Assemble finite-element sub-matrices and sub-vectors into the global //linear system: timer_type fe_assembly; RUN_TIMED_FUNCTION("assembling FE data...", assemble_FE_data(mesh, A, b, params), fe_assembly, t_total); if (myproc == 0) { ydoc.add("Matrix structure generation",""); ydoc.get("Matrix structure generation")->add("Mat-struc-gen Time",gen_structure); ydoc.add("FE assembly",""); ydoc.get("FE assembly")->add("FE assembly Time",fe_assembly); } #ifdef MINIFE_DEBUG write_matrix("A_prebc.mtx", A); write_vector("b_prebc.vec", b); #endif //Now apply dirichlet boundary-conditions //(Apply the 0-valued surfaces first, then the 1-valued surface last.) timer_type dirbc_time; RUN_TIMED_FUNCTION("imposing Dirichlet BC...", impose_dirichlet(0.0, A, b, global_nx+1, global_ny+1, global_nz+1, mesh.bc_rows_0), dirbc_time, t_total); RUN_TIMED_FUNCTION("imposing Dirichlet BC...", impose_dirichlet(1.0, A, b, global_nx+1, global_ny+1, global_nz+1, mesh.bc_rows_1), dirbc_time, t_total); #ifdef MINIFE_DEBUG write_matrix("A.mtx", A); write_vector("b.vec", b); #endif //Transform global indices to local, set up communication information: timer_type make_local_time; RUN_TIMED_FUNCTION("making matrix indices local...", make_local_matrix(A), make_local_time, t_total); #ifdef MINIFE_DEBUG write_matrix("A_local.mtx", A); write_vector("b_local.vec", b); #endif size_t global_nnz = compute_matrix_stats(A, myproc, numprocs, ydoc); //Prepare to perform conjugate gradient solve: LocalOrdinal max_iters = 200; LocalOrdinal num_iters = 0; typedef typename TypeTraits<Scalar>::magnitude_type magnitude; magnitude rnorm = 0; magnitude tol = std::numeric_limits<magnitude>::epsilon(); timer_type cg_times[NUM_TIMERS]; typedef Vector<Scalar,LocalOrdinal,GlobalOrdinal> VectorType; t_total = mytimer() - t_start; bool matvec_with_comm_overlap = params.mv_overlap_comm_comp==1; int verify_result = 0; #if MINIFE_KERNELS != 0 if (myproc==0) { std::cout.width(30); std::cout << "Starting kernel timing loops ..." << std::endl; } max_iters = 500; x.coefs[0] = 0.9; if (matvec_with_comm_overlap) { time_kernels(A, b, x, matvec_overlap<MatrixType,VectorType>(), max_iters, rnorm, cg_times); } else { time_kernels(A, b, x, matvec_std<MatrixType,VectorType>(), max_iters, rnorm, cg_times); } num_iters = max_iters; std::string title("Kernel timings"); #else if (myproc==0) { std::cout << "Starting CG solver ... " << std::endl; } if (matvec_with_comm_overlap) { #ifdef MINIFE_CSR_MATRIX rearrange_matrix_local_external(A); cg_solve(A, b, x, matvec_overlap<MatrixType,VectorType>(), max_iters, tol, num_iters, rnorm, cg_times); #else std::cout << "ERROR, matvec with overlapping comm/comp only works with CSR matrix."<<std::endl; #endif } else { cg_solve(A, b, x, matvec_std<MatrixType,VectorType>(), max_iters, tol, num_iters, rnorm, cg_times); if (myproc == 0) { std::cout << "Final Resid Norm: " << rnorm << std::endl; } if (params.verify_solution > 0) { double tolerance = 0.06; bool verify_whole_domain = false; #ifdef MINIFE_DEBUG verify_whole_domain = true; #endif if (myproc == 0) { if (verify_whole_domain) std::cout << "verifying solution..." << std::endl; else std::cout << "verifying solution at ~ (0.5, 0.5, 0.5) ..." << std::endl; } verify_result = verify_solution(mesh, x, tolerance, verify_whole_domain); } } #ifdef MINIFE_DEBUG write_vector("x.vec", x); #endif std::string title("CG solve"); #endif if (myproc == 0) { ydoc.get("Global Run Parameters")->add("ScalarType",TypeTraits<Scalar>::name()); ydoc.get("Global Run Parameters")->add("GlobalOrdinalType",TypeTraits<GlobalOrdinal>::name()); ydoc.get("Global Run Parameters")->add("LocalOrdinalType",TypeTraits<LocalOrdinal>::name()); ydoc.add(title,""); ydoc.get(title)->add("Iterations",num_iters); ydoc.get(title)->add("Final Resid Norm",rnorm); GlobalOrdinal global_nrows = global_nx; global_nrows *= global_ny*global_nz; //flops-per-mv, flops-per-dot, flops-per-waxpy: double mv_flops = global_nnz*2.0; double dot_flops = global_nrows*2.0; double waxpy_flops = global_nrows*3.0; #if MINIFE_KERNELS == 0 //if MINIFE_KERNELS == 0 then we did a CG solve, and in that case //there were num_iters+1 matvecs, num_iters*2 dots, and num_iters*3+2 waxpys. mv_flops *= (num_iters+1); dot_flops *= (2*num_iters); waxpy_flops *= (3*num_iters+2); #else //if MINIFE_KERNELS then we did one of each operation per iteration. mv_flops *= num_iters; dot_flops *= num_iters; waxpy_flops *= num_iters; #endif double total_flops = mv_flops + dot_flops + waxpy_flops; double mv_mflops = -1; if (cg_times[MATVEC] > 1.e-4) mv_mflops = 1.e-6 * (mv_flops/cg_times[MATVEC]); double dot_mflops = -1; if (cg_times[DOT] > 1.e-4) dot_mflops = 1.e-6 * (dot_flops/cg_times[DOT]); double waxpy_mflops = -1; if (cg_times[WAXPY] > 1.e-4) waxpy_mflops = 1.e-6 * (waxpy_flops/cg_times[WAXPY]); double total_mflops = -1; if (cg_times[TOTAL] > 1.e-4) total_mflops = 1.e-6 * (total_flops/cg_times[TOTAL]); ydoc.get(title)->add("WAXPY Time",cg_times[WAXPY]); ydoc.get(title)->add("WAXPY Flops",waxpy_flops); if (waxpy_mflops >= 0) ydoc.get(title)->add("WAXPY Mflops",waxpy_mflops); else ydoc.get(title)->add("WAXPY Mflops","inf"); ydoc.get(title)->add("DOT Time",cg_times[DOT]); ydoc.get(title)->add("DOT Flops",dot_flops); if (dot_mflops >= 0) ydoc.get(title)->add("DOT Mflops",dot_mflops); else ydoc.get(title)->add("DOT Mflops","inf"); ydoc.get(title)->add("MATVEC Time",cg_times[MATVEC]); ydoc.get(title)->add("MATVEC Flops",mv_flops); if (mv_mflops >= 0) ydoc.get(title)->add("MATVEC Mflops",mv_mflops); else ydoc.get(title)->add("MATVEC Mflops","inf"); #ifdef MINIFE_FUSED ydoc.get(title)->add("MATVECDOT Time",cg_times[MATVECDOT]); ydoc.get(title)->add("MATVECDOT Flops",mv_flops); if (mv_mflops >= 0) ydoc.get(title)->add("MATVECDOT Mflops",mv_mflops); else ydoc.get(title)->add("MATVECDOT Mflops","inf"); #endif #if MINIFE_KERNELS == 0 ydoc.get(title)->add("Total",""); ydoc.get(title)->get("Total")->add("Total CG Time",cg_times[TOTAL]); ydoc.get(title)->get("Total")->add("Total CG Flops",total_flops); if (total_mflops >= 0) ydoc.get(title)->get("Total")->add("Total CG Mflops",total_mflops); else ydoc.get(title)->get("Total")->add("Total CG Mflops","inf"); ydoc.get(title)->add("Time per iteration",cg_times[TOTAL]/num_iters); #endif } return verify_result; }
int submain ( struct vtx_data **graph, /* data structure for graph */ int nvtxs, /* number of vertices in full graph */ int nedges, /* number of edges in graph */ int using_vwgts, /* are vertex weights being used? */ int using_ewgts, /* are edge weights being used? */ int igeom, /* geometry dimension if using inertial method */ float **coords, /* coordinates of vertices if used */ char *outassignname, /* name of assignment output file */ char *outfilename, /* in which to print output metrics */ int *assignment, /* set number of each vtx (length n) */ double *goal, /* desired sizes for each set */ int architecture, /* 0=> hypercube, d=> d-dimensional mesh */ int ndims_tot, /* total number hypercube dimensions */ int mesh_dims[3], /* extent of mesh in 3 directions */ int global_method, /* global partitioning algorithm */ int local_method, /* local partitioning algorithm */ int rqi_flag, /* use RQI/Symmlq eigensolver? */ int vmax, /* if so, how many vtxs to coarsen down to */ int ndims, /* number of eigenvectors (2^d sets) */ double eigtol, /* tolerance on eigenvectors */ long seed /* for random graph mutations */ ) { extern int ECHO; /* controls output to file or screen */ extern int CHECK_INPUT; /* should I check input for correctness? */ extern int SEQUENCE; /* just generate spectal ordering? */ extern int OUTPUT_ASSIGN; /* print assignment to a file? */ extern int OUTPUT_METRICS; /* controls formatting of output */ extern int PERTURB; /* perturb matrix if quad/octasection? */ extern int NSQRTS; /* number of square roots to precompute */ extern int KL_METRIC; /* KL interset cost: 1=>cuts, 2=>hops */ extern int LANCZOS_TYPE; /* type of Lanczos to use */ extern int REFINE_MAP; /* use greedy strategy to improve mapping? */ extern int REFINE_PARTITION;/* number of calls to pairwise_refine to make */ extern int VERTEX_COVER; /* use matching to reduce vertex separator? */ extern int CONNECTED_DOMAINS; /* force subdomain connectivity at end? */ extern int INTERNAL_VERTICES; /* greedily increase internal vtxs? */ extern int DEBUG_INTERNAL; /* debug code about force_internal? */ extern int DEBUG_REFINE_PART; /* debug code about refine_part? */ extern int DEBUG_REFINE_MAP; /* debug code about refine_map? */ extern int DEBUG_MACH_PARAMS; /* print out computed machine params? */ extern int DEBUG_TRACE; /* trace main execution path */ extern int PRINT_HEADERS; /* print section headings for output? */ extern int TIME_KERNELS; /* benchmark some numerical kernels? */ extern double start_time; /* time code was entered */ extern double total_time; /* (almost) total time spent in code */ extern double check_input_time; /* time spent checking input */ extern double partition_time; /* time spent partitioning graph */ extern double kernel_time; /* time spent benchmarking kernels */ extern double count_time; /* time spent evaluating the answer */ extern double print_assign_time; /* time spent writing output file */ FILE *outfile; /* output file */ struct vtx_data **graph2; /* data structure for graph */ int hop_mtx[MAXSETS][MAXSETS]; /* between-set hop cost for KL */ double *vwsqrt; /* sqrt of vertex weights (length nvtxs+1) */ double time, time1; /* timing variables */ char *graphname, *geomname; /* names of input files */ char *inassignname; /* name of assignment input file */ int old_nsqrts; /* old value of NSQRTS */ int append; /* append output to existing file? */ int nsets; /* number of sets created by each divide */ int nsets_tot; /* total number of sets */ int bits; /* used in computing hops */ int flag; /* return code from check_input */ int old_perturb=0; /* saves original pertubation flag */ int i, j, k; /* loop counters */ double seconds(); void setrandom(long int seed); int check_input(), refine_part(); void connect_enforce(); void setrandom(), makevwsqrt(), balance(), countup(); void force_internal(), sequence(), reflect_input(); void machine_params(), assign_out(), refine_map(); void time_out(), time_kernels(), strout(); if (DEBUG_TRACE > 0) { printf("<Entering submain>\n"); } /* First check all the input for consistency. */ if (architecture == 1) mesh_dims[1] = mesh_dims[2] = 1; else if (architecture == 2) mesh_dims[2] = 1; /* Check for simple special case of 1 processor. */ k = 0; if (architecture == 0) k = 1 << ndims_tot; else if (architecture > 0) k = mesh_dims[0] * mesh_dims[1] * mesh_dims[2]; if (k == 1) { for (i = 1; i <= nvtxs; i++) assignment[i] = 0; if (OUTPUT_ASSIGN > 0 && outassignname != NULL) { time1 = seconds(); assign_out(nvtxs, assignment, k, outassignname); print_assign_time += seconds() - time1; } return(0); } graphname = Graph_File_Name; geomname = Geometry_File_Name; inassignname = Assign_In_File_Name; /* Turn of perturbation if using bisection */ if (ndims == 1) { old_perturb = PERTURB; PERTURB = FALSE; } if (ECHO < 0 && outfilename != NULL) { /* Open output file */ outfile = fopen(outfilename, "r"); if (outfile != NULL) { append = TRUE; fclose(outfile); } else append = FALSE; outfile = fopen(outfilename, "a"); if (append) { fprintf(outfile, "\n------------------------------------------------\n\n"); } } else { outfile = NULL; } Output_File = outfile; if (outfile != NULL && PRINT_HEADERS) { fprintf(outfile, "\n Chaco 2.0\n"); fprintf(outfile, " Sandia National Laboratories\n\n"); } if (CHECK_INPUT) { /* Check the input for inconsistencies. */ time1 = seconds(); flag = check_input(graph, nvtxs, nedges, igeom, coords, graphname, assignment, goal, architecture, ndims_tot, mesh_dims, global_method, local_method, rqi_flag, &vmax, ndims, eigtol); check_input_time += seconds() - time1; if (flag) { strout("ERROR IN INPUT.\n"); return (1); } } if (ECHO != 0) { reflect_input(nvtxs, nedges, igeom, graphname, geomname, inassignname, outassignname, outfilename, architecture, ndims_tot, mesh_dims, global_method, local_method, rqi_flag, vmax, ndims, eigtol, seed, outfile); } if (PRINT_HEADERS) { printf("\n\nStarting to partition ...\n\n"); if (Output_File != NULL ) { fprintf(Output_File, "\n\nStarting to partition ... (residual, warning and error messages only)\n\n"); } } time = seconds(); /* Perform some one-time initializations. */ setrandom(seed); machine_params(&DOUBLE_EPSILON, &DOUBLE_MAX); if (DEBUG_MACH_PARAMS > 0) { printf("Machine parameters:\n"); printf(" DOUBLE_EPSILON = %e\n", DOUBLE_EPSILON); printf(" DOUBLE_MAX = %e\n", DOUBLE_MAX); } nsets = (1 << ndims); old_nsqrts = NSQRTS; if (nvtxs < NSQRTS && !using_vwgts) { NSQRTS = nvtxs; } SQRTS = smalloc_ret((NSQRTS + 1) * sizeof(double)); if (SQRTS == NULL) { strout("ERROR: No space to allocate sqrts\n"); return(1); } for (i = 1; i <= NSQRTS; i++) SQRTS[i] = sqrt((double) i); if (using_vwgts && (global_method == 1 || global_method == 2)) { vwsqrt = smalloc_ret((nvtxs + 1) * sizeof(double)); if (vwsqrt == NULL) { strout("ERROR: No space to allocate vwsqrt\n"); sfree(SQRTS); NSQRTS = old_nsqrts; return(1); } makevwsqrt(vwsqrt, graph, nvtxs); } else vwsqrt = NULL; if (TIME_KERNELS) { time1 = seconds(); time_kernels(graph, nvtxs, vwsqrt); kernel_time += seconds() - time1; } if (SEQUENCE) { sequence(graph, nvtxs, nedges, using_ewgts, vwsqrt, LANCZOS_TYPE, rqi_flag, vmax, eigtol); goto End_Label; } /* Initialize cost function for KL-spiff */ if (global_method == 1 || local_method == 1) { for (i = 0; i < nsets; i++) { hop_mtx[i][i] = 0; for (j = 0; j < i; j++) { if (KL_METRIC == 2) { /* Count hypercube hops */ hop_mtx[i][j] = 0; bits = i ^ j; while (bits) { if (bits & 1) { ++hop_mtx[i][j]; } bits >>= 1; } } else if (KL_METRIC == 1) { /* Count cut edges */ hop_mtx[i][j] = 1; } hop_mtx[j][i] = hop_mtx[i][j]; } }