Example #1
0
int
driver(const Box& global_box, Box& my_box,
       Parameters& params, YAML_Doc& ydoc)
{
  int global_nx = global_box[0][1];
  int global_ny = global_box[1][1];
  int global_nz = global_box[2][1];

  int numprocs = 1, myproc = 0;
#ifdef HAVE_MPI
  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
  MPI_Comm_rank(MPI_COMM_WORLD, &myproc);
#endif

  if (params.load_imbalance > 0) {
    add_imbalance<GlobalOrdinal>(global_box, my_box, params.load_imbalance, ydoc);
  }

  float largest_imbalance = 0, std_dev = 0;
  compute_imbalance<GlobalOrdinal>(global_box, my_box, largest_imbalance,
                                   std_dev, ydoc, true);


  //Create a representation of the mesh:
  //Note that 'simple_mesh_description' is a virtual or conceptual
  //mesh that doesn't actually store mesh data.
#ifdef TIME_IT
  if (myproc==0) {
    std::cout.width(30);
    std::cout << "creating/filling mesh...";
    std::cout.flush();
  }
#endif

  timer_type t_start = mytimer();
  timer_type t0 = mytimer();

  simple_mesh_description<GlobalOrdinal> mesh(global_box, my_box);

  timer_type mesh_fill = mytimer() - t0;
  timer_type t_total = mytimer() - t_start;

#ifdef TIME_IT
  if (myproc==0) {
    std::cout << mesh_fill << "s, total time: " << t_total << std::endl;
  }
#endif

  //next we will generate the matrix structure.

  //Declare matrix object:

#if defined(MINIFE_ELL_MATRIX)
  typedef ELLMatrix<Scalar,LocalOrdinal,GlobalOrdinal> MatrixType;
#else
  typedef CSRMatrix<Scalar,LocalOrdinal,GlobalOrdinal> MatrixType;
#endif

  MatrixType A;

  timer_type gen_structure;
  RUN_TIMED_FUNCTION("generating matrix structure...",
                     generate_matrix_structure(mesh, A),
                     gen_structure, t_total);

  GlobalOrdinal local_nrows = A.rows.size();
  GlobalOrdinal my_first_row = local_nrows > 0 ? A.rows[0] : -1;

  Vector<Scalar,LocalOrdinal,GlobalOrdinal> b(my_first_row, local_nrows);
  Vector<Scalar,LocalOrdinal,GlobalOrdinal> x(my_first_row, local_nrows);

  //Assemble finite-element sub-matrices and sub-vectors into the global
  //linear system:

  timer_type fe_assembly;
  RUN_TIMED_FUNCTION("assembling FE data...",
                     assemble_FE_data(mesh, A, b, params),
                     fe_assembly, t_total);

  if (myproc == 0) {
    ydoc.add("Matrix structure generation","");
    ydoc.get("Matrix structure generation")->add("Mat-struc-gen Time",gen_structure);
    ydoc.add("FE assembly","");
    ydoc.get("FE assembly")->add("FE assembly Time",fe_assembly);
  }

#ifdef MINIFE_DEBUG
  write_matrix("A_prebc.mtx", A);
  write_vector("b_prebc.vec", b);
#endif

  //Now apply dirichlet boundary-conditions
  //(Apply the 0-valued surfaces first, then the 1-valued surface last.)

  timer_type dirbc_time;
  RUN_TIMED_FUNCTION("imposing Dirichlet BC...",
            impose_dirichlet(0.0, A, b, global_nx+1, global_ny+1, global_nz+1, mesh.bc_rows_0), dirbc_time, t_total);
  RUN_TIMED_FUNCTION("imposing Dirichlet BC...",
            impose_dirichlet(1.0, A, b, global_nx+1, global_ny+1, global_nz+1, mesh.bc_rows_1), dirbc_time, t_total);

#ifdef MINIFE_DEBUG
  write_matrix("A.mtx", A);
  write_vector("b.vec", b);
#endif

  //Transform global indices to local, set up communication information:

  timer_type make_local_time;
  RUN_TIMED_FUNCTION("making matrix indices local...",
                     make_local_matrix(A),
                     make_local_time, t_total);

#ifdef MINIFE_DEBUG
  write_matrix("A_local.mtx", A);
  write_vector("b_local.vec", b);
#endif

  size_t global_nnz = compute_matrix_stats(A, myproc, numprocs, ydoc);

  //Prepare to perform conjugate gradient solve:

  LocalOrdinal max_iters = 200;
  LocalOrdinal num_iters = 0;
  typedef typename TypeTraits<Scalar>::magnitude_type magnitude;
  magnitude rnorm = 0;
  magnitude tol = std::numeric_limits<magnitude>::epsilon();

  timer_type cg_times[NUM_TIMERS];

  typedef Vector<Scalar,LocalOrdinal,GlobalOrdinal> VectorType;

  t_total = mytimer() - t_start;

  bool matvec_with_comm_overlap = params.mv_overlap_comm_comp==1;

  int verify_result = 0;

#if MINIFE_KERNELS != 0
  if (myproc==0) {
    std::cout.width(30);
    std::cout << "Starting kernel timing loops ..." << std::endl;
  }

  max_iters = 500;
  x.coefs[0] = 0.9;
  if (matvec_with_comm_overlap) {
    time_kernels(A, b, x, matvec_overlap<MatrixType,VectorType>(), max_iters, rnorm, cg_times);
  }
  else {
    time_kernels(A, b, x, matvec_std<MatrixType,VectorType>(), max_iters, rnorm, cg_times);
  }
  num_iters = max_iters;
  std::string title("Kernel timings");
#else
  if (myproc==0) {
    std::cout << "Starting CG solver ... " << std::endl;
  }

  if (matvec_with_comm_overlap) {
#ifdef MINIFE_CSR_MATRIX
    rearrange_matrix_local_external(A);
    cg_solve(A, b, x, matvec_overlap<MatrixType,VectorType>(), max_iters, tol,
           num_iters, rnorm, cg_times);
#else
    std::cout << "ERROR, matvec with overlapping comm/comp only works with CSR matrix."<<std::endl;
#endif
  }
  else {
    cg_solve(A, b, x, matvec_std<MatrixType,VectorType>(), max_iters, tol,
           num_iters, rnorm, cg_times);
    if (myproc == 0) {
      std::cout << "Final Resid Norm: " << rnorm << std::endl;
    }

    if (params.verify_solution > 0) {
      double tolerance = 0.06;
      bool verify_whole_domain = false;
  #ifdef MINIFE_DEBUG
      verify_whole_domain = true;
  #endif
      if (myproc == 0) {
        if (verify_whole_domain) std::cout << "verifying solution..." << std::endl;
        else std::cout << "verifying solution at ~ (0.5, 0.5, 0.5) ..." << std::endl;
      }
      verify_result = verify_solution(mesh, x, tolerance, verify_whole_domain);
    }
  }

#ifdef MINIFE_DEBUG
  write_vector("x.vec", x);
#endif
  std::string title("CG solve");
#endif

  if (myproc == 0) {
    ydoc.get("Global Run Parameters")->add("ScalarType",TypeTraits<Scalar>::name());
    ydoc.get("Global Run Parameters")->add("GlobalOrdinalType",TypeTraits<GlobalOrdinal>::name());
    ydoc.get("Global Run Parameters")->add("LocalOrdinalType",TypeTraits<LocalOrdinal>::name());
    ydoc.add(title,"");
    ydoc.get(title)->add("Iterations",num_iters);
    ydoc.get(title)->add("Final Resid Norm",rnorm);

    GlobalOrdinal global_nrows = global_nx;
    global_nrows *= global_ny*global_nz;

    //flops-per-mv, flops-per-dot, flops-per-waxpy:
    double mv_flops = global_nnz*2.0;
    double dot_flops = global_nrows*2.0;
    double waxpy_flops = global_nrows*3.0;

#if MINIFE_KERNELS == 0
//if MINIFE_KERNELS == 0 then we did a CG solve, and in that case
//there were num_iters+1 matvecs, num_iters*2 dots, and num_iters*3+2 waxpys.
    mv_flops *= (num_iters+1);
    dot_flops *= (2*num_iters);
    waxpy_flops *= (3*num_iters+2);
#else
//if MINIFE_KERNELS then we did one of each operation per iteration.
    mv_flops *= num_iters;
    dot_flops *= num_iters;
    waxpy_flops *= num_iters;
#endif

    double total_flops = mv_flops + dot_flops + waxpy_flops;

    double mv_mflops = -1;
    if (cg_times[MATVEC] > 1.e-4)
      mv_mflops = 1.e-6 * (mv_flops/cg_times[MATVEC]);

    double dot_mflops = -1;
    if (cg_times[DOT] > 1.e-4)
      dot_mflops = 1.e-6 * (dot_flops/cg_times[DOT]);

    double waxpy_mflops = -1;
    if (cg_times[WAXPY] > 1.e-4)
      waxpy_mflops = 1.e-6 *  (waxpy_flops/cg_times[WAXPY]);

    double total_mflops = -1;
    if (cg_times[TOTAL] > 1.e-4)
      total_mflops = 1.e-6 * (total_flops/cg_times[TOTAL]);

    ydoc.get(title)->add("WAXPY Time",cg_times[WAXPY]);
    ydoc.get(title)->add("WAXPY Flops",waxpy_flops);
    if (waxpy_mflops >= 0)
      ydoc.get(title)->add("WAXPY Mflops",waxpy_mflops);
    else
      ydoc.get(title)->add("WAXPY Mflops","inf");

    ydoc.get(title)->add("DOT Time",cg_times[DOT]);
    ydoc.get(title)->add("DOT Flops",dot_flops);
    if (dot_mflops >= 0)
      ydoc.get(title)->add("DOT Mflops",dot_mflops);
    else
      ydoc.get(title)->add("DOT Mflops","inf");

    ydoc.get(title)->add("MATVEC Time",cg_times[MATVEC]);
    ydoc.get(title)->add("MATVEC Flops",mv_flops);
    if (mv_mflops >= 0)
      ydoc.get(title)->add("MATVEC Mflops",mv_mflops);
    else
      ydoc.get(title)->add("MATVEC Mflops","inf");

#ifdef MINIFE_FUSED
    ydoc.get(title)->add("MATVECDOT Time",cg_times[MATVECDOT]);
    ydoc.get(title)->add("MATVECDOT Flops",mv_flops);
    if (mv_mflops >= 0)
      ydoc.get(title)->add("MATVECDOT Mflops",mv_mflops);
    else
      ydoc.get(title)->add("MATVECDOT Mflops","inf");
#endif

#if MINIFE_KERNELS == 0
    ydoc.get(title)->add("Total","");
    ydoc.get(title)->get("Total")->add("Total CG Time",cg_times[TOTAL]);
    ydoc.get(title)->get("Total")->add("Total CG Flops",total_flops);
    if (total_mflops >= 0)
      ydoc.get(title)->get("Total")->add("Total CG Mflops",total_mflops);
    else
      ydoc.get(title)->get("Total")->add("Total CG Mflops","inf");
    ydoc.get(title)->add("Time per iteration",cg_times[TOTAL]/num_iters);
#endif
  }

  return verify_result;
}
Example #2
0
int 
submain (
    struct vtx_data **graph,	/* data structure for graph */
    int nvtxs,		/* number of vertices in full graph */
    int nedges,		/* number of edges in graph */
    int using_vwgts,		/* are vertex weights being used? */
    int using_ewgts,		/* are edge weights being used? */
    int igeom,		/* geometry dimension if using inertial method */
    float **coords,		/* coordinates of vertices if used */
    char *outassignname,	/* name of assignment output file */
    char *outfilename,		/* in which to print output metrics */
    int *assignment,		/* set number of each vtx (length n) */
    double *goal,			/* desired sizes for each set */
    int architecture,		/* 0=> hypercube, d=> d-dimensional mesh */
    int ndims_tot,		/* total number hypercube dimensions */
    int mesh_dims[3],		/* extent of mesh in 3 directions */
    int global_method,	/* global partitioning algorithm */
    int local_method,		/* local partitioning algorithm */
    int rqi_flag,		/* use RQI/Symmlq eigensolver? */
    int vmax,			/* if so, how many vtxs to coarsen down to */
    int ndims,		/* number of eigenvectors (2^d sets) */
    double eigtol,		/* tolerance on eigenvectors */
    long seed			/* for random graph mutations */
)
{
    extern int ECHO;		/* controls output to file or screen */
    extern int CHECK_INPUT;	/* should I check input for correctness? */
    extern int SEQUENCE;	/* just generate spectal ordering? */
    extern int OUTPUT_ASSIGN;	/* print assignment to a file? */
    extern int OUTPUT_METRICS;	/* controls formatting of output */
    extern int PERTURB;		/* perturb matrix if quad/octasection? */
    extern int NSQRTS;		/* number of square roots to precompute */
    extern int KL_METRIC;	/* KL interset cost: 1=>cuts, 2=>hops */
    extern int LANCZOS_TYPE;	/* type of Lanczos to use */
    extern int REFINE_MAP;	/* use greedy strategy to improve mapping? */
    extern int REFINE_PARTITION;/* number of calls to pairwise_refine to make */
    extern int VERTEX_COVER;	/* use matching to reduce vertex separator? */
    extern int CONNECTED_DOMAINS;	/* force subdomain connectivity at end? */
    extern int INTERNAL_VERTICES;	/* greedily increase internal vtxs? */
    extern int DEBUG_INTERNAL;		/* debug code about force_internal? */
    extern int DEBUG_REFINE_PART;	/* debug code about refine_part? */
    extern int DEBUG_REFINE_MAP;	/* debug code about refine_map? */
    extern int DEBUG_MACH_PARAMS;	/* print out computed machine params? */
    extern int DEBUG_TRACE;	/* trace main execution path */
    extern int PRINT_HEADERS;	/* print section headings for output? */
    extern int TIME_KERNELS;	/* benchmark some numerical kernels? */
    extern double start_time;	/* time code was entered */
    extern double total_time;	/* (almost) total time spent in code */
    extern double check_input_time;	/* time spent checking input */
    extern double partition_time;	/* time spent partitioning graph */
    extern double kernel_time;	/* time spent benchmarking kernels */
    extern double count_time;	/* time spent evaluating the answer */
    extern double print_assign_time;	/* time spent writing output file */
    FILE     *outfile;		/* output file */
    struct vtx_data **graph2;	/* data structure for graph */
    int     hop_mtx[MAXSETS][MAXSETS];	/* between-set hop cost for KL */
    double   *vwsqrt;		/* sqrt of vertex weights (length nvtxs+1) */
    double    time, time1;	/* timing variables */
    char     *graphname, *geomname;	/* names of input files */
    char     *inassignname;	/* name of assignment input file */
    int       old_nsqrts;	/* old value of NSQRTS */
    int       append;		/* append output to existing file? */
    int       nsets;		/* number of sets created by each divide */
    int       nsets_tot;	/* total number of sets */
    int       bits;		/* used in computing hops */
    int       flag;		/* return code from check_input */
    int       old_perturb=0;	/* saves original pertubation flag */
    int       i, j, k;		/* loop counters */
    double    seconds();
    void      setrandom(long int seed);
    int       check_input(), refine_part();
    void      connect_enforce();
    void      setrandom(), makevwsqrt(), balance(), countup();
    void      force_internal(), sequence(), reflect_input();
    void      machine_params(), assign_out(), refine_map();
    void      time_out(), time_kernels(), strout();

    if (DEBUG_TRACE > 0) {
	printf("<Entering submain>\n");
    }

    /* First check all the input for consistency. */

    if (architecture == 1)
	mesh_dims[1] = mesh_dims[2] = 1;
    else if (architecture == 2)
	mesh_dims[2] = 1;
    
    /* Check for simple special case of 1 processor. */
    k = 0;
    if (architecture == 0)
      k = 1 << ndims_tot;
    else if (architecture > 0)
      k = mesh_dims[0] * mesh_dims[1] * mesh_dims[2];

    if (k == 1) {
	for (i = 1; i <= nvtxs; i++) assignment[i] = 0;

        if (OUTPUT_ASSIGN > 0 && outassignname != NULL) {
	    time1 = seconds();
	    assign_out(nvtxs, assignment, k, outassignname);
	    print_assign_time += seconds() - time1;
        }
	return(0);
    }

    graphname = Graph_File_Name;
    geomname = Geometry_File_Name;
    inassignname = Assign_In_File_Name;

    /* Turn of perturbation if using bisection */
    if (ndims == 1) {
        old_perturb = PERTURB;
	PERTURB = FALSE;
    }

    if (ECHO < 0 && outfilename != NULL) { /* Open output file */
	outfile = fopen(outfilename, "r");
	if (outfile != NULL) {
	    append = TRUE;
	    fclose(outfile);
	}
	else append = FALSE;
	outfile = fopen(outfilename, "a");
	if (append) {
	    fprintf(outfile, "\n------------------------------------------------\n\n");
	}
    }
    else {
	outfile = NULL;
    }

    Output_File = outfile;

    if (outfile != NULL && PRINT_HEADERS) {
        fprintf(outfile, "\n                    Chaco 2.0\n");
        fprintf(outfile, "          Sandia National Laboratories\n\n");
    }

    if (CHECK_INPUT) {		/* Check the input for inconsistencies. */
	time1 = seconds();

	flag = check_input(graph, nvtxs, nedges, igeom, coords,
			   graphname, assignment, goal,
			   architecture, ndims_tot, mesh_dims,
			   global_method, local_method, rqi_flag, &vmax, ndims,
			   eigtol);

	check_input_time += seconds() - time1;

	if (flag) {
	    strout("ERROR IN INPUT.\n");
	    return (1);
	}
    }

    if (ECHO != 0) {
	reflect_input(nvtxs, nedges, igeom, graphname, geomname,
		      inassignname, outassignname, outfilename,
		      architecture, ndims_tot, mesh_dims,
		      global_method, local_method, rqi_flag, vmax, ndims,
		      eigtol, seed, outfile);
    }

    if (PRINT_HEADERS) {
        printf("\n\nStarting to partition ...\n\n");
	if (Output_File != NULL ) {
            fprintf(Output_File,
	    "\n\nStarting to partition ... (residual, warning and error messages only)\n\n");
	}
    }

    time = seconds();

    /* Perform some one-time initializations. */
    setrandom(seed);
    machine_params(&DOUBLE_EPSILON, &DOUBLE_MAX);

    if (DEBUG_MACH_PARAMS > 0) {
	printf("Machine parameters:\n");
	printf("  DOUBLE_EPSILON = %e\n", DOUBLE_EPSILON);
	printf("  DOUBLE_MAX = %e\n", DOUBLE_MAX);
    }

    nsets = (1 << ndims);

    old_nsqrts = NSQRTS;
    if (nvtxs < NSQRTS && !using_vwgts) {
	NSQRTS = nvtxs;
    }
    SQRTS = smalloc_ret((NSQRTS + 1) * sizeof(double));
    if (SQRTS == NULL) {
	strout("ERROR: No space to allocate sqrts\n");
	return(1);
    }
    for (i = 1; i <= NSQRTS; i++)
	SQRTS[i] = sqrt((double) i);

    if (using_vwgts && (global_method == 1 || global_method == 2)) {
	vwsqrt = smalloc_ret((nvtxs + 1) * sizeof(double));
	if (vwsqrt == NULL) {
	    strout("ERROR: No space to allocate vwsqrt\n");
	    sfree(SQRTS);
	    NSQRTS = old_nsqrts;
	    return(1);
        }
	makevwsqrt(vwsqrt, graph, nvtxs);
    }
    else
	vwsqrt = NULL;

    if (TIME_KERNELS) {
	time1 = seconds();
	time_kernels(graph, nvtxs, vwsqrt);
	kernel_time += seconds() - time1;
    }

    if (SEQUENCE) {
	sequence(graph, nvtxs, nedges, using_ewgts, vwsqrt,
		 LANCZOS_TYPE, rqi_flag, vmax, eigtol);
	goto End_Label;
    }

    /* Initialize cost function for KL-spiff */
    if (global_method == 1 || local_method == 1) {
	for (i = 0; i < nsets; i++) {
	    hop_mtx[i][i] = 0;
	    for (j = 0; j < i; j++) {
		if (KL_METRIC == 2) {	/* Count hypercube hops */
		    hop_mtx[i][j] = 0;
		    bits = i ^ j;
		    while (bits) {
			if (bits & 1) {
			    ++hop_mtx[i][j];
			}
			bits >>= 1;
		    }
		}
		else if (KL_METRIC == 1) {	/* Count cut edges */
		    hop_mtx[i][j] = 1;
		}
		hop_mtx[j][i] = hop_mtx[i][j];
	    }
	}