int main (int argc, char *argv[]) {

  Teuchos::CommandLineProcessor clp;
  clp.setDocString("Tacho::DenseMatrixBase examples on Pthreads execution space.\n");

  int nthreads = 0;
  clp.setOption("nthreads", &nthreads, "Number of threads");

  int numa = 0;
  clp.setOption("numa", &numa, "Number of numa node");

  int core_per_numa = 0;
  clp.setOption("core-per-numa", &core_per_numa, "Number of cores per numa node");

  bool verbose = false;
  clp.setOption("enable-verbose", "disable-verbose", &verbose, "Flag for verbose printing");

  std::string file_input = "test.mtx";
  clp.setOption("file-input", &file_input, "Input file (MatrixMarket SPD matrix)");

  int treecut = 0;
  clp.setOption("treecut", &treecut, "Level to cut tree from bottom");

  int prunecut = 0;
  clp.setOption("prunecut", &prunecut, "Level to prune tree from bottom");

  clp.recogniseAllOptions(true);
  clp.throwExceptions(false);

  Teuchos::CommandLineProcessor::EParseCommandLineReturn r_parse= clp.parse( argc, argv );

  if (r_parse == Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED) return 0;
  if (r_parse != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL  ) return -1;

  int r_val = 0;
  {
    exec_space::initialize(nthreads, numa, core_per_numa);

#if (defined(HAVE_SHYLUTACHO_SCOTCH) && (defined(HAVE_SHYLUTACHO_CHOLMOD) \
        || defined(HAVE_SHYLUTACHO_AMESOS)))
    r_val = exampleGraphTools<exec_space>
      (file_input, treecut, prunecut, verbose);
#else
    r_val = -1;
    std::cout << "Scotch or Cholmod is NOT configured in Trilinos" << std::endl;
#endif

    exec_space::finalize();
  }
  
  return r_val;
}
int main (int argc, char *argv[]) {

  Teuchos::CommandLineProcessor clp;
  clp.setDocString("Tacho::DenseMatrixBase examples on Pthreads execution space.\n");

  int nthreads = 0;
  clp.setOption("nthreads", &nthreads, "Number of threads");

  int numa = 0;
  clp.setOption("numa", &numa, "Number of numa node");

  int core_per_numa = 0;
  clp.setOption("core-per-numa", &core_per_numa, "Number of cores per numa node");

  bool verbose = false;
  clp.setOption("enable-verbose", "disable-verbose", &verbose, "Flag for verbose printing");

  int mmin = 1000;
  clp.setOption("mmin", &mmin, "C(mmin,mmin)");

  int mmax = 8000;
  clp.setOption("mmax", &mmax, "C(mmax,mmax)");

  int minc = 1000;
  clp.setOption("minc", &minc, "Increment of m");

  clp.recogniseAllOptions(true);
  clp.throwExceptions(false);

  Teuchos::CommandLineProcessor::EParseCommandLineReturn r_parse= clp.parse( argc, argv );

  if (r_parse == Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED) return 0;
  if (r_parse != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL  ) return -1;

  int r_val = 0;
  {
    exec_space::initialize();
    host_space::initialize(nthreads, numa, core_per_numa);

    r_val = exampleDenseMatrixBase<exec_space>
      (mmin, mmax, minc, 
       verbose);
    
    exec_space::finalize();
    host_space::finalize();
  }
  
  return r_val;
}
int main (int argc, char *argv[]) {

  Teuchos::CommandLineProcessor clp;
  clp.setDocString("This example interface of solver Kokkos::Threads execution space.\n");

  int nthreads = 1;
  clp.setOption("nthreads", &nthreads, "Number of threads");

  int numa = 0;
  clp.setOption("numa", &numa, "Number of numa node");

  int core_per_numa = 0;
  clp.setOption("core-per-numa", &core_per_numa, "Number of cores per numa node");

  bool verbose = false;
  clp.setOption("enable-verbose", "disable-verbose", &verbose, "Flag for verbose printing");

  string file_input = "test.mtx";
  clp.setOption("file-input", &file_input, "Input file (MatrixMarket SPD matrix)");

  int nrhs = 1;
  clp.setOption("nrhs", &nrhs, "Numer of right hand side");

  clp.recogniseAllOptions(true);
  clp.throwExceptions(false);

  Teuchos::CommandLineProcessor::EParseCommandLineReturn r_parse= clp.parse( argc, argv );

  if (r_parse == Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED) return 0;
  if (r_parse != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL  ) return -1;

  int r_val = 0;
  {
    exec_space::initialize(nthreads, numa, core_per_numa);
    exec_space::print_configuration(cout, true);

    r_val = exampleCholDirectSolver
      <value_type,ordinal_type,size_type,exec_space,void>
      (file_input,
       nrhs,
       nthreads,
       verbose);

    exec_space::finalize();
  }

  return r_val;
}
int main (int argc, char *argv[]) {

  Teuchos::CommandLineProcessor clp;
  clp.setDocString("This example program demonstrates TriSolveUnblocked algorithm on Kokkos::Serial execution space.\n");

  int nthreads = 1;
  clp.setOption("nthreads", &nthreads, "Number of threads");

  int max_task_dependence = 10;
  clp.setOption("max-task-dependence", &max_task_dependence, "Max number of task dependence");

  int team_size = 1;
  clp.setOption("team-size", &team_size, "Team size");

  bool verbose = false;
  clp.setOption("enable-verbose", "disable-verbose", &verbose, "Flag for verbose printing");

  string file_input = "test.mtx";
  clp.setOption("file-input", &file_input, "Input file (MatrixMarket SPD matrix)");

  int nrhs = 1; 
  clp.setOption("nrhs", &nrhs, "Number of right hand side"); 

  int nb = nrhs; 
  clp.setOption("nb", &nb, "Blocksize of right hand side"); 

  clp.recogniseAllOptions(true);
  clp.throwExceptions(false);

  Teuchos::CommandLineProcessor::EParseCommandLineReturn r_parse= clp.parse( argc, argv );

  if (r_parse == Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED) return 0;
  if (r_parse != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL  ) return -1;
  
  int r_val = 0;
  {
    exec_space::initialize(nthreads);
    exec_space::print_configuration(cout, true);

    r_val = exampleTriSolveByBlocks
      <value_type,ordinal_type,size_type,exec_space,void>
      (file_input, nrhs, nb, nthreads, max_task_dependence, team_size, verbose);
    
    exec_space::finalize();
  }

  return r_val;
}
int main (int argc, char *argv[]) {

  Teuchos::CommandLineProcessor clp;
  clp.setDocString("This example program measure the performance of task data parallelism (barrier) on Kokkos::Threads execution space.\n");

  int nthreads = 0;
  clp.setOption("nthreads", &nthreads, "Number of threads");

  int numa = 0;
  clp.setOption("numa", &numa, "Number of numa node");

  int core_per_numa = 0;
  clp.setOption("core-per-numa", &core_per_numa, "Number of cores per numa node");

  int league_size = 1;
  clp.setOption("league-size", &league_size, "League size");

  int team_size = 1;
  clp.setOption("team-size", &team_size, "Team size");

  int ntasks = 100;
  clp.setOption("ntasks", &ntasks, "Number of tasks to be spawned");

  bool verbose = false;
  clp.setOption("enable-verbose", "disable-verbose", &verbose, "Flag for verbose printing");

  clp.recogniseAllOptions(true);
  clp.throwExceptions(false);

  Teuchos::CommandLineProcessor::EParseCommandLineReturn r_parse= clp.parse( argc, argv );

  if (r_parse == Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED) return 0;
  if (r_parse != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL  ) return -1;

  int r_val = 0;
  {
    exec_space::initialize(nthreads, numa, core_per_numa);
    exec_space::print_configuration(cout, true);

    r_val = exampleKokkosDataData<exec_space,value_type>((ntasks > MAXTASKS ? MAXTASKS : ntasks), league_size, team_size, verbose);

    exec_space::finalize();
  }

  return r_val;
}
Example #6
0
int main(int argc, char *argv[]) {

  Teuchos::CommandLineProcessor clp;
  clp.setDocString("Intrepid2::DynRankView_PerfTest01.\n");

  int nworkset = 8;
  clp.setOption("nworkset", &nworkset, "# of worksets");

  int C = 4096;
  clp.setOption("C", &C, "# of Cells in a workset");

  int order = 2;
  clp.setOption("order", &order, "cubature order");

  bool verbose = true;
  clp.setOption("enable-verbose", "disable-verbose", &verbose, "Flag for verbose printing");

  clp.recogniseAllOptions(true);
  clp.throwExceptions(false);

  Teuchos::CommandLineProcessor::EParseCommandLineReturn r_parse= clp.parse( argc, argv );

  if (r_parse == Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED) return 0;
  if (r_parse != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL  ) return -1;

  Kokkos::initialize();

  if (verbose) 
    std::cout << "Testing datatype double\n";

  const int r_val_double = Intrepid2::Test::ComputeBasis_HGRAD
    <double,Kokkos::Cuda>(nworkset,
                            C,
                            order,
                            verbose);
  return r_val_double;
}
Example #7
0
int main(int argc, char *argv[]) {   

// Initialize MPI
#ifdef HAVE_MPI
  MPI_Init(&argc,&argv);
#endif

  // Create a communicator for Epetra objects
  Teuchos::RCP<const Epetra_Comm> globalComm;
#ifdef HAVE_MPI
  globalComm = Teuchos::rcp(new Epetra_MpiComm(MPI_COMM_WORLD));
#else
  globalComm = Teuchos::rcp(new Epetra_SerialComm);
#endif
  int MyPID = globalComm->MyPID();

  try {

    // Setup command line options
    Teuchos::CommandLineProcessor CLP;
    CLP.setDocString(
      "This example runs a variety of stochastic Galerkin solvers.\n");

    int n = 32;
    CLP.setOption("num_mesh", &n, "Number of mesh points in each direction");

    bool symmetric = false;
    CLP.setOption("symmetric", "unsymmetric", &symmetric, 
		  "Symmetric discretization");

    int num_spatial_procs = -1;
    CLP.setOption("num_spatial_procs", &num_spatial_procs, "Number of spatial processors (set -1 for all available procs)");

    bool rebalance_stochastic_graph = false;
    CLP.setOption("rebalance", "no-rebalance", &rebalance_stochastic_graph, 
		  "Rebalance parallel stochastic graph (requires Isorropia)");

    SG_RF randField = UNIFORM;
    CLP.setOption("rand_field", &randField, 
		   num_sg_rf, sg_rf_values, sg_rf_names,
		  "Random field type");

    double mean = 0.2;
    CLP.setOption("mean", &mean, "Mean");

    double sigma = 0.1;
    CLP.setOption("std_dev", &sigma, "Standard deviation");

    double weightCut = 1.0;
    CLP.setOption("weight_cut", &weightCut, "Weight cut");

    int num_KL = 2;
    CLP.setOption("num_kl", &num_KL, "Number of KL terms");

    int p = 3;
    CLP.setOption("order", &p, "Polynomial order");

    bool normalize_basis = true;
    CLP.setOption("normalize", "unnormalize", &normalize_basis, 
		  "Normalize PC basis");
    
    SG_Solver solve_method = SG_KRYLOV;
    CLP.setOption("sg_solver", &solve_method, 
		  num_sg_solver, sg_solver_values, sg_solver_names, 
		  "SG solver method");

    Krylov_Method outer_krylov_method = GMRES;
    CLP.setOption("outer_krylov_method", &outer_krylov_method, 
		  num_krylov_method, krylov_method_values, krylov_method_names, 
		  "Outer Krylov method (for Krylov-based SG solver)");

    Krylov_Solver outer_krylov_solver = AZTECOO;
    CLP.setOption("outer_krylov_solver", &outer_krylov_solver, 
		  num_krylov_solver, krylov_solver_values, krylov_solver_names, 
		  "Outer linear solver");

    double outer_tol = 1e-12;
    CLP.setOption("outer_tol", &outer_tol, "Outer solver tolerance");

    int outer_its = 1000;
    CLP.setOption("outer_its", &outer_its, "Maximum outer iterations");

    Krylov_Method inner_krylov_method = GMRES;
    CLP.setOption("inner_krylov_method", &inner_krylov_method, 
		  num_krylov_method, krylov_method_values, krylov_method_names, 
		  "Inner Krylov method (for G-S, Jacobi, etc...)");

    Krylov_Solver inner_krylov_solver = AZTECOO;
    CLP.setOption("inner_krylov_solver", &inner_krylov_solver, 
		  num_krylov_solver, krylov_solver_values, krylov_solver_names, 
		  "Inner linear solver");

    double inner_tol = 3e-13;
    CLP.setOption("inner_tol", &inner_tol, "Inner solver tolerance");

    int inner_its = 1000;
    CLP.setOption("inner_its", &inner_its, "Maximum inner iterations");

    SG_Op opMethod = MATRIX_FREE;
    CLP.setOption("sg_operator_method", &opMethod, 
		  num_sg_op, sg_op_values, sg_op_names,
		  "Operator method");

    SG_Prec precMethod = AGS;
    CLP.setOption("sg_prec_method", &precMethod, 
		  num_sg_prec, sg_prec_values, sg_prec_names,
		  "Preconditioner method");

    double gs_prec_tol = 1e-1;
    CLP.setOption("gs_prec_tol", &gs_prec_tol, "Gauss-Seidel preconditioner tolerance");

    int gs_prec_its = 1;
    CLP.setOption("gs_prec_its", &gs_prec_its, "Maximum Gauss-Seidel preconditioner iterations");

    CLP.parse( argc, argv );

    if (MyPID == 0) {
      std::cout << "Summary of command line options:" << std::endl
		<< "\tnum_mesh            = " << n << std::endl
		<< "\tsymmetric           = " << symmetric << std::endl
		<< "\tnum_spatial_procs   = " << num_spatial_procs << std::endl
		<< "\trebalance           = " << rebalance_stochastic_graph
		<< std::endl
		<< "\trand_field          = " << sg_rf_names[randField] 
		<< std::endl
		<< "\tmean                = " << mean << std::endl
		<< "\tstd_dev             = " << sigma << std::endl
		<< "\tweight_cut          = " << weightCut << std::endl
		<< "\tnum_kl              = " << num_KL << std::endl
		<< "\torder               = " << p << std::endl
		<< "\tnormalize_basis     = " << normalize_basis << std::endl
		<< "\tsg_solver           = " << sg_solver_names[solve_method] 
		<< std::endl
		<< "\touter_krylov_method = " 
		<< krylov_method_names[outer_krylov_method] << std::endl
		<< "\touter_krylov_solver = " 
		<< krylov_solver_names[outer_krylov_solver] << std::endl
		<< "\touter_tol           = " << outer_tol << std::endl
		<< "\touter_its           = " << outer_its << std::endl
		<< "\tinner_krylov_method = " 
		<< krylov_method_names[inner_krylov_method] << std::endl
		<< "\tinner_krylov_solver = " 
		<< krylov_solver_names[inner_krylov_solver] << std::endl
		<< "\tinner_tol           = " << inner_tol << std::endl
		<< "\tinner_its           = " << inner_its << std::endl
		<< "\tsg_operator_method  = " << sg_op_names[opMethod] 
		<< std::endl
		<< "\tsg_prec_method      = " << sg_prec_names[precMethod] 
		<< std::endl
		<< "\tgs_prec_tol         = " << gs_prec_tol << std::endl
		<< "\tgs_prec_its         = " << gs_prec_its << std::endl;
    }

    bool nonlinear_expansion = false;
    if (randField == UNIFORM || randField == RYS)
      nonlinear_expansion = false;
    else if (randField == LOGNORMAL)
      nonlinear_expansion = true;
    bool scaleOP = true; 

    {
    TEUCHOS_FUNC_TIME_MONITOR("Total PCE Calculation Time");

    // Create Stochastic Galerkin basis and expansion
    Teuchos::Array< Teuchos::RCP<const Stokhos::OneDOrthogPolyBasis<int,double> > > bases(num_KL); 
    for (int i=0; i<num_KL; i++)
      if (randField == UNIFORM)
        bases[i] = Teuchos::rcp(new Stokhos::LegendreBasis<int,double>(p,normalize_basis));
      else if (randField == RYS)
        bases[i] = Teuchos::rcp(new Stokhos::RysBasis<int,double>(p,weightCut,normalize_basis));
      else if (randField == LOGNORMAL)      
        bases[i] = Teuchos::rcp(new Stokhos::HermiteBasis<int,double>(p,normalize_basis));

    //  bases[i] = Teuchos::rcp(new Stokhos::DiscretizedStieltjesBasis<int,double>("beta",p,&uniform_weight,-weightCut,weightCut,true));
    Teuchos::RCP<const Stokhos::CompletePolynomialBasis<int,double> > basis = 
      Teuchos::rcp(new Stokhos::CompletePolynomialBasis<int,double>(bases));
    int sz = basis->size();
    Teuchos::RCP<Stokhos::Sparse3Tensor<int,double> > Cijk;
    if (nonlinear_expansion)
      Cijk = basis->computeTripleProductTensor(sz);
    else
      Cijk = basis->computeTripleProductTensor(num_KL+1);
    Teuchos::RCP<Stokhos::OrthogPolyExpansion<int,double> > expansion = 
      Teuchos::rcp(new Stokhos::AlgebraicOrthogPolyExpansion<int,double>(basis,
									 Cijk));
    if (MyPID == 0)
      std::cout << "Stochastic Galerkin expansion size = " << sz << std::endl;

    // Create stochastic parallel distribution
    Teuchos::ParameterList parallelParams;
    parallelParams.set("Number of Spatial Processors", num_spatial_procs);
    parallelParams.set("Rebalance Stochastic Graph", 
		       rebalance_stochastic_graph);
    Teuchos::RCP<Stokhos::ParallelData> sg_parallel_data =
      Teuchos::rcp(new Stokhos::ParallelData(basis, Cijk, globalComm,
					     parallelParams));
    Teuchos::RCP<const EpetraExt::MultiComm> sg_comm = 
      sg_parallel_data->getMultiComm();
    Teuchos::RCP<const Epetra_Comm> app_comm = 
      sg_parallel_data->getSpatialComm();
    
    // Create application
    Teuchos::RCP<twoD_diffusion_ME> model =
      Teuchos::rcp(new twoD_diffusion_ME(app_comm, n, num_KL, sigma, 
					 mean, basis, nonlinear_expansion,
					 symmetric));

    // Set up NOX parameters
    Teuchos::RCP<Teuchos::ParameterList> noxParams = 
      Teuchos::rcp(new Teuchos::ParameterList);

    // Set the nonlinear solver method
    noxParams->set("Nonlinear Solver", "Line Search Based");

    // Set the printing parameters in the "Printing" sublist
    Teuchos::ParameterList& printParams = noxParams->sublist("Printing");
    printParams.set("MyPID", MyPID); 
    printParams.set("Output Precision", 3);
    printParams.set("Output Processor", 0);
    printParams.set("Output Information", 
                    NOX::Utils::OuterIteration + 
                    NOX::Utils::OuterIterationStatusTest + 
                    NOX::Utils::InnerIteration +
		    //NOX::Utils::Parameters + 
		    NOX::Utils::Details + 
		    NOX::Utils::LinearSolverDetails +
                    NOX::Utils::Warning + 
                    NOX::Utils::Error);

    // Create printing utilities
    NOX::Utils utils(printParams);

    // Sublist for line search 
    Teuchos::ParameterList& searchParams = noxParams->sublist("Line Search");
    searchParams.set("Method", "Full Step");

    // Sublist for direction
    Teuchos::ParameterList& dirParams = noxParams->sublist("Direction");
    dirParams.set("Method", "Newton");
    Teuchos::ParameterList& newtonParams = dirParams.sublist("Newton");
    newtonParams.set("Forcing Term Method", "Constant");

    // Sublist for linear solver for the Newton method
    Teuchos::ParameterList& lsParams = newtonParams.sublist("Linear Solver");

    // Alternative linear solver list for Stratimikos
    Teuchos::ParameterList& stratLinSolParams = 
      newtonParams.sublist("Stratimikos Linear Solver");
    // Teuchos::ParameterList& noxStratParams = 
    //   stratLinSolParams.sublist("NOX Stratimikos Options");
    Teuchos::ParameterList& stratParams = 
      stratLinSolParams.sublist("Stratimikos");

    // Sublist for convergence tests
    Teuchos::ParameterList& statusParams = noxParams->sublist("Status Tests");
    statusParams.set("Test Type", "Combo");
    statusParams.set("Number of Tests", 2);
    statusParams.set("Combo Type", "OR");
    Teuchos::ParameterList& normF = statusParams.sublist("Test 0");
    normF.set("Test Type", "NormF");
    normF.set("Tolerance", outer_tol);
    normF.set("Scale Type", "Scaled");
    Teuchos::ParameterList& maxIters = statusParams.sublist("Test 1");
    maxIters.set("Test Type", "MaxIters");
    maxIters.set("Maximum Iterations", 1);

    // Create NOX interface
    Teuchos::RCP<NOX::Epetra::ModelEvaluatorInterface> det_nox_interface = 
       Teuchos::rcp(new NOX::Epetra::ModelEvaluatorInterface(model));

     // Create NOX linear system object
    Teuchos::RCP<const Epetra_Vector> det_u = model->get_x_init();
    Teuchos::RCP<Epetra_Operator> det_A = model->create_W();
    Teuchos::RCP<NOX::Epetra::Interface::Required> det_iReq = det_nox_interface;
    Teuchos::RCP<NOX::Epetra::Interface::Jacobian> det_iJac = det_nox_interface;
    Teuchos::ParameterList det_printParams;
    det_printParams.set("MyPID", MyPID); 
    det_printParams.set("Output Precision", 3);
    det_printParams.set("Output Processor", 0);
    det_printParams.set("Output Information", NOX::Utils::Error);
    
    Teuchos::ParameterList det_lsParams;
    Teuchos::ParameterList& det_stratParams = 
      det_lsParams.sublist("Stratimikos");
    if (inner_krylov_solver == AZTECOO) {
      det_stratParams.set("Linear Solver Type", "AztecOO");
      Teuchos::ParameterList& aztecOOParams = 
	det_stratParams.sublist("Linear Solver Types").sublist("AztecOO").sublist("Forward Solve");
      Teuchos::ParameterList& aztecOOSettings =
	aztecOOParams.sublist("AztecOO Settings");
      if (inner_krylov_method == GMRES) {
	aztecOOSettings.set("Aztec Solver","GMRES");
      }
      else if (inner_krylov_method == CG) {
	aztecOOSettings.set("Aztec Solver","CG");
      }
      aztecOOSettings.set("Output Frequency", 0);
      aztecOOSettings.set("Size of Krylov Subspace", 100);
      aztecOOParams.set("Max Iterations", inner_its);
      aztecOOParams.set("Tolerance", inner_tol);
      Teuchos::ParameterList& verbParams = 
	det_stratParams.sublist("Linear Solver Types").sublist("AztecOO").sublist("VerboseObject");
      verbParams.set("Verbosity Level", "none");
    }
    else if (inner_krylov_solver == BELOS) {
      det_stratParams.set("Linear Solver Type", "Belos");
      Teuchos::ParameterList& belosParams = 
	det_stratParams.sublist("Linear Solver Types").sublist("Belos");
      Teuchos::ParameterList* belosSolverParams = NULL;
      if (inner_krylov_method == GMRES || inner_krylov_method == FGMRES) {
	belosParams.set("Solver Type","Block GMRES");
	belosSolverParams = 
	  &(belosParams.sublist("Solver Types").sublist("Block GMRES"));
	if (inner_krylov_method == FGMRES)
	  belosSolverParams->set("Flexible Gmres", true);
      }
      else if (inner_krylov_method == CG) {
	belosParams.set("Solver Type","Block CG");
	belosSolverParams = 
	  &(belosParams.sublist("Solver Types").sublist("Block CG"));
      }
      else if (inner_krylov_method == RGMRES) {
      	belosParams.set("Solver Type","GCRODR");
      	belosSolverParams = 
      	  &(belosParams.sublist("Solver Types").sublist("GCRODR"));
      }
      belosSolverParams->set("Convergence Tolerance", inner_tol);
      belosSolverParams->set("Maximum Iterations", inner_its);
      belosSolverParams->set("Output Frequency",0);
      belosSolverParams->set("Output Style",1);
      belosSolverParams->set("Verbosity",0);
      Teuchos::ParameterList& verbParams = belosParams.sublist("VerboseObject");
      verbParams.set("Verbosity Level", "none");
    }
    det_stratParams.set("Preconditioner Type", "ML");
    Teuchos::ParameterList& det_ML = 
      det_stratParams.sublist("Preconditioner Types").sublist("ML").sublist("ML Settings");
    ML_Epetra::SetDefaults("SA", det_ML);
    det_ML.set("ML output", 0);
    det_ML.set("max levels",5);
    det_ML.set("increasing or decreasing","increasing");
    det_ML.set("aggregation: type", "Uncoupled");
    det_ML.set("smoother: type","ML symmetric Gauss-Seidel");
    det_ML.set("smoother: sweeps",2);
    det_ML.set("smoother: pre or post", "both");
    det_ML.set("coarse: max size", 200);
#ifdef HAVE_ML_AMESOS
    det_ML.set("coarse: type","Amesos-KLU");
#else
    det_ML.set("coarse: type","Jacobi");
#endif
    Teuchos::RCP<NOX::Epetra::LinearSystem> det_linsys = 
      Teuchos::rcp(new NOX::Epetra::LinearSystemStratimikos(
		     det_printParams, det_lsParams, det_iJac, 
		     det_A, *det_u));
    
    // Setup stochastic Galerkin algorithmic parameters
    Teuchos::RCP<Teuchos::ParameterList> sgParams = 
      Teuchos::rcp(new Teuchos::ParameterList);
    Teuchos::ParameterList& sgOpParams = 
      sgParams->sublist("SG Operator");
    Teuchos::ParameterList& sgPrecParams = 
      sgParams->sublist("SG Preconditioner");

    if (!nonlinear_expansion) {
      sgParams->set("Parameter Expansion Type", "Linear");
      sgParams->set("Jacobian Expansion Type", "Linear");
    }
    if (opMethod == MATRIX_FREE)
      sgOpParams.set("Operator Method", "Matrix Free");
    else if (opMethod == KL_MATRIX_FREE)
      sgOpParams.set("Operator Method", "KL Matrix Free");
    else if (opMethod == KL_REDUCED_MATRIX_FREE) {
      sgOpParams.set("Operator Method", "KL Reduced Matrix Free");
      if (randField == UNIFORM || randField == RYS)
	sgOpParams.set("Number of KL Terms", num_KL);
      else
	sgOpParams.set("Number of KL Terms", basis->size());
      sgOpParams.set("KL Tolerance", outer_tol);
      sgOpParams.set("Sparse 3 Tensor Drop Tolerance", outer_tol);
      sgOpParams.set("Do Error Tests", true);
    }
    else if (opMethod == FULLY_ASSEMBLED)
      sgOpParams.set("Operator Method", "Fully Assembled");
    else
      TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error,
		       "Error!  Unknown operator method " << opMethod
			 << "." << std::endl);
    if (precMethod == MEAN)  {
      sgPrecParams.set("Preconditioner Method", "Mean-based");
      sgPrecParams.set("Mean Preconditioner Type", "ML");
      Teuchos::ParameterList& precParams =
	sgPrecParams.sublist("Mean Preconditioner Parameters");
      precParams = det_ML;
    }
    else if(precMethod == GS) {
      sgPrecParams.set("Preconditioner Method", "Gauss-Seidel");
      sgPrecParams.sublist("Deterministic Solver Parameters") = det_lsParams;
      sgPrecParams.set("Deterministic Solver", det_linsys);
      sgPrecParams.set("Max Iterations", gs_prec_its);
      sgPrecParams.set("Tolerance", gs_prec_tol);
    }
    else if (precMethod == AGS)  {
      sgPrecParams.set("Preconditioner Method", "Approximate Gauss-Seidel");
      if (outer_krylov_method == CG)
	sgPrecParams.set("Symmetric Gauss-Seidel", true);
      sgPrecParams.set("Mean Preconditioner Type", "ML");
      Teuchos::ParameterList& precParams =
	sgPrecParams.sublist("Mean Preconditioner Parameters");
      precParams = det_ML;
    }
    else if (precMethod == AJ)  {
      sgPrecParams.set("Preconditioner Method", "Approximate Jacobi");
      sgPrecParams.set("Mean Preconditioner Type", "ML");
      Teuchos::ParameterList& precParams =
        sgPrecParams.sublist("Mean Preconditioner Parameters");
      precParams = det_ML;
      Teuchos::ParameterList& jacobiOpParams =
	sgPrecParams.sublist("Jacobi SG Operator");
      jacobiOpParams.set("Only Use Linear Terms", true);
    }
    else if (precMethod == ASC)  {
      sgPrecParams.set("Preconditioner Method", "Approximate Schur Complement");
      sgPrecParams.set("Mean Preconditioner Type", "ML");
      Teuchos::ParameterList& precParams =
	sgPrecParams.sublist("Mean Preconditioner Parameters");
      precParams = det_ML;
    }
    else if (precMethod == KP)  {
      sgPrecParams.set("Preconditioner Method", "Kronecker Product");
      sgPrecParams.set("Only Use Linear Terms", true);
      sgPrecParams.set("Mean Preconditioner Type", "ML");
      Teuchos::ParameterList& meanPrecParams =
        sgPrecParams.sublist("Mean Preconditioner Parameters");
      meanPrecParams = det_ML;
      sgPrecParams.set("G Preconditioner Type", "Ifpack");
      Teuchos::ParameterList& GPrecParams =
        sgPrecParams.sublist("G Preconditioner Parameters");
      if (outer_krylov_method == GMRES || outer_krylov_method == FGMRES)
	GPrecParams.set("Ifpack Preconditioner", "ILUT");
      if (outer_krylov_method == CG)
	GPrecParams.set("Ifpack Preconditioner", "ICT");
      GPrecParams.set("Overlap", 1);
      GPrecParams.set("fact: drop tolerance", 1e-4);
      GPrecParams.set("fact: ilut level-of-fill", 1.0);
      GPrecParams.set("schwarz: combine mode", "Add");
    }
    else if (precMethod == NONE)  {
      sgPrecParams.set("Preconditioner Method", "None");
    }
    else
      TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error,
		       "Error!  Unknown preconditioner method " << precMethod
			 << "." << std::endl);

    // Create stochastic Galerkin model evaluator
    Teuchos::RCP<Stokhos::SGModelEvaluator> sg_model =
      Teuchos::rcp(new Stokhos::SGModelEvaluator(model, basis, Teuchos::null,
                                                 expansion, sg_parallel_data, 
						 sgParams, scaleOP));
    EpetraExt::ModelEvaluator::InArgs sg_inArgs = sg_model->createInArgs();
    EpetraExt::ModelEvaluator::OutArgs sg_outArgs = 
      sg_model->createOutArgs();

    // Set up stochastic parameters
    Teuchos::RCP<Stokhos::EpetraVectorOrthogPoly> sg_p_init =
      sg_model->create_p_sg(0);
    for (int i=0; i<num_KL; i++) {
      sg_p_init->term(i,0)[i] = 0.0;
      sg_p_init->term(i,1)[i] = 1.0;
    }
    sg_model->set_p_sg_init(0, *sg_p_init);

    // Setup stochastic initial guess
    Teuchos::RCP<Stokhos::EpetraVectorOrthogPoly> sg_x_init = 
      sg_model->create_x_sg();
    sg_x_init->init(0.0);
    sg_model->set_x_sg_init(*sg_x_init);

     // Create NOX interface
    Teuchos::RCP<NOX::Epetra::ModelEvaluatorInterface> nox_interface =
       Teuchos::rcp(new NOX::Epetra::ModelEvaluatorInterface(sg_model));

    // Create NOX stochastic linear system object
    Teuchos::RCP<const Epetra_Vector> u = sg_model->get_x_init();
    Teuchos::RCP<const Epetra_Map> base_map = model->get_x_map();
    Teuchos::RCP<const Epetra_Map> sg_map = sg_model->get_x_map();
    Teuchos::RCP<Epetra_Operator> A = sg_model->create_W();
    Teuchos::RCP<NOX::Epetra::Interface::Required> iReq = nox_interface;
    Teuchos::RCP<NOX::Epetra::Interface::Jacobian> iJac = nox_interface;

    // Build linear solver
    Teuchos::RCP<NOX::Epetra::LinearSystem> linsys;
    if (solve_method==SG_KRYLOV) {
      bool has_M = 
	sg_outArgs.supports(EpetraExt::ModelEvaluator::OUT_ARG_WPrec);
      Teuchos::RCP<Epetra_Operator> M;
      Teuchos::RCP<NOX::Epetra::Interface::Preconditioner> iPrec;
      if (has_M) {
	M = sg_model->create_WPrec()->PrecOp;
	iPrec = nox_interface;
      }
      stratParams.set("Preconditioner Type", "None");
      if (outer_krylov_solver == AZTECOO) {
	stratParams.set("Linear Solver Type", "AztecOO");
	Teuchos::ParameterList& aztecOOParams = 
	  stratParams.sublist("Linear Solver Types").sublist("AztecOO").sublist("Forward Solve");
	Teuchos::ParameterList& aztecOOSettings =
	  aztecOOParams.sublist("AztecOO Settings");
	if (outer_krylov_method == GMRES) {
	  aztecOOSettings.set("Aztec Solver","GMRES");
	}
	else if (outer_krylov_method == CG) {
	  aztecOOSettings.set("Aztec Solver","CG");
	}
	aztecOOSettings.set("Output Frequency", 1);
	aztecOOSettings.set("Size of Krylov Subspace", 100);
	aztecOOParams.set("Max Iterations", outer_its);
	aztecOOParams.set("Tolerance", outer_tol);
	stratLinSolParams.set("Preconditioner", "User Defined");
	if (has_M)
	  linsys = 
	    Teuchos::rcp(new NOX::Epetra::LinearSystemStratimikos(
			   printParams, stratLinSolParams, iJac, A, iPrec, M, 
			   *u, true));
	else
	  linsys = 
	    Teuchos::rcp(new NOX::Epetra::LinearSystemStratimikos(
			   printParams, stratLinSolParams, iJac, A, *u));
      }
      else if (outer_krylov_solver == BELOS){
	stratParams.set("Linear Solver Type", "Belos");
	Teuchos::ParameterList& belosParams = 
	  stratParams.sublist("Linear Solver Types").sublist("Belos");
	Teuchos::ParameterList* belosSolverParams = NULL;
	if (outer_krylov_method == GMRES || outer_krylov_method == FGMRES) {
	  belosParams.set("Solver Type","Block GMRES");
	  belosSolverParams = 
	    &(belosParams.sublist("Solver Types").sublist("Block GMRES"));
	  if (outer_krylov_method == FGMRES)
	    belosSolverParams->set("Flexible Gmres", true);
	}
	else if (outer_krylov_method == CG) {
	  belosParams.set("Solver Type","Block CG");
	  belosSolverParams = 
	    &(belosParams.sublist("Solver Types").sublist("Block CG"));
	}
	else if (inner_krylov_method == RGMRES) {
	  belosParams.set("Solver Type","GCRODR");
	  belosSolverParams = 
	    &(belosParams.sublist("Solver Types").sublist("GCRODR"));
	}
	belosSolverParams->set("Convergence Tolerance", outer_tol);
	belosSolverParams->set("Maximum Iterations", outer_its);
	belosSolverParams->set("Output Frequency",1);
	belosSolverParams->set("Output Style",1);
	belosSolverParams->set("Verbosity",33);
	stratLinSolParams.set("Preconditioner", "User Defined");
	if (has_M)
	  linsys = 
	    Teuchos::rcp(new NOX::Epetra::LinearSystemStratimikos(
			   printParams, stratLinSolParams, iJac, A, iPrec, M, 
			   *u, true));
	else
	  linsys = 
	    Teuchos::rcp(new NOX::Epetra::LinearSystemStratimikos(
			   printParams, stratLinSolParams, iJac, A, *u));
	  
      }
    }
    else if (solve_method==SG_GS) {
      lsParams.sublist("Deterministic Solver Parameters") = det_lsParams;
      lsParams.set("Max Iterations", outer_its);
      lsParams.set("Tolerance", outer_tol);
      linsys =
	Teuchos::rcp(new NOX::Epetra::LinearSystemSGGS(
		       printParams, lsParams, det_linsys, iReq, iJac, 
		       basis, sg_parallel_data, A, base_map, sg_map));
    }
    else {
      lsParams.sublist("Deterministic Solver Parameters") = det_lsParams;
      lsParams.set("Max Iterations", outer_its);
      lsParams.set("Tolerance", outer_tol);
      Teuchos::ParameterList& jacobiOpParams =
	lsParams.sublist("Jacobi SG Operator");
      jacobiOpParams.set("Only Use Linear Terms", true);
      linsys =
	Teuchos::rcp(new NOX::Epetra::LinearSystemSGJacobi(
		       printParams, lsParams, det_linsys, iReq, iJac, 
		       basis, sg_parallel_data, A, base_map, sg_map));
    }

    // Build NOX group
    Teuchos::RCP<NOX::Epetra::Group> grp = 
      Teuchos::rcp(new NOX::Epetra::Group(printParams, iReq, *u, linsys));
    
    // Create the Solver convergence test
    Teuchos::RCP<NOX::StatusTest::Generic> statusTests =
      NOX::StatusTest::buildStatusTests(statusParams, utils);

    // Create the solver
    Teuchos::RCP<NOX::Solver::Generic> solver = 
      NOX::Solver::buildSolver(grp, statusTests, noxParams);

    // Solve the system
    NOX::StatusTest::StatusType status;
    {
      TEUCHOS_FUNC_TIME_MONITOR("Total Solve Time");
      status = solver->solve();
    }

    // Get final solution
    const NOX::Epetra::Group& finalGroup = 
      dynamic_cast<const NOX::Epetra::Group&>(solver->getSolutionGroup());
    const Epetra_Vector& finalSolution = 
      (dynamic_cast<const NOX::Epetra::Vector&>(finalGroup.getX())).getEpetraVector();

    // Save final solution to file
    EpetraExt::VectorToMatrixMarketFile("nox_solver_stochastic_solution.mm", 
					finalSolution);

    // Save mean and variance to file
    Teuchos::RCP<Stokhos::EpetraVectorOrthogPoly> sg_x_poly = 
      sg_model->create_x_sg(View, &finalSolution);
    Epetra_Vector mean(*(model->get_x_map()));
    Epetra_Vector std_dev(*(model->get_x_map()));
    sg_x_poly->computeMean(mean);
    sg_x_poly->computeStandardDeviation(std_dev);
    EpetraExt::VectorToMatrixMarketFile("mean_gal.mm", mean);
    EpetraExt::VectorToMatrixMarketFile("std_dev_gal.mm", std_dev);
      
    // Evaluate SG responses at SG parameters
    Teuchos::RCP<const Epetra_Vector> sg_p = sg_model->get_p_init(1);
    Teuchos::RCP<Epetra_Vector> sg_g = 
      Teuchos::rcp(new Epetra_Vector(*(sg_model->get_g_map(0))));
    sg_inArgs.set_p(1, sg_p);
    sg_inArgs.set_x(Teuchos::rcp(&finalSolution,false));
    sg_outArgs.set_g(0, sg_g);
    sg_model->evalModel(sg_inArgs, sg_outArgs);

    // Print mean and standard deviation of response
    Teuchos::RCP<Stokhos::EpetraVectorOrthogPoly> sg_g_poly =
      sg_model->create_g_sg(0, View, sg_g.get());
    Epetra_Vector g_mean(*(model->get_g_map(0)));
    Epetra_Vector g_std_dev(*(model->get_g_map(0)));
    sg_g_poly->computeMean(g_mean);
    sg_g_poly->computeStandardDeviation(g_std_dev);
    std::cout.precision(16);
    // std::cout << "\nResponse Expansion = " << std::endl;
    // std::cout.precision(12);
    // sg_g_poly->print(std::cout);
    std::cout << "\nResponse Mean =      " << std::endl << g_mean << std::endl;
    std::cout << "Response Std. Dev. = " << std::endl << g_std_dev << std::endl;

    if (status == NOX::StatusTest::Converged && MyPID == 0) 
      utils.out() << "Example Passed!" << std::endl;

    }

    Teuchos::TimeMonitor::summarize(std::cout);
    Teuchos::TimeMonitor::zeroOutTimers();

  }
  
  catch (std::exception& e) {
    std::cout << e.what() << std::endl;
  }
  catch (string& s) {
    std::cout << s << std::endl;
  }
  catch (char *s) {
    std::cout << s << std::endl;
  }
  catch (...) {
    std::cout << "Caught unknown exception!" <<std:: endl;
  }

#ifdef HAVE_MPI
  MPI_Finalize() ;
#endif

}
Example #8
0
int main(int argc, char *argv[])
{
  bool success = true;
  bool verbose = false;
  try {

    Teuchos::oblackholestream blackHole;
    Teuchos::GlobalMPISession mpiSession (&argc, &argv, &blackHole);

    Teuchos::RCP<const Teuchos::Comm<int> > comm =
      Teuchos::DefaultComm<int>::getComm();

    const size_t num_sockets = Kokkos::hwloc::get_available_numa_count();
    const size_t num_cores_per_socket =
      Kokkos::hwloc::get_available_cores_per_numa();
    const size_t num_threads_per_core =
      Kokkos::hwloc::get_available_threads_per_core();

    // Setup command line options
    Teuchos::CommandLineProcessor CLP;
    CLP.setDocString(
      "This test performance of MP::Vector FEM assembly.\n");
    int nGrid = 32;
    CLP.setOption("n", &nGrid, "Number of mesh points in the each direction");
    int nIter = 10;
    CLP.setOption("ni", &nIter, "Number of assembly iterations");
    bool print = false;
    CLP.setOption("print", "no-print", &print, "Print debugging output");
    bool check = false;
    int num_cores = num_cores_per_socket * num_sockets;
    CLP.setOption("cores", &num_cores,
                  "Number of CPU cores to use (defaults to all)");
    int num_hyper_threads = num_threads_per_core;
    CLP.setOption("hyperthreads", &num_hyper_threads,
                  "Number of hyper threads per core to use (defaults to all)");
    int threads_per_vector = 1;
    CLP.setOption("threads_per_vector", &threads_per_vector,
                  "Number of threads to use within each vector");
    CLP.setOption("check", "no-check", &check, "Check correctness");
#ifdef KOKKOS_HAVE_SERIAL
    bool serial = true;
    CLP.setOption("serial", "no-serial", &serial, "Enable Serial device");
#endif
#ifdef KOKKOS_HAVE_PTHREAD
    bool threads = true;
    CLP.setOption("threads", "no-threads", &threads, "Enable Threads device");
#endif
#ifdef KOKKOS_HAVE_OPENMP
    bool openmp = true;
    CLP.setOption("openmp", "no-openmp", &openmp, "Enable OpenMP device");
#endif
#ifdef KOKKOS_HAVE_CUDA
    bool cuda = true;
    CLP.setOption("cuda", "no-cuda", &cuda, "Enable Cuda device");
    int cuda_threads_per_vector = 16;
    CLP.setOption("cuda_threads_per_vector", &cuda_threads_per_vector,
                  "Number of Cuda threads to use within each vector");
    int cuda_block_size = 256;
    CLP.setOption("cuda_block_size", &cuda_block_size,
                  "Cuda block size");
    int num_cuda_blocks = 0;
    CLP.setOption("num_cuda_blocks", &num_cuda_blocks,
                  "Number of Cuda blocks (0 implies the default choice)");
    int device_id = -1;
    CLP.setOption("device", &device_id, "CUDA device ID.  Set to default of -1 to use the default device as determined by the local node MPI rank and --ngpus");
    int ngpus = 1;
    CLP.setOption("ngpus", &ngpus, "Number of GPUs per node for multi-GPU runs via MPI");
#endif
    CLP.parse( argc, argv );

    int use_nodes[3];
    use_nodes[0] = nGrid; use_nodes[1] = nGrid; use_nodes[2] = nGrid;

    typedef int Ordinal;
    typedef double Scalar;
    const Kokkos::Example::FENL::AssemblyMethod Method =
      Kokkos::Example::FENL::FadElementOptimized;
    // const Kokkos::Example::FENL::AssemblyMethod Method =
    //   Kokkos::Example::FENL::Analytic;

#ifdef KOKKOS_HAVE_SERIAL
    if (serial) {
      typedef Kokkos::Serial Device;
      typedef Stokhos::StaticFixedStorage<Ordinal,Scalar,1,Device> Storage;

      Kokkos::Serial::initialize();

      if (comm->getRank() == 0)
        std::cout << std::endl
                  << "Serial performance with " << comm->getSize()
                  << " MPI ranks" << std::endl;

      Kokkos::Example::FENL::DeviceConfig dev_config(1, 1, 1);

      mainHost<Storage,Method>(comm, print, nIter, use_nodes, check,
                               dev_config);

      Kokkos::Serial::finalize();
    }
#endif

#ifdef KOKKOS_HAVE_PTHREAD
    if (threads) {
      typedef Kokkos::Threads Device;
      typedef Stokhos::StaticFixedStorage<Ordinal,Scalar,1,Device> Storage;

      Kokkos::Threads::initialize(num_cores*num_hyper_threads);

      if (comm->getRank() == 0)
        std::cout << std::endl
                  << "Threads performance with " << comm->getSize()
                  << " MPI ranks and " << num_cores*num_hyper_threads
                  << " threads per rank:" << std::endl;

      Kokkos::Example::FENL::DeviceConfig dev_config(num_cores,
                                       threads_per_vector,
                                       num_hyper_threads / threads_per_vector);

      mainHost<Storage,Method>(comm, print, nIter, use_nodes, check,
                               dev_config);

      Kokkos::Threads::finalize();
    }
#endif

#ifdef KOKKOS_HAVE_OPENMP
    if (openmp) {
      typedef Kokkos::OpenMP Device;
      typedef Stokhos::StaticFixedStorage<Ordinal,Scalar,1,Device> Storage;

      Kokkos::OpenMP::initialize(num_cores*num_hyper_threads);

      if (comm->getRank() == 0)
        std::cout << std::endl
                  << "OpenMP performance with " << comm->getSize()
                  << " MPI ranks and " << num_cores*num_hyper_threads
                  << " threads per rank:" << std::endl;

      Kokkos::Example::FENL::DeviceConfig dev_config(num_cores,
                                       threads_per_vector,
                                       num_hyper_threads / threads_per_vector);

      mainHost<Storage,Method>(comm, print, nIter, use_nodes, check,
                               dev_config);

      Kokkos::OpenMP::finalize();
    }
#endif

#ifdef KOKKOS_HAVE_CUDA
    if (cuda) {
      typedef Kokkos::Cuda Device;
      typedef Stokhos::StaticFixedStorage<Ordinal,Scalar,1,Device> Storage;

      if (device_id == -1) {
        int local_rank = 0;
        char *str;
        if ((str = std::getenv("SLURM_LOCALID")))
          local_rank = std::atoi(str);
        else if ((str = std::getenv("MV2_COMM_WORLD_LOCAL_RANK")))
          local_rank = std::atoi(str);
        else if ((str = getenv("OMPI_COMM_WORLD_LOCAL_RANK")))
          local_rank = std::atoi(str);
        device_id = local_rank % ngpus;

        // Check device is valid
        int num_device; cudaGetDeviceCount(&num_device);
        TEUCHOS_TEST_FOR_EXCEPTION(
          device_id >= num_device, std::logic_error,
          "Invalid device ID " << device_id << ".  You probably are trying" <<
          " to run with too many GPUs per node");
      }

      Kokkos::HostSpace::execution_space::initialize();
      Kokkos::Cuda::initialize(Kokkos::Cuda::SelectDevice(device_id));

      cudaDeviceProp deviceProp;
      cudaGetDeviceProperties(&deviceProp, device_id);
      if (comm->getRank() == 0)
        std::cout << std::endl
                  << "CUDA performance performance with " << comm->getSize()
                  << " MPI ranks and device " << device_id << " ("
                  << deviceProp.name << "):"
                  << std::endl;

      Kokkos::Example::FENL::DeviceConfig dev_config(
        num_cuda_blocks,
        cuda_threads_per_vector,
        cuda_threads_per_vector == 0 ? 0 : cuda_block_size / cuda_threads_per_vector);

      mainCuda<Storage,Method>(comm, print, nIter, use_nodes, check,
                               dev_config);

      Kokkos::HostSpace::execution_space::finalize();
      Kokkos::Cuda::finalize();
    }
#endif

  }
  TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success);

  if (success)
    return 0;
  return -1;
}
Example #9
0
int main(int argc, char *argv[]) {
  typedef double MeshScalar;
  typedef double BasisScalar;
  typedef Tpetra::DefaultPlatform::DefaultPlatformType::NodeType Node;
  typedef Teuchos::ScalarTraits<Scalar>::magnitudeType magnitudeType;

  //double g_mean_exp = 1.906587e-01;      // expected response mean
  //double g_std_dev_exp = 8.680605e-02;  // expected response std. dev.
  //double g_tol = 1e-6;               // tolerance on determining success



  using Teuchos::RCP;
  using Teuchos::rcp;
  using Teuchos::Array;
  using Teuchos::ArrayRCP;
  using Teuchos::ArrayView;
  using Teuchos::ParameterList;

// Initialize MPI
#ifdef HAVE_MPI
  MPI_Init(&argc,&argv);
#endif

//  feenableexcept(FE_ALL_EXCEPT);

  LocalOrdinal MyPID;

  try {

    // Create a communicator for Epetra objects
    RCP<const Epetra_Comm> globalComm;
#ifdef HAVE_MPI
    globalComm = rcp(new Epetra_MpiComm(MPI_COMM_WORLD));
#else
    globalComm = rcp(new Epetra_SerialComm);
#endif
    MyPID = globalComm->MyPID();

    // Setup command line options
    Teuchos::CommandLineProcessor CLP;
    CLP.setDocString(
      "This example runs an interlaced stochastic Galerkin solvers.\n");

    int n = 32;
    CLP.setOption("num_mesh", &n, "Number of mesh points in each direction");

    // multigrid specific options
    int minAggSize = 1;
    CLP.setOption("min_agg_size", &minAggSize, "multigrid aggregate size");
    int smootherSweeps = 3;
    CLP.setOption("smoother_sweeps", &smootherSweeps, "# multigrid smoother sweeps");
    int plainAgg=1;
    CLP.setOption("plain_aggregation", &plainAgg, "plain aggregation");
    LocalOrdinal nsSize=-1;
    CLP.setOption("nullspace_size", &nsSize, "nullspace dimension");


    bool symmetric = false;
    CLP.setOption("symmetric", "unsymmetric", &symmetric, 
                  "Symmetric discretization");

    int num_spatial_procs = -1;
    CLP.setOption("num_spatial_procs", &num_spatial_procs, "Number of spatial processors (set -1 for all available procs)");

    SG_RF randField = UNIFORM;
    CLP.setOption("rand_field", &randField, 
                  num_sg_rf, sg_rf_values, sg_rf_names,
                  "Random field type");

    double mu = 0.2;
    CLP.setOption("mean", &mu, "Mean");

    double s = 0.1;
    CLP.setOption("std_dev", &s, "Standard deviation");

    int num_KL = 2;
    CLP.setOption("num_kl", &num_KL, "Number of KL terms");

    int order = 3;
    CLP.setOption("order", &order, "Polynomial order");

    bool normalize_basis = true;
    CLP.setOption("normalize", "unnormalize", &normalize_basis, 
                  "Normalize PC basis");

    Krylov_Method solver_method = GMRES;
    CLP.setOption("solver_method", &solver_method, 
                  num_krylov_method, krylov_method_values, krylov_method_names, 
                  "Krylov solver method");

    SG_Prec prec_method = STOCHASTIC;
    CLP.setOption("prec_method", &prec_method, 
                  num_sg_prec, sg_prec_values, sg_prec_names,
                  "Preconditioner method");

    SG_Div division_method = DIRECT;
    CLP.setOption("division_method", &division_method, 
                  num_sg_div, sg_div_values, sg_div_names,
                  "Stochastic division method");

    SG_DivPrec divprec_method = NO;
    CLP.setOption("divprec_method", &divprec_method,
                  num_sg_divprec, sg_divprec_values, sg_divprec_names,
                  "Preconditioner for division method");
    Schur_option schur_option = diag;
    CLP.setOption("schur_option", &schur_option,
                  num_schur_option, Schur_option_values, schur_option_names,
                  "Schur option");
    Prec_option prec_option = whole;
    CLP.setOption("prec_option", &prec_option,
                  num_prec_option, Prec_option_values, prec_option_names,
                  "Prec option");


    double solver_tol = 1e-12;
    CLP.setOption("solver_tol", &solver_tol, "Outer solver tolerance");

    double div_tol = 1e-6;
    CLP.setOption("div_tol", &div_tol, "Tolerance in Iterative Solver");
    
    int prec_level = 1;
    CLP.setOption("prec_level", &prec_level, "Level in Schur Complement Prec 0->Solve A0u0=g0 with division; 1->Form 1x1 Schur Complement");

    int max_it_div = 50;
    CLP.setOption("max_it_div", &max_it_div, "Maximum # of Iterations in Iterative Solver for Division");

    bool equilibrate = true; //JJH 8/26/12 changing to true to match ETP example
    CLP.setOption("equilibrate", "noequilibrate", &equilibrate,
                  "Equilibrate the linear system");


    CLP.parse( argc, argv );

    if (MyPID == 0) {
      std::cout << "Summary of command line options:" << std::endl
                << "\tnum_mesh           = " << n << std::endl
                << "\tsymmetric          = " << symmetric << std::endl
                << "\tnum_spatial_procs  = " << num_spatial_procs << std::endl
                << "\trand_field         = " << sg_rf_names[randField] 
                << std::endl
                << "\tmean               = " << mu << std::endl
                << "\tstd_dev            = " << s << std::endl
                << "\tnum_kl             = " << num_KL << std::endl
                << "\torder              = " << order << std::endl
                << "\tnormalize_basis    = " << normalize_basis << std::endl
                << "\tsolver_method      = " << krylov_method_names[solver_method] << std::endl
                << "\tprec_method        = " << sg_prec_names[prec_method]    << std::endl
                << "\tdivision_method    = " << sg_div_names[division_method]     << std::endl
                << "\tdiv_tol            = " << div_tol << std::endl
                << "\tdiv_prec           = " << sg_divprec_names[divprec_method]      << std::endl
                << "\tprec_level         = " << prec_level << std::endl
                << "\tmax_it_div     = " << max_it_div << std::endl;
    }
    bool nonlinear_expansion = false;
    if (randField == UNIFORM)
      nonlinear_expansion = false;
    else if (randField == LOGNORMAL)
      nonlinear_expansion = true;

    {
    TEUCHOS_FUNC_TIME_MONITOR("Total PCE Calculation Time");

    // Create Stochastic Galerkin basis and expansion
    Teuchos::Array< RCP<const Stokhos::OneDOrthogPolyBasis<LocalOrdinal,BasisScalar> > > bases(num_KL); 
    for (LocalOrdinal i=0; i<num_KL; i++)
      if (randField == UNIFORM)
        bases[i] = rcp(new Stokhos::LegendreBasis<LocalOrdinal,BasisScalar>(order, normalize_basis));
      else if (randField == LOGNORMAL)
        bases[i] = rcp(new Stokhos::HermiteBasis<int,double>(order, normalize_basis));
    RCP<const Stokhos::CompletePolynomialBasis<LocalOrdinal,BasisScalar> > basis = 
      rcp(new Stokhos::CompletePolynomialBasis<LocalOrdinal,BasisScalar>(bases, 1e-12));
    LocalOrdinal sz = basis->size();
    RCP<Stokhos::Sparse3Tensor<LocalOrdinal,BasisScalar> > Cijk = 
      basis->computeTripleProductTensor(sz);
    RCP<const Stokhos::Quadrature<int,double> > quad = 
      rcp(new Stokhos::TensorProductQuadrature<int,double>(basis));
    RCP<ParameterList> expn_params = Teuchos::rcp(new ParameterList);
    if (division_method == MEAN_DIV) {
      expn_params->set("Division Strategy", "Mean-Based");
      expn_params->set("Use Quadrature for Division", false);
    }
    else if (division_method == DIRECT) {
      expn_params->set("Division Strategy", "Dense Direct");
      expn_params->set("Use Quadrature for Division", false);
    }
    else if (division_method == SPD_DIRECT) {
      expn_params->set("Division Strategy", "SPD Dense Direct");
      expn_params->set("Use Quadrature for Division", false);
    }
    else if (division_method == CGD) {
      expn_params->set("Division Strategy", "CG");
      expn_params->set("Use Quadrature for Division", false);
    }

    else if (division_method == QUAD) {
      expn_params->set("Use Quadrature for Division", true);
    }

    if (divprec_method == NO)
         expn_params->set("Prec Strategy", "None");
    else if (divprec_method == DIAG)
         expn_params->set("Prec Strategy", "Diag");
    else if (divprec_method == JACOBI)
         expn_params->set("Prec Strategy", "Jacobi");
    else if (divprec_method == GS)
         expn_params->set("Prec Strategy", "GS");
    else if (divprec_method == SCHUR)
         expn_params->set("Prec Strategy", "Schur");

    if (schur_option == diag)
        expn_params->set("Schur option", "diag");
    else
        expn_params->set("Schur option", "full");
    if (prec_option == linear)
        expn_params->set("Prec option", "linear");


    if (equilibrate)
      expn_params->set("Equilibrate", 1);
    else
      expn_params->set("Equilibrate", 0); 
    expn_params->set("Division Tolerance", div_tol);
    expn_params->set("prec_iter", prec_level);
    expn_params->set("max_it_div", max_it_div);

    RCP<Stokhos::OrthogPolyExpansion<LocalOrdinal,BasisScalar> > expansion = 
      rcp(new Stokhos::QuadOrthogPolyExpansion<LocalOrdinal,BasisScalar>(
            basis, Cijk, quad, expn_params));

    if (MyPID == 0)
      std::cout << "Stochastic Galerkin expansion size = " << sz << std::endl;

    // Create stochastic parallel distribution
    ParameterList parallelParams;
    parallelParams.set("Number of Spatial Processors", num_spatial_procs);
    // parallelParams.set("Rebalance Stochastic Graph", true);
    // Teuchos::ParameterList& isorropia_params = 
    //   parallelParams.sublist("Isorropia");
    // isorropia_params.set("Balance objective", "nonzeros");
    RCP<Stokhos::ParallelData> sg_parallel_data =
      rcp(new Stokhos::ParallelData(basis, Cijk, globalComm, parallelParams));
    RCP<const EpetraExt::MultiComm> sg_comm = 
      sg_parallel_data->getMultiComm();
    RCP<const Epetra_Comm> app_comm = 
      sg_parallel_data->getSpatialComm();

    // Create Teuchos::Comm from Epetra_Comm
    RCP< Teuchos::Comm<int> > teuchos_app_comm;
#ifdef HAVE_MPI
    RCP<const Epetra_MpiComm> app_mpi_comm = 
      Teuchos::rcp_dynamic_cast<const Epetra_MpiComm>(app_comm);
    RCP<const Teuchos::OpaqueWrapper<MPI_Comm> > raw_mpi_comm = 
      Teuchos::opaqueWrapper(app_mpi_comm->Comm());
    teuchos_app_comm = rcp(new Teuchos::MpiComm<int>(raw_mpi_comm));
#else
    teuchos_app_comm = rcp(new Teuchos::SerialComm<int>());
#endif

    // Create application
    typedef twoD_diffusion_problem<Scalar,MeshScalar,BasisScalar,LocalOrdinal,GlobalOrdinal,Node> problem_type;
    RCP<problem_type> model = 
      rcp(new problem_type(teuchos_app_comm, n, num_KL, s, mu, 
               nonlinear_expansion, symmetric));

    // Create vectors and operators
    typedef problem_type::Tpetra_Vector Tpetra_Vector;
    typedef problem_type::Tpetra_CrsMatrix Tpetra_CrsMatrix;
    typedef Tpetra::MatrixMarket::Writer<Tpetra_CrsMatrix> Writer;
    //Xpetra matrices
    typedef Xpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps> Xpetra_CrsMatrix;
    typedef Xpetra::MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> Xpetra_MultiVector;
    typedef Xpetra::MultiVectorFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node> Xpetra_MultiVectorFactory;
    typedef Xpetra::Operator<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps> Xpetra_Operator;
    typedef Xpetra::TpetraCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps> Xpetra_TpetraCrsMatrix;
    typedef Xpetra::CrsOperator<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps> Xpetra_CrsOperator;
    typedef Belos::MueLuOp<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps> Belos_MueLuOperator;
    //MueLu typedefs
    typedef MueLu::Hierarchy<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps> MueLu_Hierarchy;
    typedef MueLu::SmootherPrototype<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalMatOps> SmootherPrototype;
    typedef MueLu::TrilinosSmoother<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalMatOps> TrilinosSmoother;
    typedef MueLu::SmootherFactory<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalMatOps> SmootherFactory;
    typedef MueLu::FactoryManager<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalMatOps> FactoryManager;

    RCP<Tpetra_Vector> p = Tpetra::createVector<Scalar>(model->get_p_map(0));
    RCP<Tpetra_Vector> x = Tpetra::createVector<Scalar>(model->get_x_map());
    x->putScalar(0.0);
    RCP<Tpetra_Vector> f = Tpetra::createVector<Scalar>(model->get_f_map());
    RCP<Tpetra_Vector> dx = Tpetra::createVector<Scalar>(model->get_x_map());
    RCP<Tpetra_CrsMatrix> J = model->create_W();
    RCP<Tpetra_CrsMatrix> J0;
    if (prec_method == MEAN)
      J0 = model->create_W();

    // Set PCE expansion of p
    p->putScalar(0.0);
    ArrayRCP<Scalar> p_view = p->get1dViewNonConst();
    for (ArrayRCP<Scalar>::size_type i=0; i<p_view.size(); i++) {
      p_view[i].reset(expansion);
      p_view[i].copyForWrite();
    }
    Array<double> point(num_KL, 1.0);
    Array<double> basis_vals(sz);
    basis->evaluateBases(point, basis_vals);
    if (order > 0) {
      for (int i=0; i<num_KL; i++) {
        p_view[i].term(i,1) = 1.0 / basis_vals[i+1];
      }
    }

    // Create preconditioner
    typedef Ifpack2::Preconditioner<Scalar,LocalOrdinal,GlobalOrdinal,Node> Tprec;
    RCP<Belos_MueLuOperator> M;
    RCP<MueLu_Hierarchy> H;
    RCP<Xpetra_CrsMatrix> xcrsJ = rcp(new Xpetra_TpetraCrsMatrix(J));
    RCP<Xpetra_Operator> xopJ = rcp(new Xpetra_CrsOperator(xcrsJ));
    if (prec_method != NONE) {
      ParameterList precParams;
      std::string prec_name = "RILUK";
      precParams.set("fact: iluk level-of-fill", 1);
      precParams.set("fact: iluk level-of-overlap", 0);
      //Ifpack2::Factory factory;
      RCP<Xpetra_Operator> xopJ0;
      if (prec_method == MEAN) {
        RCP<Xpetra_CrsMatrix> xcrsJ0 = rcp(new Xpetra_TpetraCrsMatrix(J0));
        xopJ0 = rcp(new Xpetra_CrsOperator(xcrsJ0));
        //M = factory.create<Tpetra_CrsMatrix>(prec_name, J0);
      } else if (prec_method == STOCHASTIC) {
        xopJ0 = xopJ;
        //M = factory.create<Tpetra_CrsMatrix>(prec_name, J);
      }
      H = rcp(new MueLu_Hierarchy(xopJ0));
      M = rcp(new Belos_MueLuOperator(H));
      //M->setParameters(precParams);
      if (nsSize!=-1) sz=nsSize;
      RCP<Xpetra_MultiVector> Z = Xpetra_MultiVectorFactory::Build(xcrsJ->getDomainMap(), sz);
      size_t n = Z->getLocalLength();
      for (LocalOrdinal j=0; j<sz; ++j) {
        ArrayRCP<Scalar> col = Z->getDataNonConst(j);
        for (size_t i=0; i<n; ++i) {
          col[i].reset(expansion);
          col[i].copyForWrite();
          col[i].fastAccessCoeff(j) = 1.0;
        }
      }
      H->GetLevel(0)->Set("Nullspace", Z);
      //RCP<Teuchos::FancyOStream> fos = Teuchos::fancyOStream(Teuchos::rcpFromRef(std::cout));
      //fos->setOutputToRootOnly(-1);
      //Z->describe(*fos);
    }

    // Evaluate model
    model->computeResidual(*x, *p, *f);
    model->computeJacobian(*x, *p, *J);

    // Compute mean for mean-based preconditioner
    if (prec_method == MEAN) {
      size_t nrows = J->getNodeNumRows();
      ArrayView<const LocalOrdinal> indices;
      ArrayView<const Scalar> values;
      J0->resumeFill();
      for (size_t i=0; i<nrows; i++) {
        J->getLocalRowView(i, indices, values);
        Array<Scalar> values0(values.size());
        for (LocalOrdinal j=0; j<values.size(); j++)
          values0[j] = values[j].coeff(0);
        J0->replaceLocalValues(i, indices, values0);
      }
      J0->fillComplete();
    }

    // compute preconditioner
    if (prec_method != NONE) {
      //M->initialize();
      //M->compute();

      //override MueLu defaults via factory manager
      RCP<FactoryManager> fm = rcp( new FactoryManager() );;

      //smoother
      ParameterList smootherParamList;
      /*
      smootherParamList.set("chebyshev: degree", smootherSweeps);
      smootherParamList.set("chebyshev: ratio eigenvalue", (double) 20);
      smootherParamList.set("chebyshev: max eigenvalue", (double) -1.0);
      smootherParamList.set("chebyshev: min eigenvalue", (double) 1.0);
      smootherParamList.set("chebyshev: zero starting solution", true);
      RCP<SmootherPrototype> smooPrototype     = rcp( new TrilinosSmoother("CHEBYSHEV", smootherParamList) );
      */
      smootherParamList.set("relaxation: sweeps", smootherSweeps);
      smootherParamList.set("relaxation: type", "Symmetric Gauss-Seidel");
      RCP<SmootherPrototype> smooPrototype     = rcp( new TrilinosSmoother("RELAXATION", smootherParamList) );

      RCP<SmootherFactory>   smooFact      = rcp( new SmootherFactory(smooPrototype) );
      fm->SetFactory("Smoother", smooFact);

      // coarse level solve
      ParameterList coarseParamList;
      coarseParamList.set("fact: level-of-fill", 0);
      RCP<SmootherPrototype> coarsePrototype     = rcp( new TrilinosSmoother("ILUT", coarseParamList) );
      RCP<SmootherFactory>   coarseSolverFact      = rcp( new SmootherFactory(coarsePrototype, Teuchos::null) );
      fm->SetFactory("CoarseSolver", coarseSolverFact);

      //allow for larger aggregates
      typedef MueLu::UCAggregationFactory<LocalOrdinal,GlobalOrdinal,Node,LocalMatOps>
      MueLu_UCAggregationFactory;
      RCP<MueLu_UCAggregationFactory> aggFact = rcp(new MueLu_UCAggregationFactory());
      aggFact->SetMinNodesPerAggregate(minAggSize);
      fm->SetFactory("Aggregates", aggFact);

      //turn off damping
      typedef MueLu::SaPFactory<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalMatOps> MueLu_SaPFactory;
      if (plainAgg) {
        RCP<MueLu_SaPFactory> sapFactory = rcp(new MueLu_SaPFactory);
        sapFactory->SetDampingFactor( (Scalar) 0.0 );
        fm->SetFactory("P", sapFactory);
      }

      H->Setup(*fm);
    }

    // Setup Belos solver
    RCP<ParameterList> belosParams = rcp(new ParameterList);
   


    belosParams->set("Flexible Gmres", false);

    belosParams->set("Num Blocks", 500);//20
    belosParams->set("Convergence Tolerance", solver_tol);
    belosParams->set("Maximum Iterations", 1000);
    belosParams->set("Verbosity", 33);
    belosParams->set("Output Style", 1);
    belosParams->set("Output Frequency", 1);
    typedef Tpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node> MV;
    typedef Belos::OperatorT<Tpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node> > OP;
    typedef Belos::OperatorTraits<Scalar,MV,OP> BOPT;
    typedef Belos::MultiVecTraits<Scalar,MV> BMVT;
    typedef Belos::MultiVecTraits<double,MV> BTMVT;
    typedef Belos::LinearProblem<double,MV,OP> BLinProb;
    typedef Belos::XpetraOp<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps> BXpetraOp;
    RCP<OP> belosJ = rcp(new BXpetraOp(xopJ)); // Turns an Xpetra::Operator object into a Belos operator
    RCP< BLinProb > problem = rcp(new BLinProb(belosJ, dx, f));
    if (prec_method != NONE)
      problem->setRightPrec(M);
    problem->setProblem();
    RCP<Belos::SolverManager<double,MV,OP> > solver;
    if (solver_method == CG)
      solver = rcp(new Belos::PseudoBlockCGSolMgr<double,MV,OP>(problem, belosParams));
    else if (solver_method == GMRES)
      solver = rcp(new Belos::BlockGmresSolMgr<double,MV,OP>(problem, belosParams));
    

    // Print initial residual norm
    std::vector<double> norm_f(1);
    //BMVT::MvNorm(*f, norm_f);
    BTMVT::MvNorm(*f, norm_f);
    if (MyPID == 0)
      std::cout << "\nInitial residual norm = " << norm_f[0] << std::endl;

    // Solve linear system
    Belos::ReturnType ret = solver->solve();

    if (MyPID == 0) {
      if (ret == Belos::Converged)
        std::cout << "Solver converged!" << std::endl;
      else
        std::cout << "Solver failed to converge!" << std::endl;
    }

    // Update x
    x->update(-1.0, *dx, 1.0);
    Writer::writeDenseFile("stochastic_solution.mm", x);

    // Compute new residual & response function
    RCP<Tpetra_Vector> g = Tpetra::createVector<Scalar>(model->get_g_map(0));
    f->putScalar(0.0);
    model->computeResidual(*x, *p, *f);
    model->computeResponse(*x, *p, *g);

    // Print final residual norm
    //BMVT::MvNorm(*f, norm_f);
    BTMVT::MvNorm(*f, norm_f);
    if (MyPID == 0)
      std::cout << "\nFinal residual norm = " << norm_f[0] << std::endl;

    // Print response
    std::cout << "\nResponse =      " << std::endl;
    //Writer::writeDense(std::cout, g);
    Writer::writeDenseFile("stochastic_residual.mm", f);















/*
    double g_mean = g->get1dView()[0].mean();
    double g_std_dev = g->get1dView()[0].standard_deviation();
    std::cout << "g mean = " << g_mean << std::endl;
    std::cout << "g std_dev = " << g_std_dev << std::endl;
    bool passed = false;
    if (norm_f[0] < 1.0e-10 &&
        std::abs(g_mean-g_mean_exp) < g_tol &&
        std::abs(g_std_dev - g_std_dev_exp) < g_tol)
      passed = true;
    if (MyPID == 0) {
      if (passed)
        std::cout << "Example Passed!" << std::endl;
      else{
        std::cout << "Example Failed!" << std::endl;
        std::cout << "expected g_mean = "<< g_mean_exp << std::endl;
        std::cout << "expected g_std_dev = "<< g_std_dev_exp << std::endl;
      }
    }
*/






    }



    Teuchos::TimeMonitor::summarize(std::cout);
    Teuchos::TimeMonitor::zeroOutTimers();

  }
  
  catch (std::exception& e) {
    std::cout << e.what() << std::endl;
  }
  catch (string& s) {
    std::cout << s << std::endl;
  }
  catch (char *s) {
    std::cout << s << std::endl;
  }
  catch (...) {
    std::cout << "Caught unknown exception!" <<std:: endl;
  }

#ifdef HAVE_MPI
  MPI_Finalize() ;
#endif

}
Example #10
0
int main(int argc, char *argv[])
{
    int np=1, rank=0;
    int splitrank, splitsize;
    int rc = 0;
    nssi_service xfer_svc;

    int server_index=0;
    int rank_in_server=0;

    int transport_index=-1;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &np);

    MPI_Barrier(MPI_COMM_WORLD);

    Teuchos::oblackholestream blackhole;
    std::ostream &out = ( rank == 0 ? std::cout : blackhole );

    struct xfer_args args;

    const int num_io_methods = 8;
    const int io_method_vals[] = {
            XFER_WRITE_ENCODE_SYNC, XFER_WRITE_ENCODE_ASYNC,
            XFER_WRITE_RDMA_SYNC, XFER_WRITE_RDMA_ASYNC,
            XFER_READ_ENCODE_SYNC, XFER_READ_ENCODE_ASYNC,
            XFER_READ_RDMA_SYNC, XFER_READ_RDMA_ASYNC};
    const char * io_method_names[] = {
            "write-encode-sync", "write-encode-async",
            "write-rdma-sync", "write-rdma-async",
            "read-encode-sync", "read-encode-async",
            "read-rdma-sync", "read-rdma-async"};

    const int nssi_transport_list[] = {
            NSSI_RPC_PTL,
            NSSI_RPC_PTL,
            NSSI_RPC_IB,
            NSSI_RPC_IB,
            NSSI_RPC_GEMINI,
            NSSI_RPC_GEMINI,
            NSSI_RPC_BGPDCMF,
            NSSI_RPC_BGPDCMF,
            NSSI_RPC_BGQPAMI,
            NSSI_RPC_BGQPAMI,
            NSSI_RPC_MPI};

    const int num_nssi_transports = 11;
    const int nssi_transport_vals[] = {
            0,
            1,
            2,
            3,
            4,
            5,
            6,
            7,
            8,
            9,
            10
            };
    const char * nssi_transport_names[] = {
            "portals",
            "ptl",
            "infiniband",
            "ib",
            "gemini",
            "gni",
            "bgpdcmf",
            "dcmf",
            "bgqpami",
            "pami",
            "mpi"
    };


    // Initialize arguments
    args.transport=NSSI_DEFAULT_TRANSPORT;
    args.len = 1;
    args.delay = 1;
    args.io_method = XFER_WRITE_RDMA_SYNC;
    args.debug_level = LOG_WARN;
    args.num_trials = 1;
    args.num_reqs = 1;
    args.result_file_mode = "a";
    args.result_file = "";
    args.url_file = "";
    args.logfile = "";
    args.client_flag = true;
    args.server_flag = true;
    args.num_servers = 1;
    args.num_threads = 0;
    args.timeout = 500;
    args.num_retries = 5;
    args.validate_flag = true;
    args.kill_server_flag = true;
    args.block_distribution = true;


    bool success = true;

    /**
     * We make extensive use of the \ref Teuchos::CommandLineProcessor for command-line
     * options to control the behavior of the test code.   To evaluate performance,
     * the "num-trials", "num-reqs", and "len" options control the amount of data transferred
     * between client and server.  The "io-method" selects the type of data transfer.  The
     * server-url specifies the URL of the server.  If running as a server, the server-url
     * provides a recommended URL when initializing the network transport.
     */
    try {

        //out << Teuchos::Teuchos_Version() << std::endl << std::endl;

        // Creating an empty command line processor looks like:
        Teuchos::CommandLineProcessor parser;
        parser.setDocString(
                "This example program demonstrates a simple data-transfer service "
                "built using the NEtwork Scalable Service Interface (Nessie)."
        );

        /* To set and option, it must be given a name and default value.  Additionally,
           each option can be given a help std::string.  Although it is not necessary, a help
           std::string aids a users comprehension of the acceptable command line arguments.
           Some examples of setting command line options are:
         */

        parser.setOption("delay", &args.delay, "time(s) for client to wait for server to start" );
        parser.setOption("timeout", &args.timeout, "time(ms) to wait for server to respond" );
        parser.setOption("server", "no-server", &args.server_flag, "Run the server" );
        parser.setOption("client", "no-client", &args.client_flag, "Run the client");
        parser.setOption("len", &args.len, "The number of structures in an input buffer");
        parser.setOption("debug",(int*)(&args.debug_level), "Debug level");
        parser.setOption("logfile", &args.logfile, "log file");
        parser.setOption("num-trials", &args.num_trials, "Number of trials (experiments)");
        parser.setOption("num-reqs", &args.num_reqs, "Number of reqs/trial");
        parser.setOption("result-file", &args.result_file, "Where to store results");
        parser.setOption("result-file-mode", &args.result_file_mode, "Write mode for the result");
        parser.setOption("server-url-file", &args.url_file, "File that has URL client uses to find server");
        parser.setOption("validate", "no-validate", &args.validate_flag, "Validate the data");
        parser.setOption("num-servers", &args.num_servers, "Number of server processes");
        parser.setOption("num-threads", &args.num_threads, "Number of threads used by each server process");
        parser.setOption("kill-server", "no-kill-server", &args.kill_server_flag, "Kill the server at the end of the experiment");
        parser.setOption("block-distribution", "rr-distribution", &args.block_distribution,
                "Use a block distribution scheme to assign clients to servers");

        // Set an enumeration command line option for the io_method
        parser.setOption("io-method", &args.io_method, num_io_methods, io_method_vals, io_method_names,
                "I/O Methods for the example: \n"
                "\t\t\twrite-encode-sync : Write data through the RPC args, synchronous\n"
                "\t\t\twrite-encode-async: Write data through the RPC args - asynchronous\n"
                "\t\t\twrite-rdma-sync : Write data using RDMA (server pulls) - synchronous\n"
                "\t\t\twrite-rdma-async: Write data using RDMA (server pulls) - asynchronous\n"
                "\t\t\tread-encode-sync : Read data through the RPC result - synchronous\n"
                "\t\t\tread-encode-async: Read data through the RPC result - asynchronous\n"
                "\t\t\tread-rdma-sync : Read data using RDMA (server puts) - synchronous\n"
                "\t\t\tread-rdma-async: Read data using RDMA (server puts) - asynchronous");


        // Set an enumeration command line option for the NNTI transport
        parser.setOption("transport", &transport_index, num_nssi_transports, nssi_transport_vals, nssi_transport_names,
                "NSSI transports (not all are available on every platform): \n"
                "\t\t\tportals|ptl    : Cray or Schutt\n"
                "\t\t\tinfiniband|ib  : libibverbs\n"
                "\t\t\tgemini|gni     : Cray libugni (Gemini or Aries)\n"
                "\t\t\tbgpdcmf|dcmf   : IBM BG/P DCMF\n"
                "\t\t\tbgqpami|pami   : IBM BG/Q PAMI\n"
                "\t\t\tmpi            : isend/irecv implementation\n"
                );



        /* There are also two methods that control the behavior of the
           command line processor.  First, for the command line processor to
           allow an unrecognized a command line option to be ignored (and
           only have a warning printed), use:
         */
        parser.recogniseAllOptions(true);

        /* Second, by default, if the parser finds a command line option it
           doesn't recognize or finds the --help option, it will throw an
           std::exception.  If you want prevent a command line processor from
           throwing an std::exception (which is important in this program since
           we don't have an try/catch around this) when it encounters a
           unrecognized option or help is printed, use:
         */
        parser.throwExceptions(false);

        /* We now parse the command line where argc and argv are passed to
           the parse method.  Note that since we have turned off std::exception
           throwing above we had better grab the return argument so that
           we can see what happened and act accordingly.
         */
        Teuchos::CommandLineProcessor::EParseCommandLineReturn parseReturn= parser.parse( argc, argv );

        if( parseReturn == Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED ) {
            return 0;
        }

        if( parseReturn != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL   ) {
            return 1; // Error!

        }

        // Here is where you would use these command line arguments but for this example program
        // we will just print the help message with the new values of the command-line arguments.
        //if (rank == 0)
        //    out << "\nPrinting help message with new values of command-line arguments ...\n\n";

        //parser.printHelpMessage(argv[0],out);

    }

    TEUCHOS_STANDARD_CATCH_STATEMENTS(true,std::cerr,success);

    log_debug(args.debug_level, "transport_index=%d", transport_index);
    if (transport_index > -1) {
    	args.transport     =nssi_transport_list[transport_index];
    	args.transport_name=std::string(nssi_transport_names[transport_index]);
    }
	args.io_method_name=std::string(io_method_names[args.io_method]);

    log_debug(args.debug_level, "%d: Finished processing arguments", rank);


    if (!success) {
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    if (!args.server_flag && args.client_flag) {
        /* initialize logger */
        if (args.logfile.empty()) {
            logger_init(args.debug_level, NULL);
        } else {
            char fn[1024];
            sprintf(fn, "%s.client.%03d.log", args.logfile.c_str(), rank);
            logger_init(args.debug_level, fn);
        }
    } else if (args.server_flag && !args.client_flag) {
        /* initialize logger */
        if (args.logfile.empty()) {
            logger_init(args.debug_level, NULL);
        } else {
            char fn[1024];
            sprintf(fn, "%s.server.%03d.log", args.logfile.c_str(), rank);
            logger_init(args.debug_level, fn);
        }
    } else if (args.server_flag && args.client_flag) {
        /* initialize logger */
        if (args.logfile.empty()) {
            logger_init(args.debug_level, NULL);
        } else {
            char fn[1024];
            sprintf(fn, "%s.%03d.log", args.logfile.c_str(), rank);
            logger_init(args.debug_level, fn);
        }
    }

    log_level debug_level = args.debug_level;

    // Communicator used for both client and server (may split if using client and server)
    MPI_Comm comm;

    log_debug(debug_level, "%d: Starting xfer-service test", rank);

#ifdef TRIOS_ENABLE_COMMSPLITTER
    if (args.transport == NSSI_RPC_MPI) {
        MPI_Pcontrol(0);
    }
#endif

    /**
     * Since this test can be run as a server, client, or both, we need to play some fancy
     * MPI games to get the communicators working correctly.  If we're executing as both
     * a client and a server, we split the communicator so that the client thinks its
     * running by itself.
     */
    int color = 0;  // color=0-->server, color=1-->client
    if (args.client_flag && args.server_flag) {
        if (np < 2) {
            log_error(debug_level, "Must use at least 2 MPI processes for client and server mode");
            MPI_Abort(MPI_COMM_WORLD, -1);
        }

        // Split the communicators. Put all the servers as the first ranks.
        if (rank < args.num_servers) {
            color = 0;
            log_debug(debug_level, "rank=%d is a server", rank);
        }
        else {
            color = 1;  // all others are clients
            log_debug(debug_level, "rank=%d is a client", rank);
        }

        MPI_Comm_split(MPI_COMM_WORLD, color, rank, &comm);
    }
    else {
        if (args.client_flag) {
            color=1;
            log_debug(debug_level, "rank=%d is a client", rank);
        }
        else if (args.server_flag) {
            color=0;
            log_debug(debug_level, "rank=%d is a server", rank);
        }
        else {
            log_error(debug_level, "Must be either a client or a server");
            MPI_Abort(MPI_COMM_WORLD, -1);
        }
        MPI_Comm_split(MPI_COMM_WORLD, color, rank, &comm);
    }

    MPI_Comm_rank(comm, &splitrank);
    MPI_Comm_size(comm, &splitsize);

    log_debug(debug_level, "%d: Finished splitting communicators", rank);

    /**
     * Initialize the Nessie interface by specifying a transport, encoding scheme, and a
     * recommended URL.  \ref NSSI_DEFAULT_TRANSPORT is usually the best choice, since it
     * is often the case that only one type of transport exists on a particular platform.
     * Currently supported transports are \ref NSSI_RPC_PTL, \ref NSSI_RPC_GNI, and
     * \ref NSSI_RPC_IB.  We only support one type of encoding scheme so NSSI_DEFAULT_ENCODE
     * should always be used for the second argument.   The URL can be specified (as we did for
     * the server, or NULL (as we did for the client).  This is a recommended value.  Use the
     * \ref nssi_get_url function to find the actual value.
     */
    nssi_rpc_init((nssi_rpc_transport)args.transport, NSSI_DEFAULT_ENCODE, NULL);

    // Get the Server URL
    std::string my_url(NSSI_URL_LEN, '\0');
    nssi_get_url((nssi_rpc_transport)args.transport, &my_url[0], NSSI_URL_LEN);

    // If running as both client and server, gather and distribute
    // the server URLs to all the clients.
    if (args.server_flag && args.client_flag) {

        std::string all_urls;

        // This needs to be a vector of chars, not a string
        all_urls.resize(args.num_servers * NSSI_URL_LEN, '\0');

        // Have servers gather their URLs
        if (color == 0) {
            assert(args.num_servers == splitsize);  // these should be equal

            log_debug(debug_level, "%d: Gathering urls: my_url=%s", rank, my_url.c_str());

            // gather all urls to rank 0 of the server comm (also rank 0 of MPI_COMM_WORLD)
            MPI_Gather(&my_url[0], NSSI_URL_LEN, MPI_CHAR,
                    &all_urls[0], NSSI_URL_LEN, MPI_CHAR, 0, comm);
        }

        // broadcast the full set of server urls to all processes
        MPI_Bcast(&all_urls[0], all_urls.size(), MPI_CHAR, 0, MPI_COMM_WORLD);

        log_debug(debug_level, "%d: Bcast urls, urls.size=%d", rank, all_urls.size());

        if (color == 1) {

            // For block distribution scheme use the utility function (in xfer_util.cpp)
            if (args.block_distribution) {
                // Use this utility function to calculate the server_index
                xfer_block_partition(args.num_servers, splitsize, splitrank, &server_index, &rank_in_server);
            }

            // Use a simple round robin distribution scheme
            else {
                server_index   = splitrank % args.num_servers;
                rank_in_server = splitrank / args.num_servers;
            }

            // Copy the server url out of the list of urls
            int offset = server_index * NSSI_URL_LEN;

            args.server_url = all_urls.substr(offset, NSSI_URL_LEN);

            log_debug(debug_level, "client %d assigned to server \"%s\"", splitrank, args.server_url.c_str());
        }


        log_debug(debug_level, "%d: Finished distributing server urls, server_url=%s", rank, args.server_url.c_str());
    }

    // If running as a client only, have to get the list of servers from the urlfile.
    else if (!args.server_flag && args.client_flag){

        sleep(args.delay);  // give server time to get started

        std::vector< std::string > urlbuf;
        xfer_read_server_url_file(args.url_file.c_str(), urlbuf, comm);
        args.num_servers = urlbuf.size();

        // For block distribution scheme use the utility function (in xfer_util.cpp)
        if (args.block_distribution) {
            // Use this utility function to calculate the server_index
            xfer_block_partition(args.num_servers, splitsize, splitrank, &server_index, &rank_in_server);
        }

        // Use a simple round robin distribution scheme
        else {
            server_index   = splitrank % args.num_servers;
            rank_in_server = splitrank / args.num_servers;
        }

        args.server_url = urlbuf[server_index];
        log_debug(debug_level, "client %d assigned to server \"%s\"", splitrank, args.server_url.c_str());
    }

    else if (args.server_flag && !args.client_flag) {
        args.server_url = my_url;

        if (args.url_file.empty()) {
            log_error(debug_level, "Must set --url-file");
            MPI_Abort(MPI_COMM_WORLD, -1);
        }

        xfer_write_server_url_file(args.url_file.c_str(), my_url.c_str(), comm);
    }

    // Set the debug level for the xfer service.
    xfer_debug_level = args.debug_level;

    // Print the arguments after they've all been set.
    log_debug(debug_level, "%d: server_url=%s", rank, args.server_url.c_str());

    print_args(out, args, "%");

    log_debug(debug_level, "server_url=%s", args.server_url.c_str());

    //------------------------------------------------------------------------------
    /** If we're running this job with a server, the server always executes on node 0.
     *  In this example, the server is a single process.
     */
    if (color == 0) {
        rc = xfer_server_main((nssi_rpc_transport)args.transport, args.num_threads, comm);
        log_debug(debug_level, "Server is finished");
    }

    // ------------------------------------------------------------------------------
     /**  The parallel client will execute this branch.  The root node, node 0, of the client connects
      *   connects with the server, using the \ref nssi_get_service function.  Then the root
      *   broadcasts the service description to the other clients before starting the main
      *   loop of the client code by calling \ref xfer_client_main.
      */
    else {
        int i;
        int client_rank;

        // get rank within the client communicator
        MPI_Comm_rank(comm, &client_rank);

        nssi_init((nssi_rpc_transport)args.transport);

        // Only one process needs to connect to the service
        // TODO: Make get_service a collective call (some transports do not need a connection)
        //if (client_rank == 0) {
        {


            // connect to remote server
            for (i=0; i < args.num_retries; i++) {
                log_debug(debug_level, "Try to connect to server: attempt #%d, url=%s", i, args.server_url.c_str());
                rc=nssi_get_service((nssi_rpc_transport)args.transport, args.server_url.c_str(), args.timeout, &xfer_svc);
                if (rc == NSSI_OK)
                    break;
                else if (rc != NSSI_ETIMEDOUT) {
                    log_error(xfer_debug_level, "could not get svc description: %s",
                            nssi_err_str(rc));
                    break;
                }
            }
        }

        // wait for all the clients to connect
        MPI_Barrier(comm);

        //MPI_Bcast(&rc, 1, MPI_INT, 0, comm);

        if (rc == NSSI_OK) {
            if (client_rank == 0) log_debug(debug_level, "Connected to service on attempt %d\n", i);

            // Broadcast the service description to the other clients
            //log_debug(xfer_debug_level, "Bcasting svc to other clients");
            //MPI_Bcast(&xfer_svc, sizeof(nssi_service), MPI_BYTE, 0, comm);

            log_debug(debug_level, "Starting client main");
            // Start the client code
            xfer_client_main(args, xfer_svc, comm);


            MPI_Barrier(comm);

            // Tell one of the clients to kill the server
            if ((args.kill_server_flag) && (rank_in_server == 0)) {
                log_debug(debug_level, "%d: Halting xfer service", rank);
                rc = nssi_kill(&xfer_svc, 0, 5000);
            }
            rc=nssi_free_service((nssi_rpc_transport)args.transport, &xfer_svc);
            if (rc != NSSI_OK) {
                log_error(xfer_debug_level, "could not free svc description: %s",
                        nssi_err_str(rc));
            }
        }

        else {
            if (client_rank == 0)
                log_error(debug_level, "Failed to connect to service after %d attempts: ABORTING", i);
            success = false;
            //MPI_Abort(MPI_COMM_WORLD, -1);
        }

        nssi_fini((nssi_rpc_transport)args.transport);

    }

    log_debug(debug_level, "%d: clean up nssi", rank);
    MPI_Barrier(MPI_COMM_WORLD);

    // Clean up nssi_rpc
    rc = nssi_rpc_fini((nssi_rpc_transport)args.transport);
    if (rc != NSSI_OK)
        log_error(debug_level, "Error in nssi_rpc_fini");

    log_debug(debug_level, "%d: MPI_Finalize()", rank);
    MPI_Finalize();

    logger_fini();

    if(success && (rc == NSSI_OK))
    	out << "\nEnd Result: TEST PASSED" << std::endl;
    else
    	out << "\nEnd Result: TEST FAILED" << std::endl;

    return ((success && (rc==NSSI_OK)) ? 0 : 1 );
}
Example #11
0
int main(int argc, char *argv[])
{
  bool success = true;
  bool verbose = false;
  try {

    const size_t num_sockets = Kokkos::hwloc::get_available_numa_count();
    const size_t num_cores_per_socket =
      Kokkos::hwloc::get_available_cores_per_numa();
    const size_t num_threads_per_core =
      Kokkos::hwloc::get_available_threads_per_core();

    // Setup command line options
    Teuchos::CommandLineProcessor CLP;
    CLP.setDocString(
      "This test performance of MP::Vector multiply routines.\n");
    int nGrid = 32;
    CLP.setOption("n", &nGrid, "Number of mesh points in the each direction");
    int nIter = 10;
    CLP.setOption("ni", &nIter, "Number of multiply iterations");
#ifdef KOKKOS_HAVE_PTHREAD
    bool threads = true;
    CLP.setOption("threads", "no-threads", &threads, "Enable Threads device");
    int num_cores = num_cores_per_socket * num_sockets;
    CLP.setOption("cores", &num_cores,
                  "Number of CPU cores to use (defaults to all)");
    int num_hyper_threads = num_threads_per_core;
    CLP.setOption("hyperthreads", &num_hyper_threads,
                  "Number of hyper threads per core to use (defaults to all)");
    int threads_per_vector = 1;
    CLP.setOption("threads_per_vector", &threads_per_vector,
                  "Number of threads to use within each vector");
#endif
#ifdef KOKKOS_HAVE_CUDA
    bool cuda = true;
    CLP.setOption("cuda", "no-cuda", &cuda, "Enable Cuda device");
    int cuda_threads_per_vector = 16;
    CLP.setOption("cuda_threads_per_vector", &cuda_threads_per_vector,
                  "Number of Cuda threads to use within each vector");
    int cuda_block_size = 0;
    CLP.setOption("cuda_block_size", &cuda_block_size,
                  "Cuda block size (0 implies the default choice)");
    int num_cuda_blocks = 0;
    CLP.setOption("num_cuda_blocks", &num_cuda_blocks,
                  "Number of Cuda blocks (0 implies the default choice)");
    int device_id = 0;
    CLP.setOption("device", &device_id, "CUDA device ID");
#endif
    CLP.parse( argc, argv );

    typedef int Ordinal;
    typedef double Scalar;

#ifdef KOKKOS_HAVE_PTHREAD
    if (threads) {
      typedef Kokkos::Threads Device;
      typedef Stokhos::StaticFixedStorage<Ordinal,Scalar,1,Device> Storage;

      Kokkos::Threads::initialize(num_cores*num_hyper_threads);

      std::cout << std::endl
                << "Threads performance with " << num_cores*num_hyper_threads
                << " threads:" << std::endl;

      Kokkos::DeviceConfig dev_config(num_cores,
                                       threads_per_vector,
                                       num_hyper_threads / threads_per_vector);

      mainHost<Storage>(nGrid, nIter, dev_config);

      Kokkos::Threads::finalize();
    }
#endif

#ifdef KOKKOS_HAVE_CUDA
    if (cuda) {
      typedef Kokkos::Cuda Device;
      typedef Stokhos::StaticFixedStorage<Ordinal,Scalar,1,Device> Storage;

      Kokkos::Cuda::host_mirror_device_type::initialize();
      Kokkos::Cuda::initialize(Kokkos::Cuda::SelectDevice(device_id));

      cudaDeviceProp deviceProp;
      cudaGetDeviceProperties(&deviceProp, device_id);
      std::cout << std::endl
                << "CUDA performance for device " << device_id << " ("
                << deviceProp.name << "):"
                << std::endl;

      Kokkos::DeviceConfig dev_config(
        num_cuda_blocks,
        cuda_threads_per_vector,
        cuda_threads_per_vector == 0 ? 0 : cuda_block_size / cuda_threads_per_vector);

      mainCuda<Storage>(nGrid,nIter,dev_config);

      Kokkos::Cuda::host_mirror_device_type::finalize();
      Kokkos::Cuda::finalize();
    }
#endif

  }
  TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success);

  if (success)
    return 0;
  return -1;
}
int main (int argc, char *argv[]) {

  Teuchos::CommandLineProcessor clp;
  clp.setDocString("This example program measure the performance of IChol algorithms on Kokkos::Threads execution space.\n");

  int nthreads = 1;
  clp.setOption("nthreads", &nthreads, "Number of threads");

  int max_task_dependence = 10;
  clp.setOption("max-task-dependence", &max_task_dependence, "Max number of task dependence");

  int team_size = 1;
  clp.setOption("team-size", &team_size, "Team size");

  bool team_interface = false;
  clp.setOption("enable-team-interface", "disable-team-interface",
                &team_interface, "Flag for team interface");

  bool verbose = false;
  clp.setOption("enable-verbose", "disable-verbose", &verbose, "Flag for verbose printing");

  string file_input = "test.mtx";
  clp.setOption("file-input", &file_input, "Input file (MatrixMarket SPD matrix)");

  int niter = 10;
  clp.setOption("niter", &niter, "Number of iterations for testing");

  clp.recogniseAllOptions(true);
  clp.throwExceptions(false);

  Teuchos::CommandLineProcessor::EParseCommandLineReturn r_parse= clp.parse( argc, argv );

  if (r_parse == Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED) return 0;
  if (r_parse != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL  ) return -1;
  
  int r_val = 0;
  {
    const bool overwrite = true;
    const int nshepherds = (team_interface ? nthreads/team_size : nthreads);
    const int nworker_per_shepherd = nthreads/nshepherds;

    setenv("QT_HWPAR",                    to_string(nthreads).c_str(),             overwrite);
    setenv("QT_NUM_SHEPHERDS",            to_string(nshepherds).c_str(),           overwrite);
    setenv("QT_NUM_WORKERS_PER_SHEPHERD", to_string(nworker_per_shepherd).c_str(), overwrite);

    exec_space::initialize(nthreads);
    exec_space::print_configuration(cout, true);
    
    // r_val = exampleICholPerformance
    //   <value_type,ordinal_type,size_type,exec_space,void>
    //   (file_input, niter, nthreads, max_task_dependence, team_size, team_interface, (nthreads != 1), verbose);

    exec_space::finalize();

    unsetenv("QT_HWPAR");
    unsetenv("QT_NUM_SHEPHERDS");
    unsetenv("QT_NUM_WORKERS_PER_SHEPHERD");
  }

  return r_val;
}
Example #13
0
/// \brief Parse command-line options for this test
///
/// \param argc [in] As usual in C(++)
/// \param argv [in] As usual in C(++)
/// \param allowedToPrint [in] Whether this (MPI) process is allowed
///   to print to stdout/stderr.  Different per (MPI) process.
/// \param printedHelp [out] Whether this (MPI) process printed the
///   "help" display (summary of command-line options)
///
/// \return Encapsulation of command-line options 
static DistTsqrTestParameters
parseOptions (int argc, 
	      char* argv[], 
	      const bool allowedToPrint, 
	      bool& printedHelp)
{
  using std::cerr;
  using std::endl;

  printedHelp = false;

  // Command-line parameters, set to their default values.
  DistTsqrTestParameters params;
  try {
    Teuchos::CommandLineProcessor cmdLineProc (/* throwExceptions=*/ true, 
					       /* recognizeAllOptions=*/ true);
    cmdLineProc.setDocString (docString);
    cmdLineProc.setOption ("verify",
			   "noverify",
			   &params.verify,
			   "Test accuracy");
    cmdLineProc.setOption ("benchmark",
			   "nobenchmark",
			   &params.benchmark,
			   "Test performance");
    cmdLineProc.setOption ("implicit",
			   "noimplicit",
			   &params.testFactorImplicit,
			   "Test DistTsqr\'s factor() and explicit_Q()");
    cmdLineProc.setOption ("explicit",
			   "noexplicit",
			   &params.testFactorExplicit,
			   "Test DistTsqr\'s factorExplicit()");
    cmdLineProc.setOption ("print-matrices", 
			   "no-print-matrices", 
			   &params.printMatrices, 
			   "Print global test matrices and computed results to stderr");
    cmdLineProc.setOption ("debug", 
			   "nodebug", 
			   &params.debug, 
			   "Print debugging information");
    cmdLineProc.setOption ("human-readable",
			   "machine-readable",
			   &params.humanReadable,
			   "If set, make output easy to read by humans "
			   "(but hard to parse)");
    cmdLineProc.setOption ("ncols", 
			   &params.numCols, 
			   "Number of columns in the test matrix");
    cmdLineProc.setOption ("ntrials", 
			   &params.numTrials, 
			   "Number of trials (only used when \"--benchmark\"");
    cmdLineProc.setOption ("real", 
			   "noreal",
			   &params.testReal,
			   "Test real arithmetic routines");
#ifdef HAVE_TSQR_COMPLEX
    cmdLineProc.setOption ("complex", 
			   "nocomplex",
			   &params.testComplex,
			   "Test complex arithmetic routines");
#endif // HAVE_TSQR_COMPLEX
    cmdLineProc.parse (argc, argv);
  } 
  catch (Teuchos::CommandLineProcessor::UnrecognizedOption& e) { 
    if (allowedToPrint)
      cerr << "Unrecognized command-line option: " << e.what() << endl;
    throw e;
  }
  catch (Teuchos::CommandLineProcessor::HelpPrinted& e) { 
    printedHelp = true;
  } 

  // Validate command-line options.  We provide default values
  // for unset options, so we don't have to validate those.
  if (params.numCols <= 0)
    throw std::invalid_argument ("Number of columns must be positive");
  else if (params.benchmark && params.numTrials < 1)
    throw std::invalid_argument ("\"--benchmark\" option requires numTrials >= 1");

  return params;
}
Example #14
0
int main(int argc, char **argv)
{
  try {

    // Initialize MPI
#ifdef HAVE_MPI
    MPI_Init(&argc,&argv);
#endif

    // Setup command line options
    Teuchos::CommandLineProcessor CLP;
    CLP.setDocString(
      "This example generates the sparsity pattern for the block stochastic Galerkin matrix.\n");
    int d = 3;
    CLP.setOption("dimension", &d, "Stochastic dimension");
    int p = 5;
    CLP.setOption("order", &p, "Polynomial order");
    double drop = 1.0e-15;
    CLP.setOption("drop", &drop, "Drop tolerance");
    std::string file = "A.mm";
    CLP.setOption("filename", &file, "Matrix Market filename");
    BasisType basis_type = LEGENDRE;
    CLP.setOption("basis", &basis_type, 
		  num_basis_types, basis_type_values, basis_type_names, 
		  "Basis type");
    bool full = true;
    CLP.setOption("full", "linear", &full, "Use full or linear expansion");
    bool use_old = false;
    CLP.setOption("old", "new", &use_old, "Use old or new Cijk algorithm");

    // Parse arguments
    CLP.parse( argc, argv );

    // Basis
    Teuchos::Array< Teuchos::RCP<const Stokhos::OneDOrthogPolyBasis<int,double> > > bases(d); 
    for (int i=0; i<d; i++) {
      if (basis_type == HERMITE)
	bases[i] = Teuchos::rcp(new Stokhos::HermiteBasis<int,double>(p));
      else if (basis_type == LEGENDRE)
	bases[i] = Teuchos::rcp(new Stokhos::LegendreBasis<int,double>(p));
      else if (basis_type == RYS)
	bases[i] = Teuchos::rcp(new Stokhos::RysBasis<int,double>(p, 1.0, 
								  false));
    }
    Teuchos::RCP<const Stokhos::CompletePolynomialBasis<int,double> > basis = 
      Teuchos::rcp(new Stokhos::CompletePolynomialBasis<int,double>(bases,
								    drop,
								    use_old));

    // Triple product tensor
    Teuchos::RCP<Stokhos::Sparse3Tensor<int,double> > Cijk;
    if (full)
      Cijk = basis->computeTripleProductTensor(basis->size());
    else
      Cijk = basis->computeTripleProductTensor(basis->dimension()+1);

    std::cout << "basis size = " << basis->size() 
	      << " num nonzero Cijk entries = " << Cijk->num_entries() 
	      << std::endl;

#ifdef HAVE_MPI
    Epetra_MpiComm comm(MPI_COMM_WORLD);
#else
    Epetra_SerialComm comm;
#endif
    
    // Print triple product sparsity to matrix market file
    Stokhos::sparse3Tensor2MatrixMarket(*basis, *Cijk, comm, file);

    Teuchos::TimeMonitor::summarize(std::cout);
    
  }
  catch (std::exception& e) {
    std::cout << e.what() << std::endl;
  }

  return 0;
}
Example #15
0
int
main (int argc, char *argv[])
{
    // command-line arguments
    log_level debug_level = LOG_ERROR;
    string logfile("");

    int npes, me, i;

    int num_servers=1;
    int num_clients=1;

    int servers_per_node=1;
    int clients_per_node=1;

    int client_weight=10;
    int server_weight=10;
    int client_server_weight=5;

    string server_node_file("SNF.txt");
    string client_node_file("CNF.txt");

    const int num_graphs = 4;
    const int graph_vals[] = {
            GRAPH_COMPLETE,
            GRAPH_CLIENT_COMPLETE,
            GRAPH_SERVER_COMPLETE,
            GRAPH_CLIENT_SERVER_ONLY
    };
    const char * graph_names[] = {
            "complete",
            "client-complete",
            "server-complete",
            "client-server-only"
    };
    enum graph_connection_t graph_connection=GRAPH_COMPLETE;

    MPI_Init(&argc, &argv);

    try {
        Teuchos::CommandLineProcessor parser;

        // init parser
        parser.setDocString("Find node placement of server and client ranks");

        parser.setOption("strategy", &strategy, "LibTopoMap strategy (greedy, greedy_route, recursive, rcm, scotch, ascending)");
        parser.setOption("num-servers", (int *)(&num_servers), "Number of servers to place");
        parser.setOption("num-clients", (int *)(&num_clients), "Number of clients to place");
        parser.setOption("servers-per-node", (int *)(&servers_per_node), "Number of server ranks per compute node");
        parser.setOption("clients-per-node", (int *)(&clients_per_node), "Number of client ranks per compute node");
        parser.setOption("server-weight", (int *)(&server_weight), "Edge weight of server-to-server communication");
        parser.setOption("client-weight", (int *)(&client_weight), "Edge weight of client-to-client communication");
        parser.setOption("client-server-weight", (int *)(&client_server_weight), "Edge weight of client-to-server communication");
        parser.setOption("server-node-file", &server_node_file, "Where to write the server placement results");
        parser.setOption("client-node-file", &client_node_file, "Where to write the client placement results");
        parser.setOption("verbose", (int *)(&debug_level), "Debug level");
        parser.setOption("logfile", &logfile, "Path to file for debug statements");
        // Set an enumeration command line option for the connection graph
        parser.setOption("graph-connection", (int*)&graph_connection, num_graphs, graph_vals, graph_names,
                "Graph Connections for the example: \n"
                "\t\t\tcomplete : client-client graph is complete, server-server graph is complete\n"
                "\t\t\tclient-complete: client-client graph is complete, server-server graph is empty\n"
                "\t\t\tserver-complete : client-client graph is empty, server-server graph is complete\n"
                "\t\t\tclient-server-only: client-client graph is empty, server-server graph is empty\n"
                "\t\t\tIn all cases, each client has an edge to one of the servers\n"
                );

        parser.recogniseAllOptions();
        parser.throwExceptions();

        Teuchos::CommandLineProcessor::EParseCommandLineReturn
        parseReturn= parser.parse( argc, argv );
        if( parseReturn == Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED ) {
            return 0;
        }
        if( parseReturn != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL   ) {
            return 1; // Error!
        }
    }
    catch (...) {
        exit(-1);
    }

    /* initialize the logger */
    logger_init(debug_level, logfile.c_str());

    MPI_Comm_size(MPI_COMM_WORLD, &npes);
    MPI_Comm_rank(MPI_COMM_WORLD, &me);

    if (me==0) {
        cout << " ----------------  ARGUMENTS --------------- " << std::endl;
        cout << " \tstrategy             = " << strategy << std::endl;
        cout << " \tgraph-connection     = " << graph_names[graph_connection] << std::endl;
        cout << " \tnum-servers          = " << num_servers << std::endl;
        cout << " \tnum-clients          = " << num_clients << std::endl;
        cout << " \tservers-per-node     = " << servers_per_node << std::endl;
        cout << " \tclients-per-node     = " << clients_per_node << std::endl;
        cout << " \tserver-weight        = " << server_weight << std::endl;
        cout << " \tclient-weight        = " << client_weight << std::endl;
        cout << " \tclient-server-weight = " << client_server_weight << std::endl;
        cout << " \tserver-node-file     = " << server_node_file << std::endl;
        cout << " \tclient-node-file     = " << client_node_file << std::endl;
        cout << " \tverbose              = " << debug_level << std::endl;
        cout << " \tlogfile              = " << logfile << std::endl;
        cout << " ------------------------------------------- " << std::endl;
    }
    MPI_Barrier(MPI_COMM_WORLD);

    int *rank_map=(int*)malloc(sizeof(int) * npes);
    int *nid_map=(int*)malloc(sizeof(int) * npes);

    construct_graph(
            rank_map,
            nid_map,
            num_servers,
            num_clients,
            servers_per_node,
            clients_per_node,
            server_weight,
            client_weight,
            client_server_weight,
            graph_connection,
            0);

    if (me == 0) {
        ofstream snf(server_node_file.c_str(), ios_base::out);
        ofstream cnf(client_node_file.c_str(), ios_base::out);

        for (i=0;i<npes;i++) {
            if (rank_map[i] < num_servers)
                snf << nid_map[i] << "\t" << i << "\t" << rank_map[i] << std::endl;
        }
        for (i=0;i<npes;i++) {
            if (rank_map[i] >= num_servers)
                cnf << nid_map[i] << "\t" << i << "\t" << rank_map[i] << std::endl;
        }

        snf.close();
        cnf.close();
    }

    MPI_Finalize();

    return 0;
}
int main(int argc, char **argv)
{
  try {

    // Setup command line options
    Teuchos::CommandLineProcessor CLP;
    CLP.setDocString(
      "This example generates partitions the Cijk tensor for a lexicographic tree basis.\n");
    int d = 3;
    CLP.setOption("dimension", &d, "Stochastic dimension");
    int p = 5;
    CLP.setOption("order", &p, "Polynomial order");
    double drop = 1.0e-12;
    CLP.setOption("drop", &drop, "Drop tolerance");
    bool symmetric = true;
    CLP.setOption("symmetric", "asymmetric", &symmetric, "Use basis polynomials with symmetric PDF");
    int level = 1;
    CLP.setOption("level", &level, "Level to partition");
    bool save_3tensor = false;
    CLP.setOption("save_3tensor", "no-save_3tensor", &save_3tensor,
                  "Save full 3tensor to file");
    std::string file_3tensor = "Cijk.dat";
    CLP.setOption("filename_3tensor", &file_3tensor,
                  "Filename to store full 3-tensor");

    // Parse arguments
    CLP.parse( argc, argv );

    // Basis
    Array< RCP<const Stokhos::OneDOrthogPolyBasis<int,double> > > bases(d);
    const double alpha = 1.0;
    const double beta = symmetric ? 1.0 : 2.0 ;
    for (int i=0; i<d; i++) {
      bases[i] = Teuchos::rcp(new Stokhos::JacobiBasis<int,double>(
                                p, alpha, beta, true));
    }
    typedef Stokhos::LexographicLess< Stokhos::MultiIndex<int> > less_type;
    typedef Stokhos::TotalOrderBasis<int,double,less_type> basis_type;
    RCP<const basis_type> basis = Teuchos::rcp(new basis_type(bases, drop));

    // Build LTB Cijk
    typedef Stokhos::LTBSparse3Tensor<int,double> Cijk_LTB_type;
    typedef Cijk_LTB_type::CijkNode node_type;
    Teuchos::RCP<Cijk_LTB_type> Cijk =
      computeTripleProductTensorLTB(*basis, symmetric);

    int sz = basis->size();
    std::cout << "basis size = " << sz
              << " num nonzero Cijk entries = " << Cijk->num_entries()
              << std::endl;

    // Setup partitions
    Teuchos::Array< Teuchos::RCP<const node_type> > node_stack;
    Teuchos::Array< int > index_stack;
    node_stack.push_back(Cijk->getHeadNode());
    index_stack.push_back(0);
    Teuchos::RCP<const node_type> node;
    int child_index;
    Teuchos::Array< Teuchos::RCP<const node_type> > partition_stack;
    int my_level = 0;
    while (node_stack.size() > 0) {
      node = node_stack.back();
      child_index = index_stack.back();

      // Leaf -- If we got here, just push this node into the partitions
      if (node->is_leaf) {
        partition_stack.push_back(node);
        node_stack.pop_back();
        index_stack.pop_back();
        --my_level;
      }

      // Put nodes into partition if level matches
      else if (my_level == level) {
        partition_stack.push_back(node);
        node_stack.pop_back();
        index_stack.pop_back();
        --my_level;
      }

      // More children to process -- process them first
      else if (child_index < node->children.size()) {
        ++index_stack.back();
        node = node->children[child_index];
        node_stack.push_back(node);
        index_stack.push_back(0);
        ++my_level;
      }

      // No more children
      else {
        node_stack.pop_back();
        index_stack.pop_back();
        --my_level;
      }

    }

    // Print statistics
    int max_i_size = 0, max_j_size = 0, max_k_size = 0;
    for (int part=0; part<partition_stack.size(); ++part) {
      node = partition_stack[part];
      if (node->i_size > max_i_size) max_i_size = node->i_size;
      if (node->j_size > max_j_size) max_j_size = node->j_size;
      if (node->k_size > max_k_size) max_k_size = node->k_size;
    }
    std::cout << "num partitions = " << partition_stack.size() << std::endl
              << "max i size = " << max_i_size << std::endl
              << "max j size = " << max_j_size << std::endl
              << "max k size = " << max_k_size << std::endl;

    // Build flat list of (i,j,k,part) tuples
    typedef Stokhos::ProductBasisUtils::Cijk_1D_Iterator<int> Cijk_Iterator;
    Teuchos::Array< Teuchos::Array<int> > tuples;
     for (int part=0; part<partition_stack.size(); ++part) {
       node = partition_stack[part];
       node_stack.push_back(node);
       index_stack.push_back(0);
       while (node_stack.size() > 0) {
         node = node_stack.back();
         child_index = index_stack.back();

         // Leaf -- store (i,j,k,part) tuples
         if (node->is_leaf) {
           Cijk_Iterator cijk_iterator(node->p_i,
                                       node->p_j,
                                       node->p_k,
                                       symmetric);
           bool more = true;
           while (more) {
             Teuchos::Array<int> t(4);
             int I = node->i_begin + cijk_iterator.i;
             int J = node->j_begin + cijk_iterator.j;
             int K = node->k_begin + cijk_iterator.k;
             t[0] = I;
             t[1] = J;
             t[2] = K;
             t[3] = part;
             tuples.push_back(t);
              more = cijk_iterator.increment();
           }
           node_stack.pop_back();
           index_stack.pop_back();
         }

         // More children to process -- process them first
         else if (child_index < node->children.size()) {
           ++index_stack.back();
           node = node->children[child_index];
           node_stack.push_back(node);
           index_stack.push_back(0);
         }

         // No more children
         else {
           node_stack.pop_back();
           index_stack.pop_back();
         }

       }
    }

    // Print full 3-tensor to file
    if (save_3tensor) {
      std::ofstream cijk_file(file_3tensor.c_str());
      cijk_file.precision(14);
      cijk_file.setf(std::ios::scientific);
      cijk_file << "i, j, k, part" << std::endl;
      for (int i=0; i<tuples.size(); ++i) {
        cijk_file << tuples[i][0] << ", "
                  << tuples[i][1] << ", "
                  << tuples[i][2] << ", "
                  << tuples[i][3] << std::endl;
      }
      cijk_file.close();
    }

  }
  catch (std::exception& e) {
    std::cout << e.what() << std::endl;
  }

  return 0;
}
Example #17
0
int
main (int argc, char *argv[])
{
    int rc;

    // command-line arguments
    int retries = 0;
    int sig = 0;
    int timeout = 1000;
    log_level debug_level = LOG_ERROR;
    string logfile("");

    nssi_service svc;
    char my_url[NSSI_URL_LEN];

    std::string server_url("");
    char        server_str[NSSI_URL_LEN];
    std::string contact_file("");   /* the file where the server's url should be written */


    try {
        Teuchos::CommandLineProcessor parser;

        // init parser
        parser.setDocString("Kill an NSSI server");

        parser.setOption("verbose", (int *)(&debug_level), "Debug level.");
        parser.setOption("logfile", &logfile, "Path to file for debug statements");
        parser.setOption("server-url", &server_url, "URL of NSSI service");
        parser.setOption("contact-file", &contact_file, "Where to read the server's URL");
        parser.setOption("timeout", &timeout, "Timout for contacting services (ms)");
        parser.setOption("retries", &retries, "Number of times to retry before exiting");
        parser.setOption("sig", &sig, "Signal to use for the kill command");

        parser.recogniseAllOptions();
        parser.throwExceptions();

        Teuchos::CommandLineProcessor::EParseCommandLineReturn
        parseReturn= parser.parse( argc, argv );
        if( parseReturn == Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED ) {
            return 0;
        }
        if( parseReturn != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL   ) {
            return 1; // Error!
        }
    }
    catch (...) {
        exit(-1);
    }

    /* initialize the logger */
    logger_init(debug_level, logfile.c_str());

    if (server_url.c_str()[0]=='\0') {
        sleep(1);
        log_debug(debug_level, "reading URL from file");
        read_contact_info(contact_file.c_str(), server_str, NSSI_URL_LEN);
    } else {
        log_debug(debug_level, "using URL from command-line");
        strcpy(server_str, server_url.c_str());
    }

    nssi_rpc_init(NSSI_DEFAULT_TRANSPORT, NSSI_DEFAULT_ENCODE, NULL);

    nssi_get_url(NSSI_DEFAULT_TRANSPORT, my_url, NSSI_URL_LEN);

//    sleep(1);
    log_info(debug_level, "\nTrying to get service at %s", server_str);

    rc=nssi_get_service(NSSI_DEFAULT_TRANSPORT, server_str, timeout, &svc);
    if (rc != NSSI_OK) {
        log_error(admin_debug_level, "could not get svc description: %s",
                nssi_err_str(rc));
        return rc;
    }
    rc = kill_svc(&svc, sig, timeout);
    if (rc == NSSI_ETIMEDOUT) {
        fprintf(stderr, "Timed out trying to kill (%s)\n",
                server_url.c_str());
        return rc;
    }
    else if (rc != NSSI_OK) {
        log_error(admin_debug_level, "failed to kill service: %s",
                nssi_err_str(rc));
        return rc;
    }

    nssi_rpc_fini(NSSI_DEFAULT_TRANSPORT);

    return 0;
}
int main (int argc, char *argv[]) {

  Teuchos::CommandLineProcessor clp;
  clp.setDocString("Tacho::DenseMatrixBase examples on Pthreads execution space.\n");

  int nthreads = 0;
  clp.setOption("nthreads", &nthreads, "Number of threads");

  // int numa = 0;
  // clp.setOption("numa", &numa, "Number of numa node");

  // int core_per_numa = 0;
  // clp.setOption("core-per-numa", &core_per_numa, "Number of cores per numa node");

  bool verbose = false;
  clp.setOption("enable-verbose", "disable-verbose", &verbose, "Flag for verbose printing");

  std::string file_input = "test.mtx";
  clp.setOption("file-input", &file_input, "Input file (MatrixMarket SPD matrix)");

  int treecut = 0;
  clp.setOption("treecut", &treecut, "Level to cut tree from bottom");

  int prunecut = 0;
  clp.setOption("prunecut", &prunecut, "Level to prune tree from bottom");

  int fill_level = -1;
  clp.setOption("fill-level", &fill_level, "Fill level");

  int rows_per_team = 4096;
  clp.setOption("rows-per-team", &rows_per_team, "Workset size");

  int max_concurrency = 250000;
  clp.setOption("max-concurrency", &max_concurrency, "Max number of concurrent tasks");

  int max_task_dependence = 3;
  clp.setOption("max-task-dependence", &max_task_dependence, "Max number of task dependence");

  int team_size = 1;
  clp.setOption("team-size", &team_size, "Team size");

  int nrhs = 1;
  clp.setOption("nrhs", &team_size, "# of right hand side");

  int mb = 0;
  clp.setOption("mb", &mb, "Dense nested blocks size");

  int nb = 1;
  clp.setOption("nb", &nb, "Column block size of right hand side");

  clp.recogniseAllOptions(true);
  clp.throwExceptions(false);

  Teuchos::CommandLineProcessor::EParseCommandLineReturn r_parse= clp.parse( argc, argv );

  if (r_parse == Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED) return 0;
  if (r_parse != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL  ) return -1;

  int r_val = 0;
  {
    exec_space::initialize(nthreads);

#if (defined(HAVE_SHYLUTACHO_SCOTCH) && defined(HAVE_SHYLUTACHO_CHOLMOD))
    r_val = exampleCholSuperNodesByBlocks<exec_space>
      (file_input, 
       treecut, prunecut, fill_level, rows_per_team, 
       max_concurrency, max_task_dependence, team_size,
       nrhs, mb, nb,
       verbose);
#else
    r_val = -1;
    std::cout << "Scotch or Cholmod is NOT configured in Trilinos" << std::endl;
#endif

    exec_space::finalize();
  }
  
  return r_val;
}
int main (int argc, char *argv[]) {

  Teuchos::CommandLineProcessor clp;
  clp.setDocString("This example program show blockwise information on Kokkos::Serial execution space.\n");

  bool verbose = false;
  clp.setOption("enable-verbose", "disable-verbose", &verbose, "Flag for verbose printing");

  string file_input = "test.mtx";
  clp.setOption("file-input", &file_input, "Input file (MatrixMarket SPD matrix)");

  int fill_level = 0;
  clp.setOption("fill-level", &fill_level, "Fill level");

  int league_size = 1;
  clp.setOption("league-size", &league_size, "League size");

  int treecut = 15;
  clp.setOption("treecut", &treecut, "Level to cut tree from bottom");

  int minblksize = 0;
  clp.setOption("minblksize", &minblksize, "Minimum block size for internal reordering");

  int prunecut = 0;
  clp.setOption("prunecut", &prunecut, "Level to prune the tree from bottom");

  int seed = 0;
  clp.setOption("seed", &seed, "Seed for random number generator in graph partition");

  int histogram_size = 0;
  clp.setOption("histogram-size", &histogram_size, "Histogram size");

  clp.recogniseAllOptions(true);
  clp.throwExceptions(false);

  Teuchos::CommandLineProcessor::EParseCommandLineReturn r_parse= clp.parse( argc, argv );

  if (r_parse == Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED) return 0;
  if (r_parse != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL  ) return -1;
  
  int r_val = 0;
  {
    Kokkos::initialize();
    
    r_val = exampleStatByBlocks
      <value_type,ordinal_type,size_type,exec_space,void>
      (file_input,                                                                    
       treecut,                                                                          
       minblksize,
       prunecut,
       seed,                                                                             
       fill_level,                                                                       
       league_size,                                                                      
       histogram_size,                                                                      
       verbose);
    
    Kokkos::finalize();
  }

  return r_val;
}
Example #20
0
int main(int argc, char *argv[])
{
  typedef int                 IndexType;
  typedef double              ValueType;
  typedef cusp::device_memory MemorySpace;
  //typedef cusp::row_major     Orientation;

  bool success = true;
  bool verbose = false;
  try {

    // Setup command line options
    Teuchos::CommandLineProcessor CLP;
    CLP.setDocString("This test performance of block multiply routines.\n");
    IndexType n = 32;
    CLP.setOption("n", &n, "Number of mesh points in the each direction");
    IndexType nrhs_begin = 32;
    CLP.setOption("begin", &nrhs_begin,
                  "Staring number of right-hand-sides");
    IndexType nrhs_end = 512;
    CLP.setOption("end", &nrhs_end,
                  "Ending number of right-hand-sides");
    IndexType nrhs_step = 32;
    CLP.setOption("step", &nrhs_step,
                  "Increment in number of right-hand-sides");
    IndexType nits = 10;
    CLP.setOption("nits", &nits,
                  "Number of multiply iterations");
    int device_id = 0;
    CLP.setOption("device", &device_id, "CUDA device ID");
    CLP.parse( argc, argv );

    // Set CUDA device
    cudaSetDevice(device_id);
    cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);

    // create 3D Poisson problem
    cusp::csr_matrix<IndexType, ValueType, MemorySpace> A;
    cusp::gallery::poisson27pt(A, n, n, n);

    std::cout << "nrhs , num_rows , num_entries , row_time , row_gflops , "
              << "col_time , col_gflops" << std::endl;

    for (IndexType nrhs = nrhs_begin; nrhs <= nrhs_end; nrhs += nrhs_step) {

      double flops =
        2.0 * static_cast<double>(A.num_entries) * static_cast<double>(nrhs);

      // test row-major storage
      cusp::array2d<ValueType, MemorySpace, cusp::row_major> x_row(
        A.num_rows, nrhs, 1);
      cusp::array2d<ValueType, MemorySpace, cusp::row_major> y_row(
        A.num_rows, nrhs, 0);

      cusp::detail::timer row_timer;
      row_timer.start();
      for (IndexType iter=0; iter<nits; ++iter) {
        cusp::MVmultiply(A, x_row, y_row);
      }
      cudaDeviceSynchronize();
      double row_time = row_timer.seconds_elapsed() / nits;
      double row_gflops = 1.0e-9 * flops / row_time;

      // test column-major storage
      cusp::array2d<ValueType, MemorySpace, cusp::column_major> x_col(
        A.num_rows, nrhs, 1);
      cusp::array2d<ValueType, MemorySpace, cusp::column_major> y_col(
        A.num_rows, nrhs, 0);

      cusp::detail::timer col_timer;
      col_timer.start();
      for (IndexType iter=0; iter<nits; ++iter) {
        cusp::MVmultiply(A, x_col, y_col);
      }
      cudaDeviceSynchronize();
      double col_time = col_timer.seconds_elapsed() / nits;
      double col_gflops = 1.0e-9 * flops / col_time;

      std::cout << nrhs << " , "
                << A.num_rows << " , "  << A.num_entries << " , "
                << row_time << " , " << row_gflops << " , "
                << col_time << " , " << col_gflops
                << std::endl;

    }

  }
  TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success);

  if (success)
    return 0;
  return -1;
}
Example #21
0
int main(int argc, char *argv[])
{
    int np=1, rank=0;
    int splitrank, splitsize;
    int rc = 0;
    nssi_service multicast_svc[2];

    int transport_index=-1;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &np);

    MPI_Barrier(MPI_COMM_WORLD);

    Teuchos::oblackholestream blackhole;
    std::ostream &out = ( rank == 0 ? std::cout : blackhole );

    struct multicast_args args;

    const int num_io_methods = 6;
    const int io_method_vals[] = {
            MULTICAST_EMPTY_REQUEST_SYNC, MULTICAST_EMPTY_REQUEST_ASYNC,
            MULTICAST_GET_SYNC,           MULTICAST_GET_ASYNC,
            MULTICAST_PUT_SYNC,           MULTICAST_PUT_ASYNC};
    const char * io_method_names[] = {
            "empty-request-sync", "empty-request-async",
            "get-sync",           "get-async",
            "put-sync",           "put-async"};

    const int nssi_transport_list[] = {
            NSSI_RPC_PTL,
            NSSI_RPC_PTL,
            NSSI_RPC_IB,
            NSSI_RPC_IB,
            NSSI_RPC_GEMINI,
            NSSI_RPC_GEMINI,
            NSSI_RPC_BGPDCMF,
            NSSI_RPC_BGPDCMF,
            NSSI_RPC_BGQPAMI,
            NSSI_RPC_BGQPAMI,
            NSSI_RPC_MPI};

    const int num_nssi_transports = 11;
    const int nssi_transport_vals[] = {
            0,
            1,
            2,
            3,
            4,
            5,
            6,
            7,
            8,
            9,
            10
            };
    const char * nssi_transport_names[] = {
            "portals",
            "ptl",
            "infiniband",
            "ib",
            "gemini",
            "gni",
            "bgpdcmf",
            "dcmf",
            "bgqpami",
            "pami",
            "mpi"
    };


    // Initialize arguments
    args.transport=NSSI_DEFAULT_TRANSPORT;
    args.delay = 1;
    args.io_method = MULTICAST_EMPTY_REQUEST_SYNC;
    args.debug_level = LOG_WARN;
    args.num_trials = 1;
    args.num_reqs = 1;
    args.len = 1;
    args.result_file_mode = "a";
    args.result_file = "";
    args.url_file[0] = "";
    args.url_file[1] = "";
    args.logfile = "";
    args.client_flag = true;
    args.server_flag = true;
    args.timeout = 500;
    args.num_retries = 5;
    args.validate_flag = true;
    args.server_url[0] = "";
    args.server_url[1] = "";

    bool success = true;

    /**
     * We make extensive use of the \ref Teuchos::CommandLineProcessor for command-line
     * options to control the behavior of the test code.   To evaluate performance,
     * the "num-trials", "num-reqs", and "len" options control the amount of data transferred
     * between client and server.  The "io-method" selects the type of data transfer.  The
     * server-url specifies the URL of the server.  If running as a server, the server-url
     * provides a recommended URL when initializing the network transport.
     */
    try {

        //out << Teuchos::Teuchos_Version() << std::endl << std::endl;

        // Creating an empty command line processor looks like:
        Teuchos::CommandLineProcessor parser;
        parser.setDocString(
                "This example program demonstrates a simple data-transfer service "
                "built using the NEtwork Scalable Service Interface (Nessie)."
        );

        /* To set and option, it must be given a name and default value.  Additionally,
           each option can be given a help std::string.  Although it is not necessary, a help
           std::string aids a users comprehension of the acceptable command line arguments.
           Some examples of setting command line options are:
         */

        parser.setOption("delay", &args.delay, "time(s) for client to wait for server to start" );
        parser.setOption("timeout", &args.timeout, "time(ms) to wait for server to respond" );
        parser.setOption("server", "no-server", &args.server_flag, "Run the server" );
        parser.setOption("client", "no-client", &args.client_flag, "Run the client");
        parser.setOption("len", &args.len, "The number of structures in an input buffer");
        parser.setOption("debug",(int*)(&args.debug_level), "Debug level");
        parser.setOption("logfile", &args.logfile, "log file");
        parser.setOption("num-trials", &args.num_trials, "Number of trials (experiments)");
        parser.setOption("num-reqs", &args.num_reqs, "Number of reqs/trial");
        parser.setOption("result-file", &args.result_file, "Where to store results");
        parser.setOption("result-file-mode", &args.result_file_mode, "Write mode for the result");
        parser.setOption("server-url-1", &args.server_url[0], "URL client uses to find the server 1");
        parser.setOption("server-url-2", &args.server_url[1], "URL client uses to find the server 2");
        parser.setOption("server-url-file-1", &args.url_file[0], "File that has URL client uses to find server 1");
        parser.setOption("server-url-file-2", &args.url_file[1], "File that has URL client uses to find server 2");
        parser.setOption("validate", "no-validate", &args.validate_flag, "Validate the data");

        // Set an enumeration command line option for the io_method

        parser.setOption("io-method", &args.io_method, num_io_methods, io_method_vals, io_method_names,
                "I/O Methods for the example: \n"
                "\t\t\tempty-request-sync : Send an empty request - synchronous\n"
                "\t\t\tempty-request-async: Send an empty request - asynchronous\n"
                "\t\t\tget-sync : Servers pull data from client - synchronous\n"
                "\t\t\tget-async: Servers pull data from client - asynchronous\n"
                "\t\t\tput-sync : Servers push data from client - synchronous\n"
                "\t\t\tput-async: Servers push data from client - asynchronous"
                );

        // Set an enumeration command line option for the NNTI transport
        parser.setOption("transport", &transport_index, num_nssi_transports, nssi_transport_vals, nssi_transport_names,
                "NSSI transports (not all are available on every platform): \n"
                "\t\t\tportals|ptl    : Cray or Schutt\n"
                "\t\t\tinfiniband|ib  : libibverbs\n"
                "\t\t\tgemini|gni     : Cray libugni (Gemini or Aries)\n"
                "\t\t\tbgpdcmf|dcmf   : IBM BG/P DCMF\n"
                "\t\t\tbgqpami|pami   : IBM BG/Q PAMI\n"
                "\t\t\tmpi            : isend/irecv implementation\n"
                );



        /* There are also two methods that control the behavior of the
           command line processor.  First, for the command line processor to
           allow an unrecognized a command line option to be ignored (and
           only have a warning printed), use:
         */
        parser.recogniseAllOptions(true);

        /* Second, by default, if the parser finds a command line option it
           doesn't recognize or finds the --help option, it will throw an
           std::exception.  If you want prevent a command line processor from
           throwing an std::exception (which is important in this program since
           we don't have an try/catch around this) when it encounters a
           unrecognized option or help is printed, use:
         */
        parser.throwExceptions(false);

        /* We now parse the command line where argc and argv are passed to
           the parse method.  Note that since we have turned off std::exception
           throwing above we had better grab the return argument so that
           we can see what happened and act accordingly.
         */
        Teuchos::CommandLineProcessor::EParseCommandLineReturn parseReturn= parser.parse( argc, argv );

        if( parseReturn == Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED ) {
            return 0;
        }

        if( parseReturn != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL   ) {
            return 1; // Error!

        }

        // Here is where you would use these command line arguments but for this example program
        // we will just print the help message with the new values of the command-line arguments.
        //if (rank == 0)
        //    out << "\nPrinting help message with new values of command-line arguments ...\n\n";

        //parser.printHelpMessage(argv[0],out);

    }

    TEUCHOS_STANDARD_CATCH_STATEMENTS(true,std::cerr,success);

    log_debug(LOG_ALL, "transport_index=%d", transport_index);
    if (transport_index > -1) {
    	args.transport     =nssi_transport_list[transport_index];
    	args.transport_name=std::string(nssi_transport_names[transport_index]);
    }
    args.io_method_name=io_method_names[args.io_method];

    log_debug(args.debug_level, "%d: Finished processing arguments", rank);


    if (!success) {
        MPI_Abort(MPI_COMM_WORLD, 1);
    }


    if (!args.server_flag && args.client_flag) {
        /* initialize logger */
        if (args.logfile.empty()) {
            logger_init(args.debug_level, NULL);
        } else {
            char fn[1024];
            sprintf(fn, "%s.client.%03d.log", args.logfile.c_str(), rank);
            logger_init(args.debug_level, fn);
        }
    } else if (args.server_flag && !args.client_flag) {
        /* initialize logger */
        if (args.logfile.empty()) {
            logger_init(args.debug_level, NULL);
        } else {
            char fn[1024];
            sprintf(fn, "%s.server.%03d.log", args.logfile.c_str(), rank);
            logger_init(args.debug_level, fn);
        }
    } else if (args.server_flag && args.client_flag) {
        /* initialize logger */
        if (args.logfile.empty()) {
            logger_init(args.debug_level, NULL);
        } else {
            char fn[1024];
            sprintf(fn, "%s.%03d.log", args.logfile.c_str(), rank);
            logger_init(args.debug_level, fn);
        }
    }

    log_level debug_level = args.debug_level;

    // Communicator used for both client and server (may split if using client and server)
    MPI_Comm comm;

    log_debug(debug_level, "%d: Starting multicast-service test", rank);

    /**
     * Since this test can be run as a server, client, or both, we need to play some fancy
     * MPI games to get the communicators working correctly.  If we're executing as both
     * a client and a server, we split the communicator so that the client thinks its
     * running by itself.
     */
    if (args.client_flag && args.server_flag) {
        if (np < 3) {
            log_error(debug_level, "Must use at least 3 MPI processes for client and server mode");
            MPI_Abort(MPI_COMM_WORLD, -1);
        }

        // Split the communicators. Processors with color=0 are servers.

        int color = ((rank == 0)||(rank == 1)) ? 0 : 1; // two server
        MPI_Comm_split(MPI_COMM_WORLD, color, rank, &comm);

        MPI_Comm_rank(comm, &splitrank);
        MPI_Comm_size(comm, &splitsize);

        //    std::cout << "rank=" << rank << "/" << np << ", color=" << color <<
        //            ", new_rank=" << newrank << "/" << newsize << std::endl << std::endl;
        //
        //    std::cout << "my_url=" << my_url <<  ", server_url=" << args.server_url << std::endl;
    }
    else {
        MPI_Comm_dup(MPI_COMM_WORLD, &comm);
    }

    /**
     * Initialize the Nessie interface by specifying a transport, encoding scheme, and a
     * recommended URL.  \ref NSSI_DEFAULT_TRANSPORT is usually the best choice, since it
     * is often the case that only one type of transport exists on a particular platform.
     * Currently supported transports are \ref NSSI_RPC_PTL, \ref NSSI_RPC_GNI, and
     * \ref NSSI_RPC_IB.  We only support one type of encoding scheme so NSSI_DEFAULT_ENCODE
     * should always be used for the second argument.   The URL can be specified (as we did for
     * the server, or NULL (as we did for the client).  This is a recommended value.  Use the
     * \ref nssi_get_url function to find the actual value.
     */
    if (args.server_flag && !args.server_url[rank].empty()) {
        // use the server URL as suggested URL
        nssi_rpc_init((nssi_rpc_transport)args.transport, NSSI_DEFAULT_ENCODE, args.server_url[rank].c_str());
    }
    else {
        nssi_rpc_init((nssi_rpc_transport)args.transport, NSSI_DEFAULT_ENCODE, NULL);
    }

    // Get the Server URL
    std::string my_url(NSSI_URL_LEN, '\0');
    nssi_get_url((nssi_rpc_transport)args.transport, &my_url[0], NSSI_URL_LEN);

    // Broadcast the server URL to all the clients
    args.server_url[0].resize(NSSI_URL_LEN, '\0');
    args.server_url[1].resize(NSSI_URL_LEN, '\0');
    if (args.server_flag && args.client_flag) {
        args.server_url[0] = my_url;
        MPI_Bcast(&args.server_url[0][0], args.server_url[0].size(), MPI_CHAR, 0, MPI_COMM_WORLD);
        args.server_url[1] = my_url;
        MPI_Bcast(&args.server_url[1][0], args.server_url[1].size(), MPI_CHAR, 1, MPI_COMM_WORLD);
    }

    else if (!args.server_flag && args.client_flag){
        if (args.server_url[0].empty()) {

            // check to see if we're supposed to get the URL from a file
            if (!args.url_file[0].empty()) {
                // Fetch the server URL from a file
                sleep(1);
                log_debug(debug_level, "Reading from file %s", args.url_file[0].c_str());
                std::ifstream urlfile (args.url_file[0].c_str());
                if (urlfile.is_open()) {
                    if (urlfile.good())
                        getline(urlfile, args.server_url[0]);
                }
                else {
                    log_error(debug_level, "Failed to open server_url_file=%s", args.url_file[0].c_str());
                    exit(1);
                }
                urlfile.close();
                log_debug(debug_level, "URL = %s", args.server_url[0].c_str());
            }
            else {
                log_error(debug_level, "Need to set --server-url-1=[ADDR] or --server-url-file-1=[PATH]");
            }
        }
        if (args.server_url[1].empty()) {

            // check to see if we're supposed to get the URL from a file
            if (!args.url_file[1].empty()) {
                // Fetch the server URL from a file
                sleep(1);
                log_debug(debug_level, "Reading from file %s", args.url_file[1].c_str());
                std::ifstream urlfile (args.url_file[1].c_str());
                if (urlfile.is_open()) {
                    if (urlfile.good())
                        getline(urlfile, args.server_url[1]);
                }
                else {
                    log_error(debug_level, "Failed to open server_url_file=%s", args.url_file[1].c_str());
                    exit(1);
                }
                urlfile.close();
                log_debug(debug_level, "URL = %s", args.server_url[1].c_str());
            }
            else {
                log_error(debug_level, "Need to set --server-url-1=[ADDR] or --server-url-file-1=[PATH]");
            }
        }
    }

    else if (args.server_flag && !args.client_flag) {
        args.server_url[0] = my_url;
        // If the url_file value is set, write the url to a file
        if (!args.url_file[0].empty()) {
            std::ofstream urlfile (args.url_file[0].c_str());
            if (urlfile.is_open()) {
                urlfile << args.server_url[0].c_str() << std::endl;
            }
            urlfile.close();
            log_debug(debug_level, "Wrote url to file %s", args.url_file[0].c_str());
        }

        args.server_url[1] = my_url;
        // If the url_file value is set, write the url to a file
        if (!args.url_file[1].empty()) {
            std::ofstream urlfile (args.url_file[1].c_str());
            if (urlfile.is_open()) {
                urlfile << args.server_url[1].c_str() << std::endl;
            }
            urlfile.close();
            log_debug(debug_level, "Wrote url to file %s", args.url_file[1].c_str());
        }
    }



    // Set the debug level for the multicast service.
    multicast_debug_level = args.debug_level;

    // Print the arguments after they've all been set.
    print_args(out, args, "%");


    //------------------------------------------------------------------------------
    /** If we're running this job with a server, the server always executes on nodes 0 and 1.
     *  In this example, the server is two process.
     */
    if (args.server_flag && ((rank == 0)|(rank == 1))) {
        rc = multicast_server_main(args, comm);
        log_debug(debug_level, "Server is finished");
    }

    // ------------------------------------------------------------------------------
     /**  The parallel client will execute this branch.  The root node, nodes 0 and 1, of the client connects
      *   connects with the server, using the \ref nssi_get_service function.  Then the root
      *   broadcasts the service description to the other clients before starting the main
      *   loop of the client code by calling \ref multicast_client_main.
      */
    else {
        int i;
        int client_rank;

        // get rank within the client communicator
        MPI_Comm_rank(comm, &client_rank);

        nssi_init((nssi_rpc_transport)args.transport);

        // Only one process needs to connect to the service
        // TODO: Make get_service a collective call (some transports do not need a connection)
        //if (client_rank == 0) {
        {

            sleep(args.delay);  // give server time to get started

            // connect to remote server
            for (i=0; i < args.num_retries; i++) {
                log_debug(debug_level, "Try to connect to server: attempt #%d", i);
                rc=nssi_get_service((nssi_rpc_transport)args.transport, args.server_url[0].c_str(), args.timeout, &multicast_svc[0]);
                if (rc == NSSI_OK)
                    break;
                else if (rc != NSSI_ETIMEDOUT) {
                    log_error(multicast_debug_level, "could not get svc description: %s",
                            nssi_err_str(rc));
                    break;
                }
            }
            // connect to remote server
            for (i=0; i < args.num_retries; i++) {
                log_debug(debug_level, "Try to connect to server: attempt #%d", i);
                rc=nssi_get_service((nssi_rpc_transport)args.transport, args.server_url[1].c_str(), args.timeout, &multicast_svc[1]);
                if (rc == NSSI_OK)
                    break;
                else if (rc != NSSI_ETIMEDOUT) {
                    log_error(multicast_debug_level, "could not get svc description: %s",
                            nssi_err_str(rc));
                    break;
                }
            }
        }

        //MPI_Bcast(&rc, 1, MPI_INT, 0, comm);

        if (rc == NSSI_OK) {
            if (client_rank == 0) log_debug(debug_level, "Connected to service on attempt %d\n", i);

            // Broadcast the service description to the other clients
            //log_debug(multicast_debug_level, "Bcasting svc to other clients");
            //MPI_Bcast(&multicast_svc, sizeof(nssi_service), MPI_BYTE, 0, comm);

            log_debug(debug_level, "Starting client main");
            // Start the client code
            multicast_client_main(args, &multicast_svc[0], comm);


            MPI_Barrier(comm);

            // Tell one of the clients to kill the server
            if (client_rank == 0) {
                log_debug(debug_level, "%d: Halting multicast service", rank);
                rc = nssi_kill(&multicast_svc[0], 0, 5000);
                rc = nssi_kill(&multicast_svc[1], 0, 5000);
            }
        }

        else {
            if (client_rank == 0)
                log_error(debug_level, "Failed to connect to service after %d attempts: ABORTING", i);
            success = false;
            //MPI_Abort(MPI_COMM_WORLD, -1);
        }

        nssi_fini((nssi_rpc_transport)args.transport);

    }

    log_debug(debug_level, "%d: clean up nssi", rank);
    MPI_Barrier(MPI_COMM_WORLD);

    // Clean up nssi_rpc
    rc = nssi_rpc_fini((nssi_rpc_transport)args.transport);
    if (rc != NSSI_OK)
        log_error(debug_level, "Error in nssi_rpc_fini");

    log_debug(debug_level, "%d: MPI_Finalize()", rank);
    MPI_Finalize();

    logger_fini();

    if(success && (rc == NSSI_OK))
      out << "\nEnd Result: TEST PASSED" << std::endl;
    else
        out << "\nEnd Result: TEST FAILED" << std::endl;

    return ((success && (rc==NSSI_OK)) ? 0 : 1 );
}
Example #22
0
int main(int argc, char *argv[])
{
  // Create output stream. (Handy for multicore output.)
  auto out = Teuchos::VerboseObjectBase::getDefaultOStream();

  Teuchos::GlobalMPISession session(&argc, &argv, NULL);

  auto comm = Teuchos::DefaultComm<int>::getComm();

  // Wrap the whole code in a big try-catch-statement.
  bool success = true;
  try {
    // =========================================================================
    // Handle command line arguments.
    // Boost::program_options is somewhat more complete here (e.g. you can
    // specify options without the "--" syntax), but it isn't less complicated
    // to use. Stick with Teuchos for now.
    Teuchos::CommandLineProcessor myClp;

    myClp.setDocString(
      "Numerical parameter continuation for nonlinear Schr\"odinger equations.\n"
    );

    std::string xmlInputPath = "";
    myClp.setOption("xml-input-file", &xmlInputPath,
                    "XML file containing the parameter list", true );

    // Print warning for unrecognized arguments and make sure to throw an
    // exception if something went wrong.
    //myClp.throwExceptions(false);
    //myClp.recogniseAllOptions ( true );

    // Finally, parse the command line.
    myClp.parse(argc, argv);

    // Retrieve Piro parameter list from given file.
    std::shared_ptr<Teuchos::ParameterList> piroParams(
        new Teuchos::ParameterList()
        );
    Teuchos::updateParametersFromXmlFile(
        xmlInputPath,
        Teuchos::rcp(piroParams).ptr()
        );
    // =======================================================================
    // Extract the location of input and output files.
    const Teuchos::ParameterList outputList =
      piroParams->sublist("Output", true);

    // Set default directory to be the directory of the XML file itself
    const std::string xmlDirectory =
      xmlInputPath.substr(0, xmlInputPath.find_last_of( "/" ) + 1);

    // By default, take the current directory.
    std::string prefix = "./";
    if (!xmlDirectory.empty())
      prefix = xmlDirectory + "/";

    const std::string outputDirectory = prefix;

    const std::string contFilePath =
      prefix + outputList.get<std::string>("Continuation data file name");

    Teuchos::ParameterList & inputDataList = piroParams->sublist("Input", true);

    const std::string inputExodusFile =
      prefix + inputDataList.get<std::string>("File");
    const int step = inputDataList.get<int>("Initial Psi Step");

    //const bool useBordering = piroParams->get<bool>("Bordering");
    // =======================================================================
    // Read the data from the file.
    auto mesh = std::make_shared<Nosh::StkMesh>(
        Teuchos::get_shared_ptr(comm),
        inputExodusFile,
        step
        );

    // Cast the data into something more accessible.
    auto psi = mesh->getComplexVector("psi");
    //psi->Random();

    // Set the output directory for later plotting with this.
    std::stringstream outputFile;
    outputFile << outputDirectory << "/solution.e";
    mesh->openOutputChannel(outputFile.str());

    // Create a parameter map from the initial parameter values.
    Teuchos::ParameterList initialParameterValues =
      piroParams->sublist("Initial parameter values", true);

    // Check if we need to interpret the time value stored in the file
    // as a parameter.
    const std::string & timeName =
      piroParams->get<std::string>("Interpret time as", "");
    if (!timeName.empty()) {
      initialParameterValues.set(timeName, mesh->getTime());
    }

    // Explicitly set the initial parameter value for this list.
    const std::string & paramName =
      piroParams->sublist( "LOCA" )
      .sublist( "Stepper" )
      .get<std::string>("Continuation Parameter");
    *out << "Setting the initial parameter value of \""
         << paramName << "\" to " << initialParameterValues.get<double>(paramName) << "." << std::endl;
    piroParams->sublist( "LOCA" )
    .sublist( "Stepper" )
    .set("Initial Value", initialParameterValues.get<double>(paramName));

    // Set the thickness field.
    auto thickness = std::make_shared<Nosh::ScalarField::Constant>(*mesh, 1.0);

    // Some alternatives for the positive-definite operator.
    // (a) -\Delta (Laplace operator with Neumann boundary)
    //const std::shared_ptr<Nosh::ParameterMatrix::Virtual> matrixBuilder =
    //  rcp(new Nosh::ParameterMatrix::Laplace(mesh, thickness));

    // (b) (-i\nabla-A)^2 (Kinetic energy of a particle in magnetic field)
    // (b1) 'A' explicitly given in file.
    const double mu = initialParameterValues.get<double>("mu");
    auto mvp = std::make_shared<Nosh::VectorField::ExplicitValues>(*mesh, "A", mu);

    //const std::shared_ptr<Nosh::ParameterMatrix::Virtual> keoBuilder(
    //    new Nosh::ParameterMatrix::Keo(mesh, thickness, mvp)
    //    );
    //const std::shared_ptr<Nosh::ParameterMatrix::Virtual> DKeoDPBuilder(
    //    new Nosh::ParameterMatrix::DKeoDP(mesh, thickness, mvp, "mu")
    //    );

    // (b2) 'A' analytically given (here with constant curl).
    //      Optionally add a rotation axis u. This is important
    //      if continuation happens as a rotation of the vector
    //      field around an axis.
    //const std::shared_ptr<DoubleVector> b = rcp(new DoubleVector(3));
    //std::shared_ptr<Teuchos::SerialDenseVector<int,double> > u = Teuchos::null;
    //if ( piroParams->isSublist("Rotation vector") )
    //{
    //    u = rcp(new Teuchos::SerialDenseVector<int,double>(3));
    //    Teuchos::ParameterList & rotationVectorList =
    //        piroParams->sublist( "Rotation vector", false );
    //    (*u)[0] = rotationVectorList.get<double>("x");
    //    (*u)[1] = rotationVectorList.get<double>("y");
    //    (*u)[2] = rotationVectorList.get<double>("z");
    //}
    //std::shared_ptr<Nosh::VectorField::Virtual> mvp =
    //  rcp(new Nosh::VectorField::ConstantCurl(mesh, b, u));
    //const std::shared_ptr<Nosh::ParameterMatrix::Virtual> matrixBuilder =
    //  rcp(new Nosh::ParameterMatrix::Keo(mesh, thickness, mvp));
    // (b3) 'A' analytically given in a class you write yourself, derived
    //      from Nosh::ParameterMatrix::Virtual.
    // [...]
    //
    // Setup the scalar potential V.
    // (a) A constant potential.
    //std::shared_ptr<Nosh::ScalarField::Virtual> sp =
    //rcp(new Nosh::ScalarField::Constant(*mesh, -1.0));
    //const double T = initialParameterValues.get<double>("T");
    // (b) With explicit values.
    //std::shared_ptr<Nosh::ScalarField::Virtual> sp =
    //rcp(new Nosh::ScalarField::ExplicitValues(*mesh, "V"));
    // (c) One you built yourself by deriving from Nosh::ScalarField::Virtual.
    auto sp = std::make_shared<MyScalarField>(mesh);


    const double g = initialParameterValues.get<double>("g");
    // Finally, create the model evaluator.
    // This is the most important object in the whole stack.
    auto modelEvaluator = std::make_shared<Nosh::ModelEvaluator::Nls>(
        mesh,
        mvp,
        sp,
        g,
        thickness,
        psi,
        "mu"
        );

    // Build the Piro model evaluator. It's used to hook up with
    // several different backends (NOX, LOCA, Rhythmos,...).
    std::shared_ptr<Thyra::ModelEvaluator<double>> piro;

    // Declare the eigensaver; it will be used only for LOCA solvers, though.
    std::shared_ptr<Nosh::SaveEigenData> glEigenSaver;

    // Switch by solver type.
    std::string & solver = piroParams->get<std::string>("Piro Solver");

    if (solver == "NOX") {
      auto observer = std::make_shared<Nosh::Observer>(modelEvaluator);

      piro = std::make_shared<Piro::NOXSolver<double>>(
            Teuchos::rcp(piroParams),
            Teuchos::rcp(modelEvaluator),
            Teuchos::rcp(observer)
            );
    } else if (solver == "LOCA") {
      auto observer = std::make_shared<Nosh::Observer>(
          modelEvaluator,
          contFilePath,
          piroParams->sublist("LOCA")
          .sublist("Stepper")
          .get<std::string>("Continuation Parameter")
          );

      // Setup eigen saver.
#ifdef HAVE_LOCA_ANASAZI
      bool computeEigenvalues = piroParams->sublist( "LOCA" )
                                .sublist( "Stepper" )
                                .get<bool>("Compute Eigenvalues");
      if (computeEigenvalues) {
        Teuchos::ParameterList & eigenList = piroParams->sublist("LOCA")
                                             .sublist("Stepper")
                                             .sublist("Eigensolver");
        std::string eigenvaluesFilePath =
          xmlDirectory + "/" + outputList.get<std::string> ( "Eigenvalues file name" );

        glEigenSaver = std::make_shared<Nosh::SaveEigenData>(
            eigenList,
            modelEvaluator,
            eigenvaluesFilePath
            );

        std::shared_ptr<LOCA::SaveEigenData::AbstractStrategy>
          glSaveEigenDataStrategy = glEigenSaver;
        eigenList.set("Save Eigen Data Method",
                      "User-Defined");
        eigenList.set("User-Defined Save Eigen Data Name",
                      "glSaveEigenDataStrategy");
        eigenList.set("glSaveEigenDataStrategy",
                      glSaveEigenDataStrategy);
      }
#endif
      // Get the solver.
      std::shared_ptr<Piro::LOCASolver<double>> piroLOCASolver(
          new Piro::LOCASolver<double>(
            Teuchos::rcp(piroParams),
            Teuchos::rcp(modelEvaluator),
            Teuchos::null
            //Teuchos::rcp(observer)
            )
          );

//      // Get stepper and inject it into the eigensaver.
//      std::shared_ptr<LOCA::Stepper> stepper = Teuchos::get_shared_ptr(
//          piroLOCASolver->getLOCAStepperNonConst()
//          );
//#ifdef HAVE_LOCA_ANASAZI
//      if (computeEigenvalues)
//        glEigenSaver->setLocaStepper(stepper);
//#endif
      piro = piroLOCASolver;
    }
#if 0
    else if ( solver == "Turning Point" ) {
      std::shared_ptr<Nosh::Observer> observer;

      Teuchos::ParameterList & bifList =
        piroParams->sublist("LOCA").sublist("Bifurcation");

      // Fetch the (approximate) null state.
      auto nullstateZ = mesh->getVector("null");

      // Set the length normalization vector to be the initial null vector.
      TEUCHOS_ASSERT(nullstateZ);
      auto lengthNormVec = Teuchos::rcp(new NOX::Thyra::Vector(*nullstateZ));
      //lengthNormVec->init(1.0);
      bifList.set("Length Normalization Vector", lengthNormVec);

      // Set the initial null vector.
      auto initialNullAbstractVec =
        Teuchos::rcp(new NOX::Thyra::Vector(*nullstateZ));
      // initialNullAbstractVec->init(1.0);
      bifList.set("Initial Null Vector", initialNullAbstractVec);

      piro = std::make_shared<Piro::LOCASolver<double>>(
            Teuchos::rcp(piroParams),
            Teuchos::rcp(modelEvaluator),
            Teuchos::null
            //Teuchos::rcp(observer)
            );
    }
#endif
    else {
      TEUCHOS_TEST_FOR_EXCEPT_MSG(
          true,
          "Unknown solver type \"" << solver << "\"."
          );
    }
    // ----------------------------------------------------------------------

    // Now the setting of inputs and outputs.
    Thyra::ModelEvaluatorBase::InArgs<double> inArgs = piro->createInArgs();
    inArgs.set_p(
        0,
        piro->getNominalValues().get_p(0)
        );

    // Set output arguments to evalModel call.
    Thyra::ModelEvaluatorBase::OutArgs<double> outArgs = piro->createOutArgs();

    // Now solve the problem and return the responses.
    const Teuchos::RCP<Teuchos::Time> piroSolveTime =
      Teuchos::TimeMonitor::getNewTimer("Piro total solve time");;
    {
      Teuchos::TimeMonitor tm(*piroSolveTime);
      piro->evalModel(inArgs, outArgs);
    }

    // Manually release LOCA stepper.
#ifdef HAVE_LOCA_ANASAZI
    if (glEigenSaver)
      glEigenSaver->releaseLocaStepper();
#endif

    // Print timing data.
    Teuchos::TimeMonitor::summarize();
  } catch (Teuchos::CommandLineProcessor::HelpPrinted) {
  } catch (Teuchos::CommandLineProcessor::ParseError) {
  }
  TEUCHOS_STANDARD_CATCH_STATEMENTS(true, *out, success);

  return success ? EXIT_SUCCESS : EXIT_FAILURE;
}
int main(int argc, char* argv[]) {
  int ierr = 0;

  try {
    double t, ta;
    int p = 2;
    int w = p+7;

    // Set up command line options
    Teuchos::CommandLineProcessor clp;
    clp.setDocString("This program tests the speed of various forward mode AD implementations for a single multiplication operation");
    int nderiv = 10;
    clp.setOption("nderiv", &nderiv, "Number of derivative components");
    int nloop = 1000000;
    clp.setOption("nloop", &nloop, "Number of loops");

    // Parse options
    Teuchos::CommandLineProcessor::EParseCommandLineReturn
      parseReturn= clp.parse(argc, argv);
    if(parseReturn != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL)
      return 1;

    // Memory pool & manager
    Sacado::Fad::MemPoolManager<double> poolManager(10);
    Sacado::Fad::MemPool* pool = poolManager.getMemoryPool(nderiv);
    Sacado::Fad::DMFad<double>::setDefaultPool(pool);

    std::cout.setf(std::ios::scientific);
    std::cout.precision(p);
    std::cout << "Times (sec) for nderiv = " << nderiv
	      << " nloop =  " << nloop << ":  " << std::endl;

    ta = do_time_analytic(nderiv, nloop);
    std::cout << "Analytic:  " << std::setw(w) << ta << std::endl;

    t = do_time< FAD::TFad<10,double> >(nderiv, nloop);
    std::cout << "TFad:      " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << std::endl;

    t = do_time< FAD::Fad<double> >(nderiv, nloop);
    std::cout << "Fad:       " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << std::endl;

    t = do_time< Sacado::Fad::SFad<double,10> >(nderiv, nloop);
    std::cout << "SFad:      " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << std::endl;

    t = do_time< Sacado::Fad::SLFad<double,10> >(nderiv, nloop);
    std::cout << "SLFad:     " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << std::endl;

    t = do_time< Sacado::Fad::DFad<double> >(nderiv, nloop);
    std::cout << "DFad:      " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << std::endl;

    t = do_time< Sacado::Fad::DMFad<double> >(nderiv, nloop);
    std::cout << "DMFad:     " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << std::endl;

    t = do_time< Sacado::ELRFad::SFad<double,10> >(nderiv, nloop);
    std::cout << "ELRSFad:   " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << std::endl;

    t = do_time< Sacado::ELRFad::SLFad<double,10> >(nderiv, nloop);
    std::cout << "ELRSLFad:  " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << std::endl;

    t = do_time< Sacado::ELRFad::DFad<double> >(nderiv, nloop);
    std::cout << "ELRDFad:   " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << std::endl;

    t = do_time< Sacado::CacheFad::DFad<double> >(nderiv, nloop);
    std::cout << "CacheFad:  " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << std::endl;

    t = do_time< Sacado::Fad::DVFad<double> >(nderiv, nloop);
    std::cout << "DVFad:     " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << std::endl;

  }
  catch (std::exception& e) {
    std::cout << e.what() << std::endl;
    ierr = 1;
  }
  catch (const char *s) {
    std::cout << s << std::endl;
    ierr = 1;
  }
  catch (...) {
    std::cout << "Caught unknown exception!" << std::endl;
    ierr = 1;
  }

  return ierr;
}
Example #24
0
int main(int argc, char *argv[])
{
    bool success = true;
    bool verbose = false;
    try {

        const size_t num_sockets = Kokkos::hwloc::get_available_numa_count();
        const size_t num_cores_per_socket =
            Kokkos::hwloc::get_available_cores_per_numa();
        const size_t num_threads_per_core =
            Kokkos::hwloc::get_available_threads_per_core();

        // Setup command line options
        Teuchos::CommandLineProcessor CLP;
        CLP.setDocString(
            "This test performance of MP::Vector multiply routines.\n");
        int nGrid = 32;
        CLP.setOption("n", &nGrid, "Number of mesh points in the each direction");
        int nIter = 10;
        CLP.setOption("ni", &nIter, "Number of multiply iterations");
        int ensemble_min = 4;
        CLP.setOption("emin", &ensemble_min, "Staring ensemble size");
        int ensemble_max = 24;
        CLP.setOption("emax", &ensemble_max, "Stoping ensemble size");
        int ensemble_step = 4;
        CLP.setOption("estep", &ensemble_step, "Ensemble increment");
#ifdef KOKKOS_HAVE_PTHREAD
        bool threads = true;
        CLP.setOption("threads", "no-threads", &threads, "Enable Threads device");
        int num_cores = num_cores_per_socket * num_sockets;
        CLP.setOption("cores", &num_cores,
                      "Number of CPU cores to use (defaults to all)");
        int num_hyper_threads = num_threads_per_core;
        CLP.setOption("hyperthreads", &num_hyper_threads,
                      "Number of hyper threads per core to use (defaults to all)");
#endif
#ifdef KOKKOS_HAVE_CUDA
        bool cuda = true;
        CLP.setOption("cuda", "no-cuda", &cuda, "Enable Cuda device");
        int device_id = 0;
        CLP.setOption("device", &device_id, "CUDA device ID");
#endif
        CLP.parse( argc, argv );

        typedef int Ordinal;
        typedef double Scalar;

#ifdef KOKKOS_HAVE_PTHREAD
        if (threads) {
            typedef Kokkos::Threads Device;

            Kokkos::Threads::initialize(num_cores*num_hyper_threads);

            std::cout << std::endl
                      << "Threads performance with " << num_cores*num_hyper_threads
                      << " threads:" << std::endl;

            performance_test_driver<Scalar,Ordinal,Device>(
                nGrid, nIter, ensemble_min, ensemble_max, ensemble_step);

            Kokkos::Threads::finalize();
        }
#endif

#ifdef KOKKOS_HAVE_CUDA
        if (cuda) {
            typedef Kokkos::Cuda Device;

            Kokkos::HostSpace::execution_space::initialize();
            Kokkos::Cuda::initialize(Kokkos::Cuda::SelectDevice(device_id));

            cudaDeviceProp deviceProp;
            cudaGetDeviceProperties(&deviceProp, device_id);
            std::cout << std::endl
                      << "CUDA performance for device " << device_id << " ("
                      << deviceProp.name << "):"
                      << std::endl;

            performance_test_driver<Scalar,Ordinal,Device>(
                nGrid, nIter, ensemble_min, ensemble_max, ensemble_step);

            Kokkos::HostSpace::execution_space::finalize();
            Kokkos::Cuda::finalize();
        }
#endif

    }
    TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success);

    if (success)
        return 0;
    return -1;
}
int main (int argc, char *argv[]) {

    Teuchos::CommandLineProcessor clp;
    clp.setDocString("This example program measure the performance of dense Herk on Kokkos::Threads execution space.\n");

    int nthreads = 0;
    clp.setOption("nthreads", &nthreads, "Number of threads");

    int numa = 0;
    clp.setOption("numa", &numa, "Number of numa node");

    int core_per_numa = 0;
    clp.setOption("core-per-numa", &core_per_numa, "Number of cores per numa node");

    int max_concurrency = 250000;
    clp.setOption("max-concurrency", &max_concurrency, "Max number of concurrent tasks");

    int memory_pool_grain_size = 16;
    clp.setOption("memory-pool-grain-size", &memory_pool_grain_size, "Memorypool chunk size (12 - 16)");

    int mkl_nthreads = 1;
    clp.setOption("mkl-nthreads", &mkl_nthreads, "MKL threads for nested parallelism");

    bool verbose = false;
    clp.setOption("enable-verbose", "disable-verbose", &verbose, "Flag for verbose printing");

    int mmin = 1000;
    clp.setOption("mmin", &mmin, "C(mmin,mmin)");

    int mmax = 8000;
    clp.setOption("mmax", &mmax, "C(mmax,mmax)");

    int minc = 1000;
    clp.setOption("minc", &minc, "Increment of m");

    int k = 1024;
    clp.setOption("k", &k, "A(mmax,k) or A(k,mmax) according to transpose flags");

    int mb = 256;
    clp.setOption("mb", &mb, "Blocksize");

    bool check = true;
    clp.setOption("enable-check", "disable-check", &check, "Flag for check solution");

    clp.recogniseAllOptions(true);
    clp.throwExceptions(false);

    Teuchos::CommandLineProcessor::EParseCommandLineReturn r_parse= clp.parse( argc, argv );

    if (r_parse == Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED) return 0;
    if (r_parse != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL  ) return -1;

    int r_val = 0;
    {
        exec_space::initialize(nthreads, numa, core_per_numa);

        std::cout << std::endl << "DenseHerkByBlocks:: Upper, ConjTranspose, Variant::One (external)" << std::endl;
        r_val = exampleDenseHerkByBlocks
                <Uplo::Upper,Trans::ConjTranspose,Variant::One,exec_space>
                (mmin, mmax, minc, k, mb,
                 max_concurrency, memory_pool_grain_size, mkl_nthreads,
                 check,
                 verbose);

        exec_space::finalize();
    }

    return r_val;
}
Example #26
0
int main(int argc, char **argv)
{
  try {

    // Initialize MPI
#ifdef HAVE_MPI
    MPI_Init(&argc,&argv);
#endif

    // Setup command line options
    Teuchos::CommandLineProcessor CLP;
    CLP.setDocString(
      "This example generates the sparsity pattern for the block stochastic Galerkin matrix.\n");
    int d = 3;
    CLP.setOption("dimension", &d, "Stochastic dimension");
    int p = 5;
    CLP.setOption("order", &p, "Polynomial order");
    double drop = 1.0e-12;
    CLP.setOption("drop", &drop, "Drop tolerance");
    std::string file = "A.mm";
    CLP.setOption("filename", &file, "Matrix Market filename");
    BasisType basis_type = LEGENDRE;
    CLP.setOption("basis", &basis_type, 
		  num_basis_types, basis_type_values, basis_type_names, 
		  "Basis type");
    Stokhos::GrowthPolicy growth_type = Stokhos::SLOW_GROWTH;
    CLP.setOption("growth", &growth_type, 
		  num_growth_types, growth_type_values, growth_type_names, 
		  "Growth type");
    ProductBasisType prod_basis_type = COMPLETE;
    CLP.setOption("product_basis", &prod_basis_type, 
		  num_prod_basis_types, prod_basis_type_values, 
		  prod_basis_type_names, 
		  "Product basis type");
    double alpha = 1.0;
    CLP.setOption("alpha", &alpha, "Jacobi alpha index");
    double beta = 1.0;
    CLP.setOption("beta", &beta, "Jacobi beta index");
    bool full = true;
    CLP.setOption("full", "linear", &full, "Use full or linear expansion");
    bool use_old = false;
    CLP.setOption("old", "new", &use_old, "Use old or new Cijk algorithm");
    int tile_size = 100;
    CLP.setOption("tile_size", &tile_size, "Tile size");

    // Parse arguments
    CLP.parse( argc, argv );

    // Basis
    Array< RCP<const Stokhos::OneDOrthogPolyBasis<int,double> > > bases(d); 
    for (int i=0; i<d; i++) {
      if (basis_type == HERMITE)
	bases[i] = Teuchos::rcp(new Stokhos::HermiteBasis<int,double>(
				  p, true, growth_type));
      else if (basis_type == LEGENDRE)
	bases[i] = Teuchos::rcp(new Stokhos::LegendreBasis<int,double>(
				  p, true, growth_type));
      else if (basis_type == CC_LEGENDRE)
	bases[i] = 
	  Teuchos::rcp(new Stokhos::ClenshawCurtisLegendreBasis<int,double>(
			 p, true));
      else if (basis_type == GP_LEGENDRE)
	bases[i] = 
	  Teuchos::rcp(new Stokhos::GaussPattersonLegendreBasis<int,double>(
			 p, true));
      else if (basis_type == RYS)
	bases[i] = Teuchos::rcp(new Stokhos::RysBasis<int,double>(
				  p, 1.0, true, growth_type));
      else if (basis_type == JACOBI)
	bases[i] = Teuchos::rcp(new Stokhos::JacobiBasis<int,double>(
				  p, alpha, beta, true, growth_type));
    }
    RCP<const Stokhos::ProductBasis<int,double> > basis;
    if (prod_basis_type == COMPLETE)
      basis = 
	Teuchos::rcp(new Stokhos::CompletePolynomialBasis<int,double>(
		       bases, drop, use_old));
    else if (prod_basis_type == TENSOR)
      basis = 
	Teuchos::rcp(new Stokhos::TensorProductBasis<int,double>(
		       bases, drop));
    else if (prod_basis_type == TOTAL)
      basis = 
	Teuchos::rcp(new Stokhos::TotalOrderBasis<int,double>(
		       bases, drop));
    else if (prod_basis_type == SMOLYAK) {
      Stokhos::TotalOrderIndexSet<int> index_set(d, p);
      basis = 
	Teuchos::rcp(new Stokhos::SmolyakBasis<int,double>(
		       bases, index_set, drop));
    }

    // Triple product tensor
    typedef Stokhos::Sparse3Tensor<int,double> Cijk_type;
    RCP<Cijk_type> Cijk;
    if (full)
      Cijk = basis->computeTripleProductTensor();
    else
      Cijk = basis->computeLinearTripleProductTensor();

    int sz = basis->size();
    std::cout << "basis size = " << sz
	      << " num nonzero Cijk entries = " << Cijk->num_entries() 
	      << std::endl;

    // Setup tiles
    if (tile_size > sz)
      tile_size = sz;
    int j_sz = sz;
    int k_sz = sz;
    if (!full)
      k_sz = basis->dimension()+1;
    int nj_tiles = j_sz / tile_size;
    int nk_tiles = k_sz / tile_size;
    if (j_sz - nj_tiles*tile_size > 0)
      ++nj_tiles;
    if (k_sz - nk_tiles*tile_size > 0)
      ++nk_tiles;
    Array<CijkNonzeros> nz(sz);
    for (int i=0; i<sz; ++i) {
      nz[i].i = i;
      nz[i].nz_tiles.resize(nj_tiles);
      for (int j=0; j<nj_tiles; ++j)
	nz[i].nz_tiles[j].resize(nk_tiles);
    }

    // Get number of nonzeros in Cijk for each i
    Cijk_type::k_iterator k_begin = Cijk->k_begin();
    Cijk_type::k_iterator k_end = Cijk->k_end();
    for (Cijk_type::k_iterator k_it=k_begin; k_it!=k_end; ++k_it) {
      int k = index(k_it);
      int k_tile = k / tile_size;
      Cijk_type::kj_iterator j_begin = Cijk->j_begin(k_it);
      Cijk_type::kj_iterator j_end = Cijk->j_end(k_it);
      for (Cijk_type::kj_iterator j_it = j_begin; j_it != j_end; ++j_it) {
	int j = index(j_it);
	int j_tile = j / tile_size;
	Cijk_type::kji_iterator i_begin = Cijk->i_begin(j_it);
	Cijk_type::kji_iterator i_end = Cijk->i_end(j_it);
	for (Cijk_type::kji_iterator i_it = i_begin; i_it != i_end; ++i_it) {
	  int i = index(i_it);
	  ++nz[i].total_nz;
	  ++nz[i].nz_tiles[j_tile][k_tile];
	}
      }
    }

    // Sort based on total number of nonzeros
    std::sort(nz.begin(), nz.end(), NZCompare());
    
    // Print nonzeros
    int w_index = 3;
    int w_nz = 5;
    int w_tile = 4;
    for (int i=0; i<nz.size(); ++i) {
      int idx = nz[i].i;
      std::cout << std::setw(w_index) << idx << " " 
		<< basis->term(idx) << ": " 
		<< std::setw(w_nz) << nz[i].total_nz
		<< ", ";
      for (int j=0; j<nj_tiles; ++j)
	for (int k=0; k<nk_tiles; ++k)
	  std::cout << std::setw(w_tile) << nz[i].nz_tiles[j][k] << " ";
      std::cout << std::endl;
    }

    // Add up the nonzeros for each (j,k) tile
    Array< Array<int> > total_nz_tiles(nj_tiles);
    int total_nz = 0;
    for (int j=0; j<nj_tiles; ++j)
      total_nz_tiles[j].resize(nk_tiles);
    for (int i=0; i<nz.size(); ++i) {
      total_nz += nz[i].total_nz;
      for (int j=0; j<nj_tiles; ++j)
	for (int k=0; k<nk_tiles; ++k)
	  total_nz_tiles[j][k] += nz[i].nz_tiles[j][k];
    }
    int w_total = (w_index+1) + (2*basis->dimension()+5) + w_nz;
    std::cout << std::endl << std::setw(w_total) << total_nz << ", ";
    for (int j=0; j<nj_tiles; ++j)
      for (int k=0; k<nk_tiles; ++k)
	std::cout << std::setw(w_tile) << total_nz_tiles[j][k] << " ";
    std::cout << std::endl;

    // Now partition Cijk for each tile
    Array< Array< RCP<Cijk_type> > > Cijk_tile(nj_tiles);
    for (int j=0; j<nj_tiles; ++j) {
      Cijk_tile[j].resize(nk_tiles);
      for (int k=0; k<nk_tiles; ++k)
	Cijk_tile[j][k] = rcp(new Cijk_type);
    }
    for (Cijk_type::k_iterator k_it=k_begin; k_it!=k_end; ++k_it) {
      int k = index(k_it);
      int k_tile = k / tile_size;
      Cijk_type::kj_iterator j_begin = Cijk->j_begin(k_it);
      Cijk_type::kj_iterator j_end = Cijk->j_end(k_it);
      for (Cijk_type::kj_iterator j_it = j_begin; j_it != j_end; ++j_it) {
	int j = index(j_it);
	int j_tile = j / tile_size;
	Cijk_type::kji_iterator i_begin = Cijk->i_begin(j_it);
	Cijk_type::kji_iterator i_end = Cijk->i_end(j_it);
	for (Cijk_type::kji_iterator i_it = i_begin; i_it != i_end; ++i_it) {
	  int i = index(i_it);
	  double c = value(i_it);
	  Cijk_tile[j_tile][k_tile]->add_term(i,j,k,c);
	}
      }
    }
    for (int j=0; j<nj_tiles; ++j)
      for (int k=0; k<nk_tiles; ++k)
	Cijk_tile[j][k]->fillComplete();

    
    Array< Array< std::map<int,int> > > nz_tile(nj_tiles);
    Array< Array< Array< std::pair<int,int> > > > sorted_nz_tile(nj_tiles);
    for (int j_tile=0; j_tile<nj_tiles; ++j_tile) {
      nz_tile[j_tile].resize(nk_tiles); 
      sorted_nz_tile[j_tile].resize(nk_tiles); 
      for (int k_tile=0; k_tile<nk_tiles; ++k_tile) {

	// Count nonzeros for each i, for each tile
	Cijk_type::k_iterator k_begin = Cijk_tile[j_tile][k_tile]->k_begin();
	Cijk_type::k_iterator k_end = Cijk_tile[j_tile][k_tile]->k_end();
	for (Cijk_type::k_iterator k_it=k_begin; k_it!=k_end; ++k_it) {
	  //int k = index(k_it);
	  Cijk_type::kj_iterator j_begin = 
	    Cijk_tile[j_tile][k_tile]->j_begin(k_it);
	  Cijk_type::kj_iterator j_end = 
	    Cijk_tile[j_tile][k_tile]->j_end(k_it);
	  for (Cijk_type::kj_iterator j_it = j_begin; j_it != j_end; ++j_it) {
	    //int j = index(j_it);
	    Cijk_type::kji_iterator i_begin = 
	      Cijk_tile[j_tile][k_tile]->i_begin(j_it);
	    Cijk_type::kji_iterator i_end = 
	      Cijk_tile[j_tile][k_tile]->i_end(j_it);
	    for (Cijk_type::kji_iterator i_it = i_begin; i_it != i_end; ++i_it){
	      int i = index(i_it);
	      if (nz_tile[j_tile][k_tile].count(i) == 0)
		nz_tile[j_tile][k_tile][i] = 1;
	      else
		++(nz_tile[j_tile][k_tile][i]);
	    }
	  }
	}

	// Sort based on non-zeros for each i, for each tile
	sorted_nz_tile[j_tile][k_tile].resize(nz_tile[j_tile][k_tile].size());
	int idx=0;
	for (std::map<int,int>::iterator it = nz_tile[j_tile][k_tile].begin();
	     it != nz_tile[j_tile][k_tile].end(); ++it) {
	  sorted_nz_tile[j_tile][k_tile][idx] = 
	    std::make_pair(it->first, it->second);
	  ++idx;
	}
	std::sort( sorted_nz_tile[j_tile][k_tile].begin(),
		   sorted_nz_tile[j_tile][k_tile].end(),
		   NZPairCompare() );

	// Print number of non-zeros for each i, for each tile
	std::cout << std::endl 
		  << "Tile (" << j_tile << ", " << k_tile << "):" << std::endl;
	for (int i=0; i<sorted_nz_tile[j_tile][k_tile].size(); ++i) {
	  int idx = sorted_nz_tile[j_tile][k_tile][i].first;
	  std::cout << std::setw(w_index) << idx << " " 
		    << basis->term(idx) << ": " 
		    << std::setw(w_nz) << sorted_nz_tile[j_tile][k_tile][i].second
		    << std::endl;
	  if (i % 32 == 31)
	    std::cout << std::endl;
	}
      }
    }
    
  }
  catch (std::exception& e) {
    std::cout << e.what() << std::endl;
  }

  return 0;
}
int main (int argc, char *argv[]) {

  Teuchos::CommandLineProcessor clp;
  clp.setDocString("This example program measure the performance of Chol algorithms on Kokkos::Threads execution space.\n");

  int nthreads = 1;
  clp.setOption("nthreads", &nthreads, "Number of threads");

  int max_task_dependence = 10;
  clp.setOption("max-task-dependence", &max_task_dependence, "Max number of task dependence");

  int team_size = 1;
  clp.setOption("team-size", &team_size, "Team size");

  int fill_level = 0;
  clp.setOption("fill-level", &fill_level, "Fill level");

  bool team_interface = true;
  clp.setOption("enable-team-interface", "disable-team-interface",
                &team_interface, "Flag for team interface");

  bool mkl_interface = false;
  clp.setOption("enable-mkl-interface", "disable-mkl-interface",
                &mkl_interface, "Flag for MKL interface");

  int stack_size = 8192;
  clp.setOption("stack-size", &stack_size, "Stack size");

  bool verbose = false;
  clp.setOption("enable-verbose", "disable-verbose", &verbose, "Flag for verbose printing");

  string file_input = "test.mtx";
  clp.setOption("file-input", &file_input, "Input file (MatrixMarket SPD matrix)");

  int treecut = 15;
  clp.setOption("treecut", &treecut, "Level to cut tree from bottom");

  int minblksize = 0;
  clp.setOption("minblksize", &minblksize, "Minimum block size for internal reordering");

  int prunecut = 0;
  clp.setOption("prunecut", &prunecut, "Leve to prune tree from bottom");

  int seed = 0;
  clp.setOption("seed", &seed, "Seed for random number generator in graph partition");

  int niter = 10;
  clp.setOption("niter", &niter, "Number of iterations for testing");

  clp.recogniseAllOptions(true);
  clp.throwExceptions(false);

  Teuchos::CommandLineProcessor::EParseCommandLineReturn r_parse= clp.parse( argc, argv );

  if (r_parse == Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED) return 0;
  if (r_parse != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL  ) return -1;
  
  int r_val = 0;
  {
    const bool overwrite = true;
    const int nshepherds = (team_interface ? nthreads/team_size : nthreads);
    const int nworker_per_shepherd = nthreads/nshepherds;

    setenv("QT_HWPAR",                    to_string(nthreads).c_str(),             overwrite);
    setenv("QT_NUM_SHEPHERDS",            to_string(nshepherds).c_str(),           overwrite);
    setenv("QT_NUM_WORKERS_PER_SHEPHERD", to_string(nworker_per_shepherd).c_str(), overwrite);
    setenv("QT_STACK_SIZE",               to_string(stack_size).c_str(),           overwrite);

    exec_space::initialize(nthreads);
    exec_space::print_configuration(cout, true);
    
    r_val = exampleCholPerformance
      <value_type,ordinal_type,size_type,exec_space,void>
      (file_input, 
       treecut,
       minblksize,
       prunecut,
       seed,
       niter, 
       nthreads, 
       max_task_dependence, 
       team_size, 
       fill_level,
       nshepherds,
       team_interface, 
       (nthreads != 1), 
       mkl_interface,
       verbose);

    exec_space::finalize();

    unsetenv("QT_HWPAR");
    unsetenv("QT_NUM_SHEPHERDS");
    unsetenv("QT_NUM_WORKERS_PER_SHEPHERD");
    unsetenv("QT_STACK_SIZE");
  }

  return r_val;
}
Example #28
0
int main(int argc, char* argv[]) {
  int ierr = 0;

  try {
    double t, tb;
    int p = 2;
    int w = p+7;

    // Set up command line options
    Teuchos::CommandLineProcessor clp;
    clp.setDocString("This program tests the speed of differentiating BLAS routines using Fad");
    int m = 10;
    clp.setOption("m", &m, "Number of rows");
    int n = 10;
    clp.setOption("n", &n, "Number of columns");
    int k = 10;
    clp.setOption("k", &k, "Number of columns for GEMM");
    int ndot = 10;
    clp.setOption("ndot", &ndot, "Number of derivative components");
    int nloop = 100000;
    clp.setOption("nloop", &nloop, "Number of loops");
    int dynamic = 1;
    clp.setOption("dynamic", &dynamic, "Use dynamic allocation");

    // Parse options
    Teuchos::CommandLineProcessor::EParseCommandLineReturn
      parseReturn= clp.parse(argc, argv);
    if(parseReturn != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL)
      return 1;
    bool use_dynamic = (dynamic != 0);

    std::cout.setf(std::ios::scientific);
    std::cout.precision(p);
    std::cout << "Times (sec) for m = " << m << ", n = " << n 
	      << ", ndot = " << ndot << ", nloop =  " << nloop 
	      << ", dynamic = " << use_dynamic << ":  " 
	      << std::endl;

    tb = do_time_teuchos_double_gemm(m,n,k,nloop);
    std::cout << "GEMM:                 " << std::setw(w) << tb << std::endl;

    t = do_time_sacado_fad_gemm< Sacado::Fad::DVFad<double> >(m,n,k,ndot,nloop,use_dynamic);
    std::cout << "Sacado DVFad GEMM:    " << std::setw(w) << t << "\t" 
	      << std::setw(w) << t/tb << std::endl;

    t = do_time_sacado_fad_gemm< Sacado::Fad::DFad<double> >(m,n,k,ndot,nloop,use_dynamic);
    std::cout << "Sacado DFad GEMM:     " << std::setw(w) << t << "\t" 
	      << std::setw(w) << t/tb << std::endl;
    
    t = do_time_teuchos_fad_gemm< Sacado::Fad::DFad<double> >(m,n,k,ndot,nloop);
    std::cout << "Teuchos DFad GEMM:    " << std::setw(w) << t << "\t" 
	      << std::setw(w) << t/tb << std::endl;

    // t = do_time_teuchos_fad_gemm< Sacado::ELRFad::DFad<double> >(m,n,k,ndot,nloop);
    // std::cout << "Teuchos ELRDFad GEMM:  " << std::setw(w) << t << "\t" 
    // 	      << std::setw(w) << t/tb << std::endl;

    t = do_time_teuchos_fad_gemm< Sacado::Fad::DVFad<double> >(m,n,k,ndot,nloop);
    std::cout << "Teuchos DVFad GEMM:   " << std::setw(w) << t << "\t" 
    	      << std::setw(w) << t/tb << std::endl;
    
    std::cout << std::endl;

    tb = do_time_teuchos_double_gemv(m,n,nloop);
    std::cout << "GEMV:                 " << std::setw(w) << tb << std::endl;

    t = do_time_sacado_fad_gemv< Sacado::Fad::DVFad<double> >(m,n,ndot,nloop*10,use_dynamic);
    std::cout << "Sacado DVFad GEMV:    " << std::setw(w) << t << "\t" 
	      << std::setw(w) << t/tb << std::endl;

    t = do_time_sacado_fad_gemv< Sacado::Fad::DFad<double> >(m,n,ndot,nloop*10,use_dynamic);
    std::cout << "Sacado DFad GEMV:     " << std::setw(w) << t << "\t" 
	      << std::setw(w) << t/tb << std::endl;
    
    t = do_time_teuchos_fad_gemv< Sacado::Fad::DFad<double> >(m,n,ndot,nloop*10);
    std::cout << "Teuchos DFad GEMV:    " << std::setw(w) << t << "\t" 
	      << std::setw(w) << t/tb << std::endl;

    // t = do_time_teuchos_fad_gemv< Sacado::ELRFad::DFad<double> >(m,n,ndot,nloop*10);
    // std::cout << "Teuchos ELRDFad GEMV:  " << std::setw(w) << t << "\t" 
    // 	      << std::setw(w) << t/tb << std::endl;

    t = do_time_teuchos_fad_gemv< Sacado::Fad::DVFad<double> >(m,n,ndot,nloop*10);
    std::cout << "Teuchos DVFad GEMV:   " << std::setw(w) << t << "\t" 
    	      << std::setw(w) << t/tb << std::endl;
    
    std::cout << std::endl;

    tb = do_time_teuchos_double_dot(m,nloop*100);
    std::cout << "DOT:                  " << std::setw(w) << tb << std::endl;

    t = do_time_sacado_fad_dot< Sacado::Fad::DVFad<double> >(m,ndot,nloop*100,use_dynamic);
    std::cout << "Sacado DVFad DOT:     " << std::setw(w) << t << "\t" 
	      << std::setw(w) << t/tb << std::endl;

    t = do_time_sacado_fad_dot< Sacado::Fad::DFad<double> >(m,ndot,nloop*100,use_dynamic);
    std::cout << "Sacado DFad DOT:      " << std::setw(w) << t << "\t" 
	      << std::setw(w) << t/tb << std::endl;
    
    t = do_time_teuchos_fad_dot< Sacado::Fad::DFad<double> >(m,ndot,nloop*100);
    std::cout << "Teuchos DFad DOT:     " << std::setw(w) << t << "\t" 
	      << std::setw(w) << t/tb << std::endl;

    // t = do_time_teuchos_fad_dot< Sacado::ELRFad::DFad<double> >(m,ndot,nloop*100);
    // std::cout << "Teuchos ELRDFad DOT:  " << std::setw(w) << t << "\t" 
    // 	      << std::setw(w) << t/tb << std::endl;

    t = do_time_teuchos_fad_dot< Sacado::Fad::DVFad<double> >(m,ndot,nloop*100);
    std::cout << "Teuchos DVFad DOT:    " << std::setw(w) << t << "\t" 
    	      << std::setw(w) << t/tb << std::endl;
    
  }
  catch (std::exception& e) {
    std::cout << e.what() << std::endl;
    ierr = 1;
  }
  catch (const char *s) {
    std::cout << s << std::endl;
    ierr = 1;
  }
  catch (...) {
    std::cout << "Caught unknown exception!" << std::endl;
    ierr = 1;
  }

  return ierr;
}
Example #29
0
int main(int argc, char* argv[]) {
  int ierr = 0;

  try {
    double t, ta, tr;
    int p = 2;
    int w = p+7;

    // Maximum number of derivative components for SLFad
    const int slfad_max = 130;

    // Set up command line options
    Teuchos::CommandLineProcessor clp;
    clp.setDocString("This program tests the speed of various forward mode AD implementations for a finite-element-like Jacobian fill");
    int num_nodes = 100000;
    int num_eqns = 2;
    int rt = 0;
    clp.setOption("n", &num_nodes, "Number of nodes");
    clp.setOption("p", &num_eqns, "Number of equations");
    clp.setOption("rt", &rt, "Include ADOL-C retaping test");

    // Parse options
    Teuchos::CommandLineProcessor::EParseCommandLineReturn
      parseReturn= clp.parse(argc, argv);
    if(parseReturn != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL)
      return 1;

    double mesh_spacing = 1.0 / static_cast<double>(num_nodes - 1);

    // Memory pool & manager
    Sacado::Fad::MemPoolManager<double> poolManager(num_nodes*num_eqns);
    Sacado::Fad::MemPool* pool = poolManager.getMemoryPool(num_nodes*num_eqns);
    Sacado::Fad::DMFad<double>::setDefaultPool(pool);

    std::cout.setf(std::ios::scientific);
    std::cout.precision(p);
    std::cout << "num_nodes =  " << num_nodes
        << ", num_eqns = " << num_eqns << ":  " << std::endl
        << "               " << "   Time   " << "\t"<< "Time/Analytic" << "\t"
        << "Time/(2*p*Residual)" << std::endl;

    ta = 1.0;
    tr = 1.0;

    tr = residual_fill(num_nodes, num_eqns, mesh_spacing);

    ta = analytic_jac_fill(num_nodes, num_eqns, mesh_spacing);
    std::cout << "Analytic:      " << std::setw(w) << ta << "\t" << std::setw(w) << ta/ta << "\t" << std::setw(w) << ta/(2.0*num_eqns*tr) << std::endl;

#ifdef HAVE_ADOLC
#ifndef ADOLC_TAPELESS
    t = adolc_jac_fill(num_nodes, num_eqns, mesh_spacing);
    std::cout << "ADOL-C:        " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl;

    if (rt != 0) {
      t = adolc_retape_jac_fill(num_nodes, num_eqns, mesh_spacing);
      std::cout << "ADOL-C(rt):  " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl;
    }

#else
    t = adolc_tapeless_jac_fill(num_nodes, num_eqns, mesh_spacing);
    std::cout << "ADOL-C(tl):    " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl;
#endif
#endif

#ifdef HAVE_ADIC
    t = adic_jac_fill(num_nodes, num_eqns, mesh_spacing);
    std::cout << "ADIC:          " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl;
#endif

    if (num_eqns*2 == 4) {
      t = fad_jac_fill< FAD::TFad<16,double> >(num_nodes, num_eqns, mesh_spacing);
      std::cout << "TFad:          " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl;
    }
    else if (num_eqns*2 == 16) {
      t = fad_jac_fill< FAD::TFad<16,double> >(num_nodes, num_eqns, mesh_spacing);
      std::cout << "TFad:          " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl;
    }
    else if (num_eqns*2 == 32) {
      t = fad_jac_fill< FAD::TFad<32,double> >(num_nodes, num_eqns, mesh_spacing);
      std::cout << "TFad:          " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl;
    }
    else if (num_eqns*2 == 64) {
      t = fad_jac_fill< FAD::TFad<64,double> >(num_nodes, num_eqns, mesh_spacing);
      std::cout << "TFad:          " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl;
    }

    t = fad_jac_fill< FAD::Fad<double> >(num_nodes, num_eqns, mesh_spacing);
    std::cout << "Fad:           " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl;

    if (num_eqns*2 == 4) {
      t = fad_jac_fill< Sacado::Fad::SFad<double,4> >(num_nodes, num_eqns, mesh_spacing);
      std::cout << "SFad:          " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl;
    }
    else if (num_eqns*2 == 16) {
      t = fad_jac_fill< Sacado::Fad::SFad<double,16> >(num_nodes, num_eqns, mesh_spacing);
      std::cout << "SFad:          " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl;
    }
    else if (num_eqns*2 == 32) {
      t = fad_jac_fill< Sacado::Fad::SFad<double,32> >(num_nodes, num_eqns, mesh_spacing);
      std::cout << "SFad:          " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl;
    }
    else if (num_eqns*2 == 64) {
      t = fad_jac_fill< Sacado::Fad::SFad<double,64> >(num_nodes, num_eqns, mesh_spacing);
      std::cout << "SFad:          " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl;
    }

    if (num_eqns*2 < slfad_max) {
      t = fad_jac_fill< Sacado::Fad::SLFad<double,slfad_max> >(num_nodes, num_eqns, mesh_spacing);
      std::cout << "SLFad:         " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl;
    }

    t = fad_jac_fill< Sacado::Fad::DFad<double> >(num_nodes, num_eqns, mesh_spacing);
    std::cout << "DFad:          " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl;

    t = fad_jac_fill< Sacado::Fad::SimpleFad<double> >(num_nodes, num_eqns, mesh_spacing);
    std::cout << "SimpleFad:     " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl;

    t = fad_jac_fill< Sacado::Fad::DMFad<double> >(num_nodes, num_eqns, mesh_spacing);
    std::cout << "DMFad:         " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl;

    if (num_eqns*2 == 4) {
      t = fad_jac_fill< Sacado::ELRFad::SFad<double,4> >(num_nodes, num_eqns, mesh_spacing);
      std::cout << "ELRSFad:       " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl;
    }
    else if (num_eqns*2 == 16) {
      t = fad_jac_fill< Sacado::ELRFad::SFad<double,16> >(num_nodes, num_eqns, mesh_spacing);
      std::cout << "ELRSFad:       " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl;
    }
    else if (num_eqns*2 == 32) {
      t = fad_jac_fill< Sacado::ELRFad::SFad<double,32> >(num_nodes, num_eqns, mesh_spacing);
      std::cout << "ELRSFad:       " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl;
    }
    else if (num_eqns*2 == 64) {
      t = fad_jac_fill< Sacado::ELRFad::SFad<double,64> >(num_nodes, num_eqns, mesh_spacing);
      std::cout << "ELRSFad:       " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl;
    }

    if (num_eqns*2 < slfad_max) {
      t = fad_jac_fill< Sacado::ELRFad::SLFad<double,slfad_max> >(num_nodes, num_eqns, mesh_spacing);
      std::cout << "ELRSLFad:      " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl;
    }

    t = fad_jac_fill< Sacado::ELRFad::DFad<double> >(num_nodes, num_eqns, mesh_spacing);
    std::cout << "ELRDFad:       " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl;

    if (num_eqns*2 == 4) {
      t = fad_jac_fill< Sacado::CacheFad::SFad<double,4> >(num_nodes, num_eqns, mesh_spacing);
      std::cout << "CacheSFad:     " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl;
    }
    else if (num_eqns*2 == 16) {
      t = fad_jac_fill< Sacado::CacheFad::SFad<double,16> >(num_nodes, num_eqns, mesh_spacing);
      std::cout << "CacheSFad:     " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl;
    }
    else if (num_eqns*2 == 32) {
      t = fad_jac_fill< Sacado::CacheFad::SFad<double,32> >(num_nodes, num_eqns, mesh_spacing);
      std::cout << "CacheSFad:     " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl;
    }
    else if (num_eqns*2 == 64) {
      t = fad_jac_fill< Sacado::CacheFad::SFad<double,64> >(num_nodes, num_eqns, mesh_spacing);
      std::cout << "CacheSFad:     " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl;
    }

    if (num_eqns*2 < slfad_max) {
      t = fad_jac_fill< Sacado::CacheFad::SLFad<double,slfad_max> >(num_nodes, num_eqns, mesh_spacing);
      std::cout << "CacheSLFad:    " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl;
    }

    t = fad_jac_fill< Sacado::CacheFad::DFad<double> >(num_nodes, num_eqns, mesh_spacing);
    std::cout << "CacheFad:      " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl;

    if (num_eqns*2 == 4) {
      t = fad_jac_fill< Sacado::ELRCacheFad::SFad<double,4> >(num_nodes, num_eqns, mesh_spacing);
      std::cout << "ELRCacheSFad:  " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl;
    }
    else if (num_eqns*2 == 16) {
      t = fad_jac_fill< Sacado::ELRCacheFad::SFad<double,16> >(num_nodes, num_eqns, mesh_spacing);
      std::cout << "ELRCacheSFad:  " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl;
    }
    else if (num_eqns*2 == 32) {
      t = fad_jac_fill< Sacado::ELRCacheFad::SFad<double,32> >(num_nodes, num_eqns, mesh_spacing);
      std::cout << "ELRCacheSFad:  " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl;
    }
    else if (num_eqns*2 == 64) {
      t = fad_jac_fill< Sacado::ELRCacheFad::SFad<double,64> >(num_nodes, num_eqns, mesh_spacing);
      std::cout << "ELRCacheSFad:  " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl;
    }

    if (num_eqns*2 < slfad_max) {
      t = fad_jac_fill< Sacado::ELRCacheFad::SLFad<double,slfad_max> >(num_nodes, num_eqns, mesh_spacing);
      std::cout << "ELRCacheSLFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl;
    }

    t = fad_jac_fill< Sacado::ELRCacheFad::DFad<double> >(num_nodes, num_eqns, mesh_spacing);
    std::cout << "ELRCacheFad:   " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl;

    t = fad_jac_fill< Sacado::Fad::DVFad<double> >(num_nodes, num_eqns, mesh_spacing);
    std::cout << "DVFad:         " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl;

  }
  catch (std::exception& e) {
    std::cout << e.what() << std::endl;
    ierr = 1;
  }
  catch (const char *s) {
    std::cout << s << std::endl;
    ierr = 1;
  }
  catch (...) {
    std::cout << "Caught unknown exception!" << std::endl;
    ierr = 1;
  }

  return ierr;
}
int main(int argc, char **argv)
{
  // Typename of Polynomial Chaos scalar type
  typedef Stokhos::StandardStorage<int,double> pce_storage_type;
  typedef Sacado::ETPCE::OrthogPoly<double, pce_storage_type> pce_type;

  // Typename of ensemble scalar type
  const int EnsembleSize = 8;
  typedef Stokhos::StaticFixedStorage<int,double,EnsembleSize,Kokkos::DefaultExecutionSpace> ensemble_storage_type;
  typedef Sacado::MP::Vector<ensemble_storage_type> ensemble_type;

  // Short-hand for several classes used below
  using Teuchos::Array;
  using Teuchos::RCP;
  using Teuchos::rcp;
  using Stokhos::OneDOrthogPolyBasis;
  using Stokhos::HermiteBasis;
  using Stokhos::LegendreBasis;
  using Stokhos::CompletePolynomialBasis;
  using Stokhos::Quadrature;
  using Stokhos::TotalOrderIndexSet;
  using Stokhos::SmolyakSparseGridQuadrature;
  using Stokhos::TensorProductQuadrature;
  using Stokhos::Sparse3Tensor;
  using Stokhos::QuadOrthogPolyExpansion;

  try {

    // Setup command line options
    Teuchos::CommandLineProcessor CLP;
    CLP.setDocString(
      "This example computes the PC expansion of a simple function.\n");
    int p = 4;
    CLP.setOption("order", &p, "Polynomial order");
    bool sparse = false;
    CLP.setOption("sparse", "tensor", &sparse,
                  "Use sparse grid or tensor product quadrature");

    // Parse arguments
    CLP.parse( argc, argv );

    // Basis of dimension 3, order given by command-line option
    const int d = 3;
    Array< RCP<const OneDOrthogPolyBasis<int,double> > > bases(d);
    for (int i=0; i<d; i++) {
      bases[i] = rcp(new HermiteBasis<int,double>(p, true));
    }
    RCP<const CompletePolynomialBasis<int,double> > basis =
      rcp(new CompletePolynomialBasis<int,double>(bases));
    const int pce_size = basis->size();
    std::cout << "basis size = " << pce_size << std::endl;

    // Quadrature method
    RCP<const Quadrature<int,double> > quad;
    if (sparse) {
      const TotalOrderIndexSet<int> index_set(d, p);
      quad = rcp(new SmolyakSparseGridQuadrature<int,double>(basis, index_set));
    }
    else {
      quad = rcp(new TensorProductQuadrature<int,double>(basis));
    }
    std::cout << "quadrature size = " << quad->size() << std::endl;

    // Triple product tensor
    RCP<Sparse3Tensor<int,double> > Cijk =
      basis->computeTripleProductTensor();

    // Expansion method
    RCP<QuadOrthogPolyExpansion<int,double> > expn =
      rcp(new QuadOrthogPolyExpansion<int,double>(basis, Cijk, quad));

    // Polynomial expansion of u (note:  these are coefficients in the
    // normalized basis)
    pce_type u(expn);
    u.term(0,0) = 1.0;     // zeroth order term
    u.term(0,1) = 0.1;     // first order term for dimension 0
    u.term(1,1) = 0.05;    // first order term for dimension 1
    u.term(2,1) = 0.01;    // first order term for dimension 2

    //
    // Compute PCE expansion of function using NISP with ensemble propagation
    //

    // Extract quadrature data
    const int num_quad_points                 = quad->size();
    const Array<double>& quad_weights         = quad->getQuadWeights();
    const Array< Array<double> >& quad_points = quad->getQuadPoints();
    const Array< Array<double> >& quad_values = quad->getBasisAtQuadPoints();

    // Loop over quadrature points in blocks of size EnsembleSize
    pce_type v(expn);
    ensemble_type u_ensemble;
    for (int qp_block=0; qp_block<num_quad_points; qp_block+=EnsembleSize) {
      const int qp_sz = qp_block+EnsembleSize <= num_quad_points ?
        EnsembleSize : num_quad_points-qp_block;

      // Evaluate u at each quadrature point
      for (int qp=0; qp<qp_sz; ++qp)
        u_ensemble.fastAccessCoeff(qp) =
          u.evaluate(quad_points[qp_block+qp], quad_values[qp_block+qp]);
      for (int qp=qp_sz; qp<EnsembleSize; ++qp)
        u_ensemble.fastAccessCoeff(qp) = u_ensemble.fastAccessCoeff(qp_sz-1);

      // Evaluate function at each quadrature point
      ensemble_type v_ensemble = simple_function(u_ensemble);

      // Sum results into PCE integral
      for (int pc=0; pc<pce_size; ++pc)
        for (int qp=0; qp<qp_sz; ++qp)
          v.fastAccessCoeff(pc) += v_ensemble.fastAccessCoeff(qp)*quad_weights[qp_block+qp]*quad_values[qp_block+qp][pc];
    }

    /*
    for (int qp=0; qp<num_quad_points; ++qp) {
      double u_qp = u.evaluate(quad_points[qp]);
      double v_qp = simple_function(u_qp);
      double w = quad_weights[qp];
      for (int pc=0; pc<pce_size; ++pc)
        v.fastAccessCoeff(pc) += v_qp*w*quad_values[qp][pc];
    }
    */

    // Print u and v
    std::cout << "\tu = ";
    u.print(std::cout);
    std::cout << "\tv = ";
    v.print(std::cout);

    // Compute moments
    double mean = v.mean();
    double std_dev = v.standard_deviation();

    // Evaluate PCE and function at a point = 0.25 in each dimension
    Teuchos::Array<double> pt(d);
    for (int i=0; i<d; i++)
      pt[i] = 0.25;
    double up = u.evaluate(pt);
    double vp = simple_function(up);
    double vp2 = v.evaluate(pt);

    // Print results
    std::cout << "\tv mean         = " << mean << std::endl;
    std::cout << "\tv std. dev.    = " << std_dev << std::endl;
    std::cout << "\tv(0.25) (true) = " << vp << std::endl;
    std::cout << "\tv(0.25) (pce)  = " << vp2 << std::endl;

     // Check the answer
    if (std::abs(vp - vp2) < 1e-2)
      std::cout << "\nExample Passed!" << std::endl;
  }
  catch (std::exception& e) {
    std::cout << e.what() << std::endl;
  }
}