int main (int argc, char *argv[]) { Teuchos::CommandLineProcessor clp; clp.setDocString("Tacho::DenseMatrixBase examples on Pthreads execution space.\n"); int nthreads = 0; clp.setOption("nthreads", &nthreads, "Number of threads"); int numa = 0; clp.setOption("numa", &numa, "Number of numa node"); int core_per_numa = 0; clp.setOption("core-per-numa", &core_per_numa, "Number of cores per numa node"); bool verbose = false; clp.setOption("enable-verbose", "disable-verbose", &verbose, "Flag for verbose printing"); std::string file_input = "test.mtx"; clp.setOption("file-input", &file_input, "Input file (MatrixMarket SPD matrix)"); int treecut = 0; clp.setOption("treecut", &treecut, "Level to cut tree from bottom"); int prunecut = 0; clp.setOption("prunecut", &prunecut, "Level to prune tree from bottom"); clp.recogniseAllOptions(true); clp.throwExceptions(false); Teuchos::CommandLineProcessor::EParseCommandLineReturn r_parse= clp.parse( argc, argv ); if (r_parse == Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED) return 0; if (r_parse != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL ) return -1; int r_val = 0; { exec_space::initialize(nthreads, numa, core_per_numa); #if (defined(HAVE_SHYLUTACHO_SCOTCH) && (defined(HAVE_SHYLUTACHO_CHOLMOD) \ || defined(HAVE_SHYLUTACHO_AMESOS))) r_val = exampleGraphTools<exec_space> (file_input, treecut, prunecut, verbose); #else r_val = -1; std::cout << "Scotch or Cholmod is NOT configured in Trilinos" << std::endl; #endif exec_space::finalize(); } return r_val; }
int main (int argc, char *argv[]) { Teuchos::CommandLineProcessor clp; clp.setDocString("Tacho::DenseMatrixBase examples on Pthreads execution space.\n"); int nthreads = 0; clp.setOption("nthreads", &nthreads, "Number of threads"); int numa = 0; clp.setOption("numa", &numa, "Number of numa node"); int core_per_numa = 0; clp.setOption("core-per-numa", &core_per_numa, "Number of cores per numa node"); bool verbose = false; clp.setOption("enable-verbose", "disable-verbose", &verbose, "Flag for verbose printing"); int mmin = 1000; clp.setOption("mmin", &mmin, "C(mmin,mmin)"); int mmax = 8000; clp.setOption("mmax", &mmax, "C(mmax,mmax)"); int minc = 1000; clp.setOption("minc", &minc, "Increment of m"); clp.recogniseAllOptions(true); clp.throwExceptions(false); Teuchos::CommandLineProcessor::EParseCommandLineReturn r_parse= clp.parse( argc, argv ); if (r_parse == Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED) return 0; if (r_parse != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL ) return -1; int r_val = 0; { exec_space::initialize(); host_space::initialize(nthreads, numa, core_per_numa); r_val = exampleDenseMatrixBase<exec_space> (mmin, mmax, minc, verbose); exec_space::finalize(); host_space::finalize(); } return r_val; }
int main (int argc, char *argv[]) { Teuchos::CommandLineProcessor clp; clp.setDocString("This example interface of solver Kokkos::Threads execution space.\n"); int nthreads = 1; clp.setOption("nthreads", &nthreads, "Number of threads"); int numa = 0; clp.setOption("numa", &numa, "Number of numa node"); int core_per_numa = 0; clp.setOption("core-per-numa", &core_per_numa, "Number of cores per numa node"); bool verbose = false; clp.setOption("enable-verbose", "disable-verbose", &verbose, "Flag for verbose printing"); string file_input = "test.mtx"; clp.setOption("file-input", &file_input, "Input file (MatrixMarket SPD matrix)"); int nrhs = 1; clp.setOption("nrhs", &nrhs, "Numer of right hand side"); clp.recogniseAllOptions(true); clp.throwExceptions(false); Teuchos::CommandLineProcessor::EParseCommandLineReturn r_parse= clp.parse( argc, argv ); if (r_parse == Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED) return 0; if (r_parse != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL ) return -1; int r_val = 0; { exec_space::initialize(nthreads, numa, core_per_numa); exec_space::print_configuration(cout, true); r_val = exampleCholDirectSolver <value_type,ordinal_type,size_type,exec_space,void> (file_input, nrhs, nthreads, verbose); exec_space::finalize(); } return r_val; }
int main (int argc, char *argv[]) { Teuchos::CommandLineProcessor clp; clp.setDocString("This example program demonstrates TriSolveUnblocked algorithm on Kokkos::Serial execution space.\n"); int nthreads = 1; clp.setOption("nthreads", &nthreads, "Number of threads"); int max_task_dependence = 10; clp.setOption("max-task-dependence", &max_task_dependence, "Max number of task dependence"); int team_size = 1; clp.setOption("team-size", &team_size, "Team size"); bool verbose = false; clp.setOption("enable-verbose", "disable-verbose", &verbose, "Flag for verbose printing"); string file_input = "test.mtx"; clp.setOption("file-input", &file_input, "Input file (MatrixMarket SPD matrix)"); int nrhs = 1; clp.setOption("nrhs", &nrhs, "Number of right hand side"); int nb = nrhs; clp.setOption("nb", &nb, "Blocksize of right hand side"); clp.recogniseAllOptions(true); clp.throwExceptions(false); Teuchos::CommandLineProcessor::EParseCommandLineReturn r_parse= clp.parse( argc, argv ); if (r_parse == Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED) return 0; if (r_parse != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL ) return -1; int r_val = 0; { exec_space::initialize(nthreads); exec_space::print_configuration(cout, true); r_val = exampleTriSolveByBlocks <value_type,ordinal_type,size_type,exec_space,void> (file_input, nrhs, nb, nthreads, max_task_dependence, team_size, verbose); exec_space::finalize(); } return r_val; }
int main (int argc, char *argv[]) { Teuchos::CommandLineProcessor clp; clp.setDocString("This example program measure the performance of task data parallelism (barrier) on Kokkos::Threads execution space.\n"); int nthreads = 0; clp.setOption("nthreads", &nthreads, "Number of threads"); int numa = 0; clp.setOption("numa", &numa, "Number of numa node"); int core_per_numa = 0; clp.setOption("core-per-numa", &core_per_numa, "Number of cores per numa node"); int league_size = 1; clp.setOption("league-size", &league_size, "League size"); int team_size = 1; clp.setOption("team-size", &team_size, "Team size"); int ntasks = 100; clp.setOption("ntasks", &ntasks, "Number of tasks to be spawned"); bool verbose = false; clp.setOption("enable-verbose", "disable-verbose", &verbose, "Flag for verbose printing"); clp.recogniseAllOptions(true); clp.throwExceptions(false); Teuchos::CommandLineProcessor::EParseCommandLineReturn r_parse= clp.parse( argc, argv ); if (r_parse == Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED) return 0; if (r_parse != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL ) return -1; int r_val = 0; { exec_space::initialize(nthreads, numa, core_per_numa); exec_space::print_configuration(cout, true); r_val = exampleKokkosDataData<exec_space,value_type>((ntasks > MAXTASKS ? MAXTASKS : ntasks), league_size, team_size, verbose); exec_space::finalize(); } return r_val; }
int main(int argc, char *argv[]) { Teuchos::CommandLineProcessor clp; clp.setDocString("Intrepid2::DynRankView_PerfTest01.\n"); int nworkset = 8; clp.setOption("nworkset", &nworkset, "# of worksets"); int C = 4096; clp.setOption("C", &C, "# of Cells in a workset"); int order = 2; clp.setOption("order", &order, "cubature order"); bool verbose = true; clp.setOption("enable-verbose", "disable-verbose", &verbose, "Flag for verbose printing"); clp.recogniseAllOptions(true); clp.throwExceptions(false); Teuchos::CommandLineProcessor::EParseCommandLineReturn r_parse= clp.parse( argc, argv ); if (r_parse == Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED) return 0; if (r_parse != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL ) return -1; Kokkos::initialize(); if (verbose) std::cout << "Testing datatype double\n"; const int r_val_double = Intrepid2::Test::ComputeBasis_HGRAD <double,Kokkos::Cuda>(nworkset, C, order, verbose); return r_val_double; }
int main(int argc, char *argv[]) { // Initialize MPI #ifdef HAVE_MPI MPI_Init(&argc,&argv); #endif // Create a communicator for Epetra objects Teuchos::RCP<const Epetra_Comm> globalComm; #ifdef HAVE_MPI globalComm = Teuchos::rcp(new Epetra_MpiComm(MPI_COMM_WORLD)); #else globalComm = Teuchos::rcp(new Epetra_SerialComm); #endif int MyPID = globalComm->MyPID(); try { // Setup command line options Teuchos::CommandLineProcessor CLP; CLP.setDocString( "This example runs a variety of stochastic Galerkin solvers.\n"); int n = 32; CLP.setOption("num_mesh", &n, "Number of mesh points in each direction"); bool symmetric = false; CLP.setOption("symmetric", "unsymmetric", &symmetric, "Symmetric discretization"); int num_spatial_procs = -1; CLP.setOption("num_spatial_procs", &num_spatial_procs, "Number of spatial processors (set -1 for all available procs)"); bool rebalance_stochastic_graph = false; CLP.setOption("rebalance", "no-rebalance", &rebalance_stochastic_graph, "Rebalance parallel stochastic graph (requires Isorropia)"); SG_RF randField = UNIFORM; CLP.setOption("rand_field", &randField, num_sg_rf, sg_rf_values, sg_rf_names, "Random field type"); double mean = 0.2; CLP.setOption("mean", &mean, "Mean"); double sigma = 0.1; CLP.setOption("std_dev", &sigma, "Standard deviation"); double weightCut = 1.0; CLP.setOption("weight_cut", &weightCut, "Weight cut"); int num_KL = 2; CLP.setOption("num_kl", &num_KL, "Number of KL terms"); int p = 3; CLP.setOption("order", &p, "Polynomial order"); bool normalize_basis = true; CLP.setOption("normalize", "unnormalize", &normalize_basis, "Normalize PC basis"); SG_Solver solve_method = SG_KRYLOV; CLP.setOption("sg_solver", &solve_method, num_sg_solver, sg_solver_values, sg_solver_names, "SG solver method"); Krylov_Method outer_krylov_method = GMRES; CLP.setOption("outer_krylov_method", &outer_krylov_method, num_krylov_method, krylov_method_values, krylov_method_names, "Outer Krylov method (for Krylov-based SG solver)"); Krylov_Solver outer_krylov_solver = AZTECOO; CLP.setOption("outer_krylov_solver", &outer_krylov_solver, num_krylov_solver, krylov_solver_values, krylov_solver_names, "Outer linear solver"); double outer_tol = 1e-12; CLP.setOption("outer_tol", &outer_tol, "Outer solver tolerance"); int outer_its = 1000; CLP.setOption("outer_its", &outer_its, "Maximum outer iterations"); Krylov_Method inner_krylov_method = GMRES; CLP.setOption("inner_krylov_method", &inner_krylov_method, num_krylov_method, krylov_method_values, krylov_method_names, "Inner Krylov method (for G-S, Jacobi, etc...)"); Krylov_Solver inner_krylov_solver = AZTECOO; CLP.setOption("inner_krylov_solver", &inner_krylov_solver, num_krylov_solver, krylov_solver_values, krylov_solver_names, "Inner linear solver"); double inner_tol = 3e-13; CLP.setOption("inner_tol", &inner_tol, "Inner solver tolerance"); int inner_its = 1000; CLP.setOption("inner_its", &inner_its, "Maximum inner iterations"); SG_Op opMethod = MATRIX_FREE; CLP.setOption("sg_operator_method", &opMethod, num_sg_op, sg_op_values, sg_op_names, "Operator method"); SG_Prec precMethod = AGS; CLP.setOption("sg_prec_method", &precMethod, num_sg_prec, sg_prec_values, sg_prec_names, "Preconditioner method"); double gs_prec_tol = 1e-1; CLP.setOption("gs_prec_tol", &gs_prec_tol, "Gauss-Seidel preconditioner tolerance"); int gs_prec_its = 1; CLP.setOption("gs_prec_its", &gs_prec_its, "Maximum Gauss-Seidel preconditioner iterations"); CLP.parse( argc, argv ); if (MyPID == 0) { std::cout << "Summary of command line options:" << std::endl << "\tnum_mesh = " << n << std::endl << "\tsymmetric = " << symmetric << std::endl << "\tnum_spatial_procs = " << num_spatial_procs << std::endl << "\trebalance = " << rebalance_stochastic_graph << std::endl << "\trand_field = " << sg_rf_names[randField] << std::endl << "\tmean = " << mean << std::endl << "\tstd_dev = " << sigma << std::endl << "\tweight_cut = " << weightCut << std::endl << "\tnum_kl = " << num_KL << std::endl << "\torder = " << p << std::endl << "\tnormalize_basis = " << normalize_basis << std::endl << "\tsg_solver = " << sg_solver_names[solve_method] << std::endl << "\touter_krylov_method = " << krylov_method_names[outer_krylov_method] << std::endl << "\touter_krylov_solver = " << krylov_solver_names[outer_krylov_solver] << std::endl << "\touter_tol = " << outer_tol << std::endl << "\touter_its = " << outer_its << std::endl << "\tinner_krylov_method = " << krylov_method_names[inner_krylov_method] << std::endl << "\tinner_krylov_solver = " << krylov_solver_names[inner_krylov_solver] << std::endl << "\tinner_tol = " << inner_tol << std::endl << "\tinner_its = " << inner_its << std::endl << "\tsg_operator_method = " << sg_op_names[opMethod] << std::endl << "\tsg_prec_method = " << sg_prec_names[precMethod] << std::endl << "\tgs_prec_tol = " << gs_prec_tol << std::endl << "\tgs_prec_its = " << gs_prec_its << std::endl; } bool nonlinear_expansion = false; if (randField == UNIFORM || randField == RYS) nonlinear_expansion = false; else if (randField == LOGNORMAL) nonlinear_expansion = true; bool scaleOP = true; { TEUCHOS_FUNC_TIME_MONITOR("Total PCE Calculation Time"); // Create Stochastic Galerkin basis and expansion Teuchos::Array< Teuchos::RCP<const Stokhos::OneDOrthogPolyBasis<int,double> > > bases(num_KL); for (int i=0; i<num_KL; i++) if (randField == UNIFORM) bases[i] = Teuchos::rcp(new Stokhos::LegendreBasis<int,double>(p,normalize_basis)); else if (randField == RYS) bases[i] = Teuchos::rcp(new Stokhos::RysBasis<int,double>(p,weightCut,normalize_basis)); else if (randField == LOGNORMAL) bases[i] = Teuchos::rcp(new Stokhos::HermiteBasis<int,double>(p,normalize_basis)); // bases[i] = Teuchos::rcp(new Stokhos::DiscretizedStieltjesBasis<int,double>("beta",p,&uniform_weight,-weightCut,weightCut,true)); Teuchos::RCP<const Stokhos::CompletePolynomialBasis<int,double> > basis = Teuchos::rcp(new Stokhos::CompletePolynomialBasis<int,double>(bases)); int sz = basis->size(); Teuchos::RCP<Stokhos::Sparse3Tensor<int,double> > Cijk; if (nonlinear_expansion) Cijk = basis->computeTripleProductTensor(sz); else Cijk = basis->computeTripleProductTensor(num_KL+1); Teuchos::RCP<Stokhos::OrthogPolyExpansion<int,double> > expansion = Teuchos::rcp(new Stokhos::AlgebraicOrthogPolyExpansion<int,double>(basis, Cijk)); if (MyPID == 0) std::cout << "Stochastic Galerkin expansion size = " << sz << std::endl; // Create stochastic parallel distribution Teuchos::ParameterList parallelParams; parallelParams.set("Number of Spatial Processors", num_spatial_procs); parallelParams.set("Rebalance Stochastic Graph", rebalance_stochastic_graph); Teuchos::RCP<Stokhos::ParallelData> sg_parallel_data = Teuchos::rcp(new Stokhos::ParallelData(basis, Cijk, globalComm, parallelParams)); Teuchos::RCP<const EpetraExt::MultiComm> sg_comm = sg_parallel_data->getMultiComm(); Teuchos::RCP<const Epetra_Comm> app_comm = sg_parallel_data->getSpatialComm(); // Create application Teuchos::RCP<twoD_diffusion_ME> model = Teuchos::rcp(new twoD_diffusion_ME(app_comm, n, num_KL, sigma, mean, basis, nonlinear_expansion, symmetric)); // Set up NOX parameters Teuchos::RCP<Teuchos::ParameterList> noxParams = Teuchos::rcp(new Teuchos::ParameterList); // Set the nonlinear solver method noxParams->set("Nonlinear Solver", "Line Search Based"); // Set the printing parameters in the "Printing" sublist Teuchos::ParameterList& printParams = noxParams->sublist("Printing"); printParams.set("MyPID", MyPID); printParams.set("Output Precision", 3); printParams.set("Output Processor", 0); printParams.set("Output Information", NOX::Utils::OuterIteration + NOX::Utils::OuterIterationStatusTest + NOX::Utils::InnerIteration + //NOX::Utils::Parameters + NOX::Utils::Details + NOX::Utils::LinearSolverDetails + NOX::Utils::Warning + NOX::Utils::Error); // Create printing utilities NOX::Utils utils(printParams); // Sublist for line search Teuchos::ParameterList& searchParams = noxParams->sublist("Line Search"); searchParams.set("Method", "Full Step"); // Sublist for direction Teuchos::ParameterList& dirParams = noxParams->sublist("Direction"); dirParams.set("Method", "Newton"); Teuchos::ParameterList& newtonParams = dirParams.sublist("Newton"); newtonParams.set("Forcing Term Method", "Constant"); // Sublist for linear solver for the Newton method Teuchos::ParameterList& lsParams = newtonParams.sublist("Linear Solver"); // Alternative linear solver list for Stratimikos Teuchos::ParameterList& stratLinSolParams = newtonParams.sublist("Stratimikos Linear Solver"); // Teuchos::ParameterList& noxStratParams = // stratLinSolParams.sublist("NOX Stratimikos Options"); Teuchos::ParameterList& stratParams = stratLinSolParams.sublist("Stratimikos"); // Sublist for convergence tests Teuchos::ParameterList& statusParams = noxParams->sublist("Status Tests"); statusParams.set("Test Type", "Combo"); statusParams.set("Number of Tests", 2); statusParams.set("Combo Type", "OR"); Teuchos::ParameterList& normF = statusParams.sublist("Test 0"); normF.set("Test Type", "NormF"); normF.set("Tolerance", outer_tol); normF.set("Scale Type", "Scaled"); Teuchos::ParameterList& maxIters = statusParams.sublist("Test 1"); maxIters.set("Test Type", "MaxIters"); maxIters.set("Maximum Iterations", 1); // Create NOX interface Teuchos::RCP<NOX::Epetra::ModelEvaluatorInterface> det_nox_interface = Teuchos::rcp(new NOX::Epetra::ModelEvaluatorInterface(model)); // Create NOX linear system object Teuchos::RCP<const Epetra_Vector> det_u = model->get_x_init(); Teuchos::RCP<Epetra_Operator> det_A = model->create_W(); Teuchos::RCP<NOX::Epetra::Interface::Required> det_iReq = det_nox_interface; Teuchos::RCP<NOX::Epetra::Interface::Jacobian> det_iJac = det_nox_interface; Teuchos::ParameterList det_printParams; det_printParams.set("MyPID", MyPID); det_printParams.set("Output Precision", 3); det_printParams.set("Output Processor", 0); det_printParams.set("Output Information", NOX::Utils::Error); Teuchos::ParameterList det_lsParams; Teuchos::ParameterList& det_stratParams = det_lsParams.sublist("Stratimikos"); if (inner_krylov_solver == AZTECOO) { det_stratParams.set("Linear Solver Type", "AztecOO"); Teuchos::ParameterList& aztecOOParams = det_stratParams.sublist("Linear Solver Types").sublist("AztecOO").sublist("Forward Solve"); Teuchos::ParameterList& aztecOOSettings = aztecOOParams.sublist("AztecOO Settings"); if (inner_krylov_method == GMRES) { aztecOOSettings.set("Aztec Solver","GMRES"); } else if (inner_krylov_method == CG) { aztecOOSettings.set("Aztec Solver","CG"); } aztecOOSettings.set("Output Frequency", 0); aztecOOSettings.set("Size of Krylov Subspace", 100); aztecOOParams.set("Max Iterations", inner_its); aztecOOParams.set("Tolerance", inner_tol); Teuchos::ParameterList& verbParams = det_stratParams.sublist("Linear Solver Types").sublist("AztecOO").sublist("VerboseObject"); verbParams.set("Verbosity Level", "none"); } else if (inner_krylov_solver == BELOS) { det_stratParams.set("Linear Solver Type", "Belos"); Teuchos::ParameterList& belosParams = det_stratParams.sublist("Linear Solver Types").sublist("Belos"); Teuchos::ParameterList* belosSolverParams = NULL; if (inner_krylov_method == GMRES || inner_krylov_method == FGMRES) { belosParams.set("Solver Type","Block GMRES"); belosSolverParams = &(belosParams.sublist("Solver Types").sublist("Block GMRES")); if (inner_krylov_method == FGMRES) belosSolverParams->set("Flexible Gmres", true); } else if (inner_krylov_method == CG) { belosParams.set("Solver Type","Block CG"); belosSolverParams = &(belosParams.sublist("Solver Types").sublist("Block CG")); } else if (inner_krylov_method == RGMRES) { belosParams.set("Solver Type","GCRODR"); belosSolverParams = &(belosParams.sublist("Solver Types").sublist("GCRODR")); } belosSolverParams->set("Convergence Tolerance", inner_tol); belosSolverParams->set("Maximum Iterations", inner_its); belosSolverParams->set("Output Frequency",0); belosSolverParams->set("Output Style",1); belosSolverParams->set("Verbosity",0); Teuchos::ParameterList& verbParams = belosParams.sublist("VerboseObject"); verbParams.set("Verbosity Level", "none"); } det_stratParams.set("Preconditioner Type", "ML"); Teuchos::ParameterList& det_ML = det_stratParams.sublist("Preconditioner Types").sublist("ML").sublist("ML Settings"); ML_Epetra::SetDefaults("SA", det_ML); det_ML.set("ML output", 0); det_ML.set("max levels",5); det_ML.set("increasing or decreasing","increasing"); det_ML.set("aggregation: type", "Uncoupled"); det_ML.set("smoother: type","ML symmetric Gauss-Seidel"); det_ML.set("smoother: sweeps",2); det_ML.set("smoother: pre or post", "both"); det_ML.set("coarse: max size", 200); #ifdef HAVE_ML_AMESOS det_ML.set("coarse: type","Amesos-KLU"); #else det_ML.set("coarse: type","Jacobi"); #endif Teuchos::RCP<NOX::Epetra::LinearSystem> det_linsys = Teuchos::rcp(new NOX::Epetra::LinearSystemStratimikos( det_printParams, det_lsParams, det_iJac, det_A, *det_u)); // Setup stochastic Galerkin algorithmic parameters Teuchos::RCP<Teuchos::ParameterList> sgParams = Teuchos::rcp(new Teuchos::ParameterList); Teuchos::ParameterList& sgOpParams = sgParams->sublist("SG Operator"); Teuchos::ParameterList& sgPrecParams = sgParams->sublist("SG Preconditioner"); if (!nonlinear_expansion) { sgParams->set("Parameter Expansion Type", "Linear"); sgParams->set("Jacobian Expansion Type", "Linear"); } if (opMethod == MATRIX_FREE) sgOpParams.set("Operator Method", "Matrix Free"); else if (opMethod == KL_MATRIX_FREE) sgOpParams.set("Operator Method", "KL Matrix Free"); else if (opMethod == KL_REDUCED_MATRIX_FREE) { sgOpParams.set("Operator Method", "KL Reduced Matrix Free"); if (randField == UNIFORM || randField == RYS) sgOpParams.set("Number of KL Terms", num_KL); else sgOpParams.set("Number of KL Terms", basis->size()); sgOpParams.set("KL Tolerance", outer_tol); sgOpParams.set("Sparse 3 Tensor Drop Tolerance", outer_tol); sgOpParams.set("Do Error Tests", true); } else if (opMethod == FULLY_ASSEMBLED) sgOpParams.set("Operator Method", "Fully Assembled"); else TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "Error! Unknown operator method " << opMethod << "." << std::endl); if (precMethod == MEAN) { sgPrecParams.set("Preconditioner Method", "Mean-based"); sgPrecParams.set("Mean Preconditioner Type", "ML"); Teuchos::ParameterList& precParams = sgPrecParams.sublist("Mean Preconditioner Parameters"); precParams = det_ML; } else if(precMethod == GS) { sgPrecParams.set("Preconditioner Method", "Gauss-Seidel"); sgPrecParams.sublist("Deterministic Solver Parameters") = det_lsParams; sgPrecParams.set("Deterministic Solver", det_linsys); sgPrecParams.set("Max Iterations", gs_prec_its); sgPrecParams.set("Tolerance", gs_prec_tol); } else if (precMethod == AGS) { sgPrecParams.set("Preconditioner Method", "Approximate Gauss-Seidel"); if (outer_krylov_method == CG) sgPrecParams.set("Symmetric Gauss-Seidel", true); sgPrecParams.set("Mean Preconditioner Type", "ML"); Teuchos::ParameterList& precParams = sgPrecParams.sublist("Mean Preconditioner Parameters"); precParams = det_ML; } else if (precMethod == AJ) { sgPrecParams.set("Preconditioner Method", "Approximate Jacobi"); sgPrecParams.set("Mean Preconditioner Type", "ML"); Teuchos::ParameterList& precParams = sgPrecParams.sublist("Mean Preconditioner Parameters"); precParams = det_ML; Teuchos::ParameterList& jacobiOpParams = sgPrecParams.sublist("Jacobi SG Operator"); jacobiOpParams.set("Only Use Linear Terms", true); } else if (precMethod == ASC) { sgPrecParams.set("Preconditioner Method", "Approximate Schur Complement"); sgPrecParams.set("Mean Preconditioner Type", "ML"); Teuchos::ParameterList& precParams = sgPrecParams.sublist("Mean Preconditioner Parameters"); precParams = det_ML; } else if (precMethod == KP) { sgPrecParams.set("Preconditioner Method", "Kronecker Product"); sgPrecParams.set("Only Use Linear Terms", true); sgPrecParams.set("Mean Preconditioner Type", "ML"); Teuchos::ParameterList& meanPrecParams = sgPrecParams.sublist("Mean Preconditioner Parameters"); meanPrecParams = det_ML; sgPrecParams.set("G Preconditioner Type", "Ifpack"); Teuchos::ParameterList& GPrecParams = sgPrecParams.sublist("G Preconditioner Parameters"); if (outer_krylov_method == GMRES || outer_krylov_method == FGMRES) GPrecParams.set("Ifpack Preconditioner", "ILUT"); if (outer_krylov_method == CG) GPrecParams.set("Ifpack Preconditioner", "ICT"); GPrecParams.set("Overlap", 1); GPrecParams.set("fact: drop tolerance", 1e-4); GPrecParams.set("fact: ilut level-of-fill", 1.0); GPrecParams.set("schwarz: combine mode", "Add"); } else if (precMethod == NONE) { sgPrecParams.set("Preconditioner Method", "None"); } else TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "Error! Unknown preconditioner method " << precMethod << "." << std::endl); // Create stochastic Galerkin model evaluator Teuchos::RCP<Stokhos::SGModelEvaluator> sg_model = Teuchos::rcp(new Stokhos::SGModelEvaluator(model, basis, Teuchos::null, expansion, sg_parallel_data, sgParams, scaleOP)); EpetraExt::ModelEvaluator::InArgs sg_inArgs = sg_model->createInArgs(); EpetraExt::ModelEvaluator::OutArgs sg_outArgs = sg_model->createOutArgs(); // Set up stochastic parameters Teuchos::RCP<Stokhos::EpetraVectorOrthogPoly> sg_p_init = sg_model->create_p_sg(0); for (int i=0; i<num_KL; i++) { sg_p_init->term(i,0)[i] = 0.0; sg_p_init->term(i,1)[i] = 1.0; } sg_model->set_p_sg_init(0, *sg_p_init); // Setup stochastic initial guess Teuchos::RCP<Stokhos::EpetraVectorOrthogPoly> sg_x_init = sg_model->create_x_sg(); sg_x_init->init(0.0); sg_model->set_x_sg_init(*sg_x_init); // Create NOX interface Teuchos::RCP<NOX::Epetra::ModelEvaluatorInterface> nox_interface = Teuchos::rcp(new NOX::Epetra::ModelEvaluatorInterface(sg_model)); // Create NOX stochastic linear system object Teuchos::RCP<const Epetra_Vector> u = sg_model->get_x_init(); Teuchos::RCP<const Epetra_Map> base_map = model->get_x_map(); Teuchos::RCP<const Epetra_Map> sg_map = sg_model->get_x_map(); Teuchos::RCP<Epetra_Operator> A = sg_model->create_W(); Teuchos::RCP<NOX::Epetra::Interface::Required> iReq = nox_interface; Teuchos::RCP<NOX::Epetra::Interface::Jacobian> iJac = nox_interface; // Build linear solver Teuchos::RCP<NOX::Epetra::LinearSystem> linsys; if (solve_method==SG_KRYLOV) { bool has_M = sg_outArgs.supports(EpetraExt::ModelEvaluator::OUT_ARG_WPrec); Teuchos::RCP<Epetra_Operator> M; Teuchos::RCP<NOX::Epetra::Interface::Preconditioner> iPrec; if (has_M) { M = sg_model->create_WPrec()->PrecOp; iPrec = nox_interface; } stratParams.set("Preconditioner Type", "None"); if (outer_krylov_solver == AZTECOO) { stratParams.set("Linear Solver Type", "AztecOO"); Teuchos::ParameterList& aztecOOParams = stratParams.sublist("Linear Solver Types").sublist("AztecOO").sublist("Forward Solve"); Teuchos::ParameterList& aztecOOSettings = aztecOOParams.sublist("AztecOO Settings"); if (outer_krylov_method == GMRES) { aztecOOSettings.set("Aztec Solver","GMRES"); } else if (outer_krylov_method == CG) { aztecOOSettings.set("Aztec Solver","CG"); } aztecOOSettings.set("Output Frequency", 1); aztecOOSettings.set("Size of Krylov Subspace", 100); aztecOOParams.set("Max Iterations", outer_its); aztecOOParams.set("Tolerance", outer_tol); stratLinSolParams.set("Preconditioner", "User Defined"); if (has_M) linsys = Teuchos::rcp(new NOX::Epetra::LinearSystemStratimikos( printParams, stratLinSolParams, iJac, A, iPrec, M, *u, true)); else linsys = Teuchos::rcp(new NOX::Epetra::LinearSystemStratimikos( printParams, stratLinSolParams, iJac, A, *u)); } else if (outer_krylov_solver == BELOS){ stratParams.set("Linear Solver Type", "Belos"); Teuchos::ParameterList& belosParams = stratParams.sublist("Linear Solver Types").sublist("Belos"); Teuchos::ParameterList* belosSolverParams = NULL; if (outer_krylov_method == GMRES || outer_krylov_method == FGMRES) { belosParams.set("Solver Type","Block GMRES"); belosSolverParams = &(belosParams.sublist("Solver Types").sublist("Block GMRES")); if (outer_krylov_method == FGMRES) belosSolverParams->set("Flexible Gmres", true); } else if (outer_krylov_method == CG) { belosParams.set("Solver Type","Block CG"); belosSolverParams = &(belosParams.sublist("Solver Types").sublist("Block CG")); } else if (inner_krylov_method == RGMRES) { belosParams.set("Solver Type","GCRODR"); belosSolverParams = &(belosParams.sublist("Solver Types").sublist("GCRODR")); } belosSolverParams->set("Convergence Tolerance", outer_tol); belosSolverParams->set("Maximum Iterations", outer_its); belosSolverParams->set("Output Frequency",1); belosSolverParams->set("Output Style",1); belosSolverParams->set("Verbosity",33); stratLinSolParams.set("Preconditioner", "User Defined"); if (has_M) linsys = Teuchos::rcp(new NOX::Epetra::LinearSystemStratimikos( printParams, stratLinSolParams, iJac, A, iPrec, M, *u, true)); else linsys = Teuchos::rcp(new NOX::Epetra::LinearSystemStratimikos( printParams, stratLinSolParams, iJac, A, *u)); } } else if (solve_method==SG_GS) { lsParams.sublist("Deterministic Solver Parameters") = det_lsParams; lsParams.set("Max Iterations", outer_its); lsParams.set("Tolerance", outer_tol); linsys = Teuchos::rcp(new NOX::Epetra::LinearSystemSGGS( printParams, lsParams, det_linsys, iReq, iJac, basis, sg_parallel_data, A, base_map, sg_map)); } else { lsParams.sublist("Deterministic Solver Parameters") = det_lsParams; lsParams.set("Max Iterations", outer_its); lsParams.set("Tolerance", outer_tol); Teuchos::ParameterList& jacobiOpParams = lsParams.sublist("Jacobi SG Operator"); jacobiOpParams.set("Only Use Linear Terms", true); linsys = Teuchos::rcp(new NOX::Epetra::LinearSystemSGJacobi( printParams, lsParams, det_linsys, iReq, iJac, basis, sg_parallel_data, A, base_map, sg_map)); } // Build NOX group Teuchos::RCP<NOX::Epetra::Group> grp = Teuchos::rcp(new NOX::Epetra::Group(printParams, iReq, *u, linsys)); // Create the Solver convergence test Teuchos::RCP<NOX::StatusTest::Generic> statusTests = NOX::StatusTest::buildStatusTests(statusParams, utils); // Create the solver Teuchos::RCP<NOX::Solver::Generic> solver = NOX::Solver::buildSolver(grp, statusTests, noxParams); // Solve the system NOX::StatusTest::StatusType status; { TEUCHOS_FUNC_TIME_MONITOR("Total Solve Time"); status = solver->solve(); } // Get final solution const NOX::Epetra::Group& finalGroup = dynamic_cast<const NOX::Epetra::Group&>(solver->getSolutionGroup()); const Epetra_Vector& finalSolution = (dynamic_cast<const NOX::Epetra::Vector&>(finalGroup.getX())).getEpetraVector(); // Save final solution to file EpetraExt::VectorToMatrixMarketFile("nox_solver_stochastic_solution.mm", finalSolution); // Save mean and variance to file Teuchos::RCP<Stokhos::EpetraVectorOrthogPoly> sg_x_poly = sg_model->create_x_sg(View, &finalSolution); Epetra_Vector mean(*(model->get_x_map())); Epetra_Vector std_dev(*(model->get_x_map())); sg_x_poly->computeMean(mean); sg_x_poly->computeStandardDeviation(std_dev); EpetraExt::VectorToMatrixMarketFile("mean_gal.mm", mean); EpetraExt::VectorToMatrixMarketFile("std_dev_gal.mm", std_dev); // Evaluate SG responses at SG parameters Teuchos::RCP<const Epetra_Vector> sg_p = sg_model->get_p_init(1); Teuchos::RCP<Epetra_Vector> sg_g = Teuchos::rcp(new Epetra_Vector(*(sg_model->get_g_map(0)))); sg_inArgs.set_p(1, sg_p); sg_inArgs.set_x(Teuchos::rcp(&finalSolution,false)); sg_outArgs.set_g(0, sg_g); sg_model->evalModel(sg_inArgs, sg_outArgs); // Print mean and standard deviation of response Teuchos::RCP<Stokhos::EpetraVectorOrthogPoly> sg_g_poly = sg_model->create_g_sg(0, View, sg_g.get()); Epetra_Vector g_mean(*(model->get_g_map(0))); Epetra_Vector g_std_dev(*(model->get_g_map(0))); sg_g_poly->computeMean(g_mean); sg_g_poly->computeStandardDeviation(g_std_dev); std::cout.precision(16); // std::cout << "\nResponse Expansion = " << std::endl; // std::cout.precision(12); // sg_g_poly->print(std::cout); std::cout << "\nResponse Mean = " << std::endl << g_mean << std::endl; std::cout << "Response Std. Dev. = " << std::endl << g_std_dev << std::endl; if (status == NOX::StatusTest::Converged && MyPID == 0) utils.out() << "Example Passed!" << std::endl; } Teuchos::TimeMonitor::summarize(std::cout); Teuchos::TimeMonitor::zeroOutTimers(); } catch (std::exception& e) { std::cout << e.what() << std::endl; } catch (string& s) { std::cout << s << std::endl; } catch (char *s) { std::cout << s << std::endl; } catch (...) { std::cout << "Caught unknown exception!" <<std:: endl; } #ifdef HAVE_MPI MPI_Finalize() ; #endif }
int main(int argc, char *argv[]) { bool success = true; bool verbose = false; try { Teuchos::oblackholestream blackHole; Teuchos::GlobalMPISession mpiSession (&argc, &argv, &blackHole); Teuchos::RCP<const Teuchos::Comm<int> > comm = Teuchos::DefaultComm<int>::getComm(); const size_t num_sockets = Kokkos::hwloc::get_available_numa_count(); const size_t num_cores_per_socket = Kokkos::hwloc::get_available_cores_per_numa(); const size_t num_threads_per_core = Kokkos::hwloc::get_available_threads_per_core(); // Setup command line options Teuchos::CommandLineProcessor CLP; CLP.setDocString( "This test performance of MP::Vector FEM assembly.\n"); int nGrid = 32; CLP.setOption("n", &nGrid, "Number of mesh points in the each direction"); int nIter = 10; CLP.setOption("ni", &nIter, "Number of assembly iterations"); bool print = false; CLP.setOption("print", "no-print", &print, "Print debugging output"); bool check = false; int num_cores = num_cores_per_socket * num_sockets; CLP.setOption("cores", &num_cores, "Number of CPU cores to use (defaults to all)"); int num_hyper_threads = num_threads_per_core; CLP.setOption("hyperthreads", &num_hyper_threads, "Number of hyper threads per core to use (defaults to all)"); int threads_per_vector = 1; CLP.setOption("threads_per_vector", &threads_per_vector, "Number of threads to use within each vector"); CLP.setOption("check", "no-check", &check, "Check correctness"); #ifdef KOKKOS_HAVE_SERIAL bool serial = true; CLP.setOption("serial", "no-serial", &serial, "Enable Serial device"); #endif #ifdef KOKKOS_HAVE_PTHREAD bool threads = true; CLP.setOption("threads", "no-threads", &threads, "Enable Threads device"); #endif #ifdef KOKKOS_HAVE_OPENMP bool openmp = true; CLP.setOption("openmp", "no-openmp", &openmp, "Enable OpenMP device"); #endif #ifdef KOKKOS_HAVE_CUDA bool cuda = true; CLP.setOption("cuda", "no-cuda", &cuda, "Enable Cuda device"); int cuda_threads_per_vector = 16; CLP.setOption("cuda_threads_per_vector", &cuda_threads_per_vector, "Number of Cuda threads to use within each vector"); int cuda_block_size = 256; CLP.setOption("cuda_block_size", &cuda_block_size, "Cuda block size"); int num_cuda_blocks = 0; CLP.setOption("num_cuda_blocks", &num_cuda_blocks, "Number of Cuda blocks (0 implies the default choice)"); int device_id = -1; CLP.setOption("device", &device_id, "CUDA device ID. Set to default of -1 to use the default device as determined by the local node MPI rank and --ngpus"); int ngpus = 1; CLP.setOption("ngpus", &ngpus, "Number of GPUs per node for multi-GPU runs via MPI"); #endif CLP.parse( argc, argv ); int use_nodes[3]; use_nodes[0] = nGrid; use_nodes[1] = nGrid; use_nodes[2] = nGrid; typedef int Ordinal; typedef double Scalar; const Kokkos::Example::FENL::AssemblyMethod Method = Kokkos::Example::FENL::FadElementOptimized; // const Kokkos::Example::FENL::AssemblyMethod Method = // Kokkos::Example::FENL::Analytic; #ifdef KOKKOS_HAVE_SERIAL if (serial) { typedef Kokkos::Serial Device; typedef Stokhos::StaticFixedStorage<Ordinal,Scalar,1,Device> Storage; Kokkos::Serial::initialize(); if (comm->getRank() == 0) std::cout << std::endl << "Serial performance with " << comm->getSize() << " MPI ranks" << std::endl; Kokkos::Example::FENL::DeviceConfig dev_config(1, 1, 1); mainHost<Storage,Method>(comm, print, nIter, use_nodes, check, dev_config); Kokkos::Serial::finalize(); } #endif #ifdef KOKKOS_HAVE_PTHREAD if (threads) { typedef Kokkos::Threads Device; typedef Stokhos::StaticFixedStorage<Ordinal,Scalar,1,Device> Storage; Kokkos::Threads::initialize(num_cores*num_hyper_threads); if (comm->getRank() == 0) std::cout << std::endl << "Threads performance with " << comm->getSize() << " MPI ranks and " << num_cores*num_hyper_threads << " threads per rank:" << std::endl; Kokkos::Example::FENL::DeviceConfig dev_config(num_cores, threads_per_vector, num_hyper_threads / threads_per_vector); mainHost<Storage,Method>(comm, print, nIter, use_nodes, check, dev_config); Kokkos::Threads::finalize(); } #endif #ifdef KOKKOS_HAVE_OPENMP if (openmp) { typedef Kokkos::OpenMP Device; typedef Stokhos::StaticFixedStorage<Ordinal,Scalar,1,Device> Storage; Kokkos::OpenMP::initialize(num_cores*num_hyper_threads); if (comm->getRank() == 0) std::cout << std::endl << "OpenMP performance with " << comm->getSize() << " MPI ranks and " << num_cores*num_hyper_threads << " threads per rank:" << std::endl; Kokkos::Example::FENL::DeviceConfig dev_config(num_cores, threads_per_vector, num_hyper_threads / threads_per_vector); mainHost<Storage,Method>(comm, print, nIter, use_nodes, check, dev_config); Kokkos::OpenMP::finalize(); } #endif #ifdef KOKKOS_HAVE_CUDA if (cuda) { typedef Kokkos::Cuda Device; typedef Stokhos::StaticFixedStorage<Ordinal,Scalar,1,Device> Storage; if (device_id == -1) { int local_rank = 0; char *str; if ((str = std::getenv("SLURM_LOCALID"))) local_rank = std::atoi(str); else if ((str = std::getenv("MV2_COMM_WORLD_LOCAL_RANK"))) local_rank = std::atoi(str); else if ((str = getenv("OMPI_COMM_WORLD_LOCAL_RANK"))) local_rank = std::atoi(str); device_id = local_rank % ngpus; // Check device is valid int num_device; cudaGetDeviceCount(&num_device); TEUCHOS_TEST_FOR_EXCEPTION( device_id >= num_device, std::logic_error, "Invalid device ID " << device_id << ". You probably are trying" << " to run with too many GPUs per node"); } Kokkos::HostSpace::execution_space::initialize(); Kokkos::Cuda::initialize(Kokkos::Cuda::SelectDevice(device_id)); cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, device_id); if (comm->getRank() == 0) std::cout << std::endl << "CUDA performance performance with " << comm->getSize() << " MPI ranks and device " << device_id << " (" << deviceProp.name << "):" << std::endl; Kokkos::Example::FENL::DeviceConfig dev_config( num_cuda_blocks, cuda_threads_per_vector, cuda_threads_per_vector == 0 ? 0 : cuda_block_size / cuda_threads_per_vector); mainCuda<Storage,Method>(comm, print, nIter, use_nodes, check, dev_config); Kokkos::HostSpace::execution_space::finalize(); Kokkos::Cuda::finalize(); } #endif } TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success); if (success) return 0; return -1; }
int main(int argc, char *argv[]) { typedef double MeshScalar; typedef double BasisScalar; typedef Tpetra::DefaultPlatform::DefaultPlatformType::NodeType Node; typedef Teuchos::ScalarTraits<Scalar>::magnitudeType magnitudeType; //double g_mean_exp = 1.906587e-01; // expected response mean //double g_std_dev_exp = 8.680605e-02; // expected response std. dev. //double g_tol = 1e-6; // tolerance on determining success using Teuchos::RCP; using Teuchos::rcp; using Teuchos::Array; using Teuchos::ArrayRCP; using Teuchos::ArrayView; using Teuchos::ParameterList; // Initialize MPI #ifdef HAVE_MPI MPI_Init(&argc,&argv); #endif // feenableexcept(FE_ALL_EXCEPT); LocalOrdinal MyPID; try { // Create a communicator for Epetra objects RCP<const Epetra_Comm> globalComm; #ifdef HAVE_MPI globalComm = rcp(new Epetra_MpiComm(MPI_COMM_WORLD)); #else globalComm = rcp(new Epetra_SerialComm); #endif MyPID = globalComm->MyPID(); // Setup command line options Teuchos::CommandLineProcessor CLP; CLP.setDocString( "This example runs an interlaced stochastic Galerkin solvers.\n"); int n = 32; CLP.setOption("num_mesh", &n, "Number of mesh points in each direction"); // multigrid specific options int minAggSize = 1; CLP.setOption("min_agg_size", &minAggSize, "multigrid aggregate size"); int smootherSweeps = 3; CLP.setOption("smoother_sweeps", &smootherSweeps, "# multigrid smoother sweeps"); int plainAgg=1; CLP.setOption("plain_aggregation", &plainAgg, "plain aggregation"); LocalOrdinal nsSize=-1; CLP.setOption("nullspace_size", &nsSize, "nullspace dimension"); bool symmetric = false; CLP.setOption("symmetric", "unsymmetric", &symmetric, "Symmetric discretization"); int num_spatial_procs = -1; CLP.setOption("num_spatial_procs", &num_spatial_procs, "Number of spatial processors (set -1 for all available procs)"); SG_RF randField = UNIFORM; CLP.setOption("rand_field", &randField, num_sg_rf, sg_rf_values, sg_rf_names, "Random field type"); double mu = 0.2; CLP.setOption("mean", &mu, "Mean"); double s = 0.1; CLP.setOption("std_dev", &s, "Standard deviation"); int num_KL = 2; CLP.setOption("num_kl", &num_KL, "Number of KL terms"); int order = 3; CLP.setOption("order", &order, "Polynomial order"); bool normalize_basis = true; CLP.setOption("normalize", "unnormalize", &normalize_basis, "Normalize PC basis"); Krylov_Method solver_method = GMRES; CLP.setOption("solver_method", &solver_method, num_krylov_method, krylov_method_values, krylov_method_names, "Krylov solver method"); SG_Prec prec_method = STOCHASTIC; CLP.setOption("prec_method", &prec_method, num_sg_prec, sg_prec_values, sg_prec_names, "Preconditioner method"); SG_Div division_method = DIRECT; CLP.setOption("division_method", &division_method, num_sg_div, sg_div_values, sg_div_names, "Stochastic division method"); SG_DivPrec divprec_method = NO; CLP.setOption("divprec_method", &divprec_method, num_sg_divprec, sg_divprec_values, sg_divprec_names, "Preconditioner for division method"); Schur_option schur_option = diag; CLP.setOption("schur_option", &schur_option, num_schur_option, Schur_option_values, schur_option_names, "Schur option"); Prec_option prec_option = whole; CLP.setOption("prec_option", &prec_option, num_prec_option, Prec_option_values, prec_option_names, "Prec option"); double solver_tol = 1e-12; CLP.setOption("solver_tol", &solver_tol, "Outer solver tolerance"); double div_tol = 1e-6; CLP.setOption("div_tol", &div_tol, "Tolerance in Iterative Solver"); int prec_level = 1; CLP.setOption("prec_level", &prec_level, "Level in Schur Complement Prec 0->Solve A0u0=g0 with division; 1->Form 1x1 Schur Complement"); int max_it_div = 50; CLP.setOption("max_it_div", &max_it_div, "Maximum # of Iterations in Iterative Solver for Division"); bool equilibrate = true; //JJH 8/26/12 changing to true to match ETP example CLP.setOption("equilibrate", "noequilibrate", &equilibrate, "Equilibrate the linear system"); CLP.parse( argc, argv ); if (MyPID == 0) { std::cout << "Summary of command line options:" << std::endl << "\tnum_mesh = " << n << std::endl << "\tsymmetric = " << symmetric << std::endl << "\tnum_spatial_procs = " << num_spatial_procs << std::endl << "\trand_field = " << sg_rf_names[randField] << std::endl << "\tmean = " << mu << std::endl << "\tstd_dev = " << s << std::endl << "\tnum_kl = " << num_KL << std::endl << "\torder = " << order << std::endl << "\tnormalize_basis = " << normalize_basis << std::endl << "\tsolver_method = " << krylov_method_names[solver_method] << std::endl << "\tprec_method = " << sg_prec_names[prec_method] << std::endl << "\tdivision_method = " << sg_div_names[division_method] << std::endl << "\tdiv_tol = " << div_tol << std::endl << "\tdiv_prec = " << sg_divprec_names[divprec_method] << std::endl << "\tprec_level = " << prec_level << std::endl << "\tmax_it_div = " << max_it_div << std::endl; } bool nonlinear_expansion = false; if (randField == UNIFORM) nonlinear_expansion = false; else if (randField == LOGNORMAL) nonlinear_expansion = true; { TEUCHOS_FUNC_TIME_MONITOR("Total PCE Calculation Time"); // Create Stochastic Galerkin basis and expansion Teuchos::Array< RCP<const Stokhos::OneDOrthogPolyBasis<LocalOrdinal,BasisScalar> > > bases(num_KL); for (LocalOrdinal i=0; i<num_KL; i++) if (randField == UNIFORM) bases[i] = rcp(new Stokhos::LegendreBasis<LocalOrdinal,BasisScalar>(order, normalize_basis)); else if (randField == LOGNORMAL) bases[i] = rcp(new Stokhos::HermiteBasis<int,double>(order, normalize_basis)); RCP<const Stokhos::CompletePolynomialBasis<LocalOrdinal,BasisScalar> > basis = rcp(new Stokhos::CompletePolynomialBasis<LocalOrdinal,BasisScalar>(bases, 1e-12)); LocalOrdinal sz = basis->size(); RCP<Stokhos::Sparse3Tensor<LocalOrdinal,BasisScalar> > Cijk = basis->computeTripleProductTensor(sz); RCP<const Stokhos::Quadrature<int,double> > quad = rcp(new Stokhos::TensorProductQuadrature<int,double>(basis)); RCP<ParameterList> expn_params = Teuchos::rcp(new ParameterList); if (division_method == MEAN_DIV) { expn_params->set("Division Strategy", "Mean-Based"); expn_params->set("Use Quadrature for Division", false); } else if (division_method == DIRECT) { expn_params->set("Division Strategy", "Dense Direct"); expn_params->set("Use Quadrature for Division", false); } else if (division_method == SPD_DIRECT) { expn_params->set("Division Strategy", "SPD Dense Direct"); expn_params->set("Use Quadrature for Division", false); } else if (division_method == CGD) { expn_params->set("Division Strategy", "CG"); expn_params->set("Use Quadrature for Division", false); } else if (division_method == QUAD) { expn_params->set("Use Quadrature for Division", true); } if (divprec_method == NO) expn_params->set("Prec Strategy", "None"); else if (divprec_method == DIAG) expn_params->set("Prec Strategy", "Diag"); else if (divprec_method == JACOBI) expn_params->set("Prec Strategy", "Jacobi"); else if (divprec_method == GS) expn_params->set("Prec Strategy", "GS"); else if (divprec_method == SCHUR) expn_params->set("Prec Strategy", "Schur"); if (schur_option == diag) expn_params->set("Schur option", "diag"); else expn_params->set("Schur option", "full"); if (prec_option == linear) expn_params->set("Prec option", "linear"); if (equilibrate) expn_params->set("Equilibrate", 1); else expn_params->set("Equilibrate", 0); expn_params->set("Division Tolerance", div_tol); expn_params->set("prec_iter", prec_level); expn_params->set("max_it_div", max_it_div); RCP<Stokhos::OrthogPolyExpansion<LocalOrdinal,BasisScalar> > expansion = rcp(new Stokhos::QuadOrthogPolyExpansion<LocalOrdinal,BasisScalar>( basis, Cijk, quad, expn_params)); if (MyPID == 0) std::cout << "Stochastic Galerkin expansion size = " << sz << std::endl; // Create stochastic parallel distribution ParameterList parallelParams; parallelParams.set("Number of Spatial Processors", num_spatial_procs); // parallelParams.set("Rebalance Stochastic Graph", true); // Teuchos::ParameterList& isorropia_params = // parallelParams.sublist("Isorropia"); // isorropia_params.set("Balance objective", "nonzeros"); RCP<Stokhos::ParallelData> sg_parallel_data = rcp(new Stokhos::ParallelData(basis, Cijk, globalComm, parallelParams)); RCP<const EpetraExt::MultiComm> sg_comm = sg_parallel_data->getMultiComm(); RCP<const Epetra_Comm> app_comm = sg_parallel_data->getSpatialComm(); // Create Teuchos::Comm from Epetra_Comm RCP< Teuchos::Comm<int> > teuchos_app_comm; #ifdef HAVE_MPI RCP<const Epetra_MpiComm> app_mpi_comm = Teuchos::rcp_dynamic_cast<const Epetra_MpiComm>(app_comm); RCP<const Teuchos::OpaqueWrapper<MPI_Comm> > raw_mpi_comm = Teuchos::opaqueWrapper(app_mpi_comm->Comm()); teuchos_app_comm = rcp(new Teuchos::MpiComm<int>(raw_mpi_comm)); #else teuchos_app_comm = rcp(new Teuchos::SerialComm<int>()); #endif // Create application typedef twoD_diffusion_problem<Scalar,MeshScalar,BasisScalar,LocalOrdinal,GlobalOrdinal,Node> problem_type; RCP<problem_type> model = rcp(new problem_type(teuchos_app_comm, n, num_KL, s, mu, nonlinear_expansion, symmetric)); // Create vectors and operators typedef problem_type::Tpetra_Vector Tpetra_Vector; typedef problem_type::Tpetra_CrsMatrix Tpetra_CrsMatrix; typedef Tpetra::MatrixMarket::Writer<Tpetra_CrsMatrix> Writer; //Xpetra matrices typedef Xpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps> Xpetra_CrsMatrix; typedef Xpetra::MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> Xpetra_MultiVector; typedef Xpetra::MultiVectorFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node> Xpetra_MultiVectorFactory; typedef Xpetra::Operator<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps> Xpetra_Operator; typedef Xpetra::TpetraCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps> Xpetra_TpetraCrsMatrix; typedef Xpetra::CrsOperator<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps> Xpetra_CrsOperator; typedef Belos::MueLuOp<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps> Belos_MueLuOperator; //MueLu typedefs typedef MueLu::Hierarchy<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps> MueLu_Hierarchy; typedef MueLu::SmootherPrototype<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalMatOps> SmootherPrototype; typedef MueLu::TrilinosSmoother<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalMatOps> TrilinosSmoother; typedef MueLu::SmootherFactory<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalMatOps> SmootherFactory; typedef MueLu::FactoryManager<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalMatOps> FactoryManager; RCP<Tpetra_Vector> p = Tpetra::createVector<Scalar>(model->get_p_map(0)); RCP<Tpetra_Vector> x = Tpetra::createVector<Scalar>(model->get_x_map()); x->putScalar(0.0); RCP<Tpetra_Vector> f = Tpetra::createVector<Scalar>(model->get_f_map()); RCP<Tpetra_Vector> dx = Tpetra::createVector<Scalar>(model->get_x_map()); RCP<Tpetra_CrsMatrix> J = model->create_W(); RCP<Tpetra_CrsMatrix> J0; if (prec_method == MEAN) J0 = model->create_W(); // Set PCE expansion of p p->putScalar(0.0); ArrayRCP<Scalar> p_view = p->get1dViewNonConst(); for (ArrayRCP<Scalar>::size_type i=0; i<p_view.size(); i++) { p_view[i].reset(expansion); p_view[i].copyForWrite(); } Array<double> point(num_KL, 1.0); Array<double> basis_vals(sz); basis->evaluateBases(point, basis_vals); if (order > 0) { for (int i=0; i<num_KL; i++) { p_view[i].term(i,1) = 1.0 / basis_vals[i+1]; } } // Create preconditioner typedef Ifpack2::Preconditioner<Scalar,LocalOrdinal,GlobalOrdinal,Node> Tprec; RCP<Belos_MueLuOperator> M; RCP<MueLu_Hierarchy> H; RCP<Xpetra_CrsMatrix> xcrsJ = rcp(new Xpetra_TpetraCrsMatrix(J)); RCP<Xpetra_Operator> xopJ = rcp(new Xpetra_CrsOperator(xcrsJ)); if (prec_method != NONE) { ParameterList precParams; std::string prec_name = "RILUK"; precParams.set("fact: iluk level-of-fill", 1); precParams.set("fact: iluk level-of-overlap", 0); //Ifpack2::Factory factory; RCP<Xpetra_Operator> xopJ0; if (prec_method == MEAN) { RCP<Xpetra_CrsMatrix> xcrsJ0 = rcp(new Xpetra_TpetraCrsMatrix(J0)); xopJ0 = rcp(new Xpetra_CrsOperator(xcrsJ0)); //M = factory.create<Tpetra_CrsMatrix>(prec_name, J0); } else if (prec_method == STOCHASTIC) { xopJ0 = xopJ; //M = factory.create<Tpetra_CrsMatrix>(prec_name, J); } H = rcp(new MueLu_Hierarchy(xopJ0)); M = rcp(new Belos_MueLuOperator(H)); //M->setParameters(precParams); if (nsSize!=-1) sz=nsSize; RCP<Xpetra_MultiVector> Z = Xpetra_MultiVectorFactory::Build(xcrsJ->getDomainMap(), sz); size_t n = Z->getLocalLength(); for (LocalOrdinal j=0; j<sz; ++j) { ArrayRCP<Scalar> col = Z->getDataNonConst(j); for (size_t i=0; i<n; ++i) { col[i].reset(expansion); col[i].copyForWrite(); col[i].fastAccessCoeff(j) = 1.0; } } H->GetLevel(0)->Set("Nullspace", Z); //RCP<Teuchos::FancyOStream> fos = Teuchos::fancyOStream(Teuchos::rcpFromRef(std::cout)); //fos->setOutputToRootOnly(-1); //Z->describe(*fos); } // Evaluate model model->computeResidual(*x, *p, *f); model->computeJacobian(*x, *p, *J); // Compute mean for mean-based preconditioner if (prec_method == MEAN) { size_t nrows = J->getNodeNumRows(); ArrayView<const LocalOrdinal> indices; ArrayView<const Scalar> values; J0->resumeFill(); for (size_t i=0; i<nrows; i++) { J->getLocalRowView(i, indices, values); Array<Scalar> values0(values.size()); for (LocalOrdinal j=0; j<values.size(); j++) values0[j] = values[j].coeff(0); J0->replaceLocalValues(i, indices, values0); } J0->fillComplete(); } // compute preconditioner if (prec_method != NONE) { //M->initialize(); //M->compute(); //override MueLu defaults via factory manager RCP<FactoryManager> fm = rcp( new FactoryManager() );; //smoother ParameterList smootherParamList; /* smootherParamList.set("chebyshev: degree", smootherSweeps); smootherParamList.set("chebyshev: ratio eigenvalue", (double) 20); smootherParamList.set("chebyshev: max eigenvalue", (double) -1.0); smootherParamList.set("chebyshev: min eigenvalue", (double) 1.0); smootherParamList.set("chebyshev: zero starting solution", true); RCP<SmootherPrototype> smooPrototype = rcp( new TrilinosSmoother("CHEBYSHEV", smootherParamList) ); */ smootherParamList.set("relaxation: sweeps", smootherSweeps); smootherParamList.set("relaxation: type", "Symmetric Gauss-Seidel"); RCP<SmootherPrototype> smooPrototype = rcp( new TrilinosSmoother("RELAXATION", smootherParamList) ); RCP<SmootherFactory> smooFact = rcp( new SmootherFactory(smooPrototype) ); fm->SetFactory("Smoother", smooFact); // coarse level solve ParameterList coarseParamList; coarseParamList.set("fact: level-of-fill", 0); RCP<SmootherPrototype> coarsePrototype = rcp( new TrilinosSmoother("ILUT", coarseParamList) ); RCP<SmootherFactory> coarseSolverFact = rcp( new SmootherFactory(coarsePrototype, Teuchos::null) ); fm->SetFactory("CoarseSolver", coarseSolverFact); //allow for larger aggregates typedef MueLu::UCAggregationFactory<LocalOrdinal,GlobalOrdinal,Node,LocalMatOps> MueLu_UCAggregationFactory; RCP<MueLu_UCAggregationFactory> aggFact = rcp(new MueLu_UCAggregationFactory()); aggFact->SetMinNodesPerAggregate(minAggSize); fm->SetFactory("Aggregates", aggFact); //turn off damping typedef MueLu::SaPFactory<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalMatOps> MueLu_SaPFactory; if (plainAgg) { RCP<MueLu_SaPFactory> sapFactory = rcp(new MueLu_SaPFactory); sapFactory->SetDampingFactor( (Scalar) 0.0 ); fm->SetFactory("P", sapFactory); } H->Setup(*fm); } // Setup Belos solver RCP<ParameterList> belosParams = rcp(new ParameterList); belosParams->set("Flexible Gmres", false); belosParams->set("Num Blocks", 500);//20 belosParams->set("Convergence Tolerance", solver_tol); belosParams->set("Maximum Iterations", 1000); belosParams->set("Verbosity", 33); belosParams->set("Output Style", 1); belosParams->set("Output Frequency", 1); typedef Tpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node> MV; typedef Belos::OperatorT<Tpetra::MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Node> > OP; typedef Belos::OperatorTraits<Scalar,MV,OP> BOPT; typedef Belos::MultiVecTraits<Scalar,MV> BMVT; typedef Belos::MultiVecTraits<double,MV> BTMVT; typedef Belos::LinearProblem<double,MV,OP> BLinProb; typedef Belos::XpetraOp<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalMatOps> BXpetraOp; RCP<OP> belosJ = rcp(new BXpetraOp(xopJ)); // Turns an Xpetra::Operator object into a Belos operator RCP< BLinProb > problem = rcp(new BLinProb(belosJ, dx, f)); if (prec_method != NONE) problem->setRightPrec(M); problem->setProblem(); RCP<Belos::SolverManager<double,MV,OP> > solver; if (solver_method == CG) solver = rcp(new Belos::PseudoBlockCGSolMgr<double,MV,OP>(problem, belosParams)); else if (solver_method == GMRES) solver = rcp(new Belos::BlockGmresSolMgr<double,MV,OP>(problem, belosParams)); // Print initial residual norm std::vector<double> norm_f(1); //BMVT::MvNorm(*f, norm_f); BTMVT::MvNorm(*f, norm_f); if (MyPID == 0) std::cout << "\nInitial residual norm = " << norm_f[0] << std::endl; // Solve linear system Belos::ReturnType ret = solver->solve(); if (MyPID == 0) { if (ret == Belos::Converged) std::cout << "Solver converged!" << std::endl; else std::cout << "Solver failed to converge!" << std::endl; } // Update x x->update(-1.0, *dx, 1.0); Writer::writeDenseFile("stochastic_solution.mm", x); // Compute new residual & response function RCP<Tpetra_Vector> g = Tpetra::createVector<Scalar>(model->get_g_map(0)); f->putScalar(0.0); model->computeResidual(*x, *p, *f); model->computeResponse(*x, *p, *g); // Print final residual norm //BMVT::MvNorm(*f, norm_f); BTMVT::MvNorm(*f, norm_f); if (MyPID == 0) std::cout << "\nFinal residual norm = " << norm_f[0] << std::endl; // Print response std::cout << "\nResponse = " << std::endl; //Writer::writeDense(std::cout, g); Writer::writeDenseFile("stochastic_residual.mm", f); /* double g_mean = g->get1dView()[0].mean(); double g_std_dev = g->get1dView()[0].standard_deviation(); std::cout << "g mean = " << g_mean << std::endl; std::cout << "g std_dev = " << g_std_dev << std::endl; bool passed = false; if (norm_f[0] < 1.0e-10 && std::abs(g_mean-g_mean_exp) < g_tol && std::abs(g_std_dev - g_std_dev_exp) < g_tol) passed = true; if (MyPID == 0) { if (passed) std::cout << "Example Passed!" << std::endl; else{ std::cout << "Example Failed!" << std::endl; std::cout << "expected g_mean = "<< g_mean_exp << std::endl; std::cout << "expected g_std_dev = "<< g_std_dev_exp << std::endl; } } */ } Teuchos::TimeMonitor::summarize(std::cout); Teuchos::TimeMonitor::zeroOutTimers(); } catch (std::exception& e) { std::cout << e.what() << std::endl; } catch (string& s) { std::cout << s << std::endl; } catch (char *s) { std::cout << s << std::endl; } catch (...) { std::cout << "Caught unknown exception!" <<std:: endl; } #ifdef HAVE_MPI MPI_Finalize() ; #endif }
int main(int argc, char *argv[]) { int np=1, rank=0; int splitrank, splitsize; int rc = 0; nssi_service xfer_svc; int server_index=0; int rank_in_server=0; int transport_index=-1; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &np); MPI_Barrier(MPI_COMM_WORLD); Teuchos::oblackholestream blackhole; std::ostream &out = ( rank == 0 ? std::cout : blackhole ); struct xfer_args args; const int num_io_methods = 8; const int io_method_vals[] = { XFER_WRITE_ENCODE_SYNC, XFER_WRITE_ENCODE_ASYNC, XFER_WRITE_RDMA_SYNC, XFER_WRITE_RDMA_ASYNC, XFER_READ_ENCODE_SYNC, XFER_READ_ENCODE_ASYNC, XFER_READ_RDMA_SYNC, XFER_READ_RDMA_ASYNC}; const char * io_method_names[] = { "write-encode-sync", "write-encode-async", "write-rdma-sync", "write-rdma-async", "read-encode-sync", "read-encode-async", "read-rdma-sync", "read-rdma-async"}; const int nssi_transport_list[] = { NSSI_RPC_PTL, NSSI_RPC_PTL, NSSI_RPC_IB, NSSI_RPC_IB, NSSI_RPC_GEMINI, NSSI_RPC_GEMINI, NSSI_RPC_BGPDCMF, NSSI_RPC_BGPDCMF, NSSI_RPC_BGQPAMI, NSSI_RPC_BGQPAMI, NSSI_RPC_MPI}; const int num_nssi_transports = 11; const int nssi_transport_vals[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 }; const char * nssi_transport_names[] = { "portals", "ptl", "infiniband", "ib", "gemini", "gni", "bgpdcmf", "dcmf", "bgqpami", "pami", "mpi" }; // Initialize arguments args.transport=NSSI_DEFAULT_TRANSPORT; args.len = 1; args.delay = 1; args.io_method = XFER_WRITE_RDMA_SYNC; args.debug_level = LOG_WARN; args.num_trials = 1; args.num_reqs = 1; args.result_file_mode = "a"; args.result_file = ""; args.url_file = ""; args.logfile = ""; args.client_flag = true; args.server_flag = true; args.num_servers = 1; args.num_threads = 0; args.timeout = 500; args.num_retries = 5; args.validate_flag = true; args.kill_server_flag = true; args.block_distribution = true; bool success = true; /** * We make extensive use of the \ref Teuchos::CommandLineProcessor for command-line * options to control the behavior of the test code. To evaluate performance, * the "num-trials", "num-reqs", and "len" options control the amount of data transferred * between client and server. The "io-method" selects the type of data transfer. The * server-url specifies the URL of the server. If running as a server, the server-url * provides a recommended URL when initializing the network transport. */ try { //out << Teuchos::Teuchos_Version() << std::endl << std::endl; // Creating an empty command line processor looks like: Teuchos::CommandLineProcessor parser; parser.setDocString( "This example program demonstrates a simple data-transfer service " "built using the NEtwork Scalable Service Interface (Nessie)." ); /* To set and option, it must be given a name and default value. Additionally, each option can be given a help std::string. Although it is not necessary, a help std::string aids a users comprehension of the acceptable command line arguments. Some examples of setting command line options are: */ parser.setOption("delay", &args.delay, "time(s) for client to wait for server to start" ); parser.setOption("timeout", &args.timeout, "time(ms) to wait for server to respond" ); parser.setOption("server", "no-server", &args.server_flag, "Run the server" ); parser.setOption("client", "no-client", &args.client_flag, "Run the client"); parser.setOption("len", &args.len, "The number of structures in an input buffer"); parser.setOption("debug",(int*)(&args.debug_level), "Debug level"); parser.setOption("logfile", &args.logfile, "log file"); parser.setOption("num-trials", &args.num_trials, "Number of trials (experiments)"); parser.setOption("num-reqs", &args.num_reqs, "Number of reqs/trial"); parser.setOption("result-file", &args.result_file, "Where to store results"); parser.setOption("result-file-mode", &args.result_file_mode, "Write mode for the result"); parser.setOption("server-url-file", &args.url_file, "File that has URL client uses to find server"); parser.setOption("validate", "no-validate", &args.validate_flag, "Validate the data"); parser.setOption("num-servers", &args.num_servers, "Number of server processes"); parser.setOption("num-threads", &args.num_threads, "Number of threads used by each server process"); parser.setOption("kill-server", "no-kill-server", &args.kill_server_flag, "Kill the server at the end of the experiment"); parser.setOption("block-distribution", "rr-distribution", &args.block_distribution, "Use a block distribution scheme to assign clients to servers"); // Set an enumeration command line option for the io_method parser.setOption("io-method", &args.io_method, num_io_methods, io_method_vals, io_method_names, "I/O Methods for the example: \n" "\t\t\twrite-encode-sync : Write data through the RPC args, synchronous\n" "\t\t\twrite-encode-async: Write data through the RPC args - asynchronous\n" "\t\t\twrite-rdma-sync : Write data using RDMA (server pulls) - synchronous\n" "\t\t\twrite-rdma-async: Write data using RDMA (server pulls) - asynchronous\n" "\t\t\tread-encode-sync : Read data through the RPC result - synchronous\n" "\t\t\tread-encode-async: Read data through the RPC result - asynchronous\n" "\t\t\tread-rdma-sync : Read data using RDMA (server puts) - synchronous\n" "\t\t\tread-rdma-async: Read data using RDMA (server puts) - asynchronous"); // Set an enumeration command line option for the NNTI transport parser.setOption("transport", &transport_index, num_nssi_transports, nssi_transport_vals, nssi_transport_names, "NSSI transports (not all are available on every platform): \n" "\t\t\tportals|ptl : Cray or Schutt\n" "\t\t\tinfiniband|ib : libibverbs\n" "\t\t\tgemini|gni : Cray libugni (Gemini or Aries)\n" "\t\t\tbgpdcmf|dcmf : IBM BG/P DCMF\n" "\t\t\tbgqpami|pami : IBM BG/Q PAMI\n" "\t\t\tmpi : isend/irecv implementation\n" ); /* There are also two methods that control the behavior of the command line processor. First, for the command line processor to allow an unrecognized a command line option to be ignored (and only have a warning printed), use: */ parser.recogniseAllOptions(true); /* Second, by default, if the parser finds a command line option it doesn't recognize or finds the --help option, it will throw an std::exception. If you want prevent a command line processor from throwing an std::exception (which is important in this program since we don't have an try/catch around this) when it encounters a unrecognized option or help is printed, use: */ parser.throwExceptions(false); /* We now parse the command line where argc and argv are passed to the parse method. Note that since we have turned off std::exception throwing above we had better grab the return argument so that we can see what happened and act accordingly. */ Teuchos::CommandLineProcessor::EParseCommandLineReturn parseReturn= parser.parse( argc, argv ); if( parseReturn == Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED ) { return 0; } if( parseReturn != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL ) { return 1; // Error! } // Here is where you would use these command line arguments but for this example program // we will just print the help message with the new values of the command-line arguments. //if (rank == 0) // out << "\nPrinting help message with new values of command-line arguments ...\n\n"; //parser.printHelpMessage(argv[0],out); } TEUCHOS_STANDARD_CATCH_STATEMENTS(true,std::cerr,success); log_debug(args.debug_level, "transport_index=%d", transport_index); if (transport_index > -1) { args.transport =nssi_transport_list[transport_index]; args.transport_name=std::string(nssi_transport_names[transport_index]); } args.io_method_name=std::string(io_method_names[args.io_method]); log_debug(args.debug_level, "%d: Finished processing arguments", rank); if (!success) { MPI_Abort(MPI_COMM_WORLD, 1); } if (!args.server_flag && args.client_flag) { /* initialize logger */ if (args.logfile.empty()) { logger_init(args.debug_level, NULL); } else { char fn[1024]; sprintf(fn, "%s.client.%03d.log", args.logfile.c_str(), rank); logger_init(args.debug_level, fn); } } else if (args.server_flag && !args.client_flag) { /* initialize logger */ if (args.logfile.empty()) { logger_init(args.debug_level, NULL); } else { char fn[1024]; sprintf(fn, "%s.server.%03d.log", args.logfile.c_str(), rank); logger_init(args.debug_level, fn); } } else if (args.server_flag && args.client_flag) { /* initialize logger */ if (args.logfile.empty()) { logger_init(args.debug_level, NULL); } else { char fn[1024]; sprintf(fn, "%s.%03d.log", args.logfile.c_str(), rank); logger_init(args.debug_level, fn); } } log_level debug_level = args.debug_level; // Communicator used for both client and server (may split if using client and server) MPI_Comm comm; log_debug(debug_level, "%d: Starting xfer-service test", rank); #ifdef TRIOS_ENABLE_COMMSPLITTER if (args.transport == NSSI_RPC_MPI) { MPI_Pcontrol(0); } #endif /** * Since this test can be run as a server, client, or both, we need to play some fancy * MPI games to get the communicators working correctly. If we're executing as both * a client and a server, we split the communicator so that the client thinks its * running by itself. */ int color = 0; // color=0-->server, color=1-->client if (args.client_flag && args.server_flag) { if (np < 2) { log_error(debug_level, "Must use at least 2 MPI processes for client and server mode"); MPI_Abort(MPI_COMM_WORLD, -1); } // Split the communicators. Put all the servers as the first ranks. if (rank < args.num_servers) { color = 0; log_debug(debug_level, "rank=%d is a server", rank); } else { color = 1; // all others are clients log_debug(debug_level, "rank=%d is a client", rank); } MPI_Comm_split(MPI_COMM_WORLD, color, rank, &comm); } else { if (args.client_flag) { color=1; log_debug(debug_level, "rank=%d is a client", rank); } else if (args.server_flag) { color=0; log_debug(debug_level, "rank=%d is a server", rank); } else { log_error(debug_level, "Must be either a client or a server"); MPI_Abort(MPI_COMM_WORLD, -1); } MPI_Comm_split(MPI_COMM_WORLD, color, rank, &comm); } MPI_Comm_rank(comm, &splitrank); MPI_Comm_size(comm, &splitsize); log_debug(debug_level, "%d: Finished splitting communicators", rank); /** * Initialize the Nessie interface by specifying a transport, encoding scheme, and a * recommended URL. \ref NSSI_DEFAULT_TRANSPORT is usually the best choice, since it * is often the case that only one type of transport exists on a particular platform. * Currently supported transports are \ref NSSI_RPC_PTL, \ref NSSI_RPC_GNI, and * \ref NSSI_RPC_IB. We only support one type of encoding scheme so NSSI_DEFAULT_ENCODE * should always be used for the second argument. The URL can be specified (as we did for * the server, or NULL (as we did for the client). This is a recommended value. Use the * \ref nssi_get_url function to find the actual value. */ nssi_rpc_init((nssi_rpc_transport)args.transport, NSSI_DEFAULT_ENCODE, NULL); // Get the Server URL std::string my_url(NSSI_URL_LEN, '\0'); nssi_get_url((nssi_rpc_transport)args.transport, &my_url[0], NSSI_URL_LEN); // If running as both client and server, gather and distribute // the server URLs to all the clients. if (args.server_flag && args.client_flag) { std::string all_urls; // This needs to be a vector of chars, not a string all_urls.resize(args.num_servers * NSSI_URL_LEN, '\0'); // Have servers gather their URLs if (color == 0) { assert(args.num_servers == splitsize); // these should be equal log_debug(debug_level, "%d: Gathering urls: my_url=%s", rank, my_url.c_str()); // gather all urls to rank 0 of the server comm (also rank 0 of MPI_COMM_WORLD) MPI_Gather(&my_url[0], NSSI_URL_LEN, MPI_CHAR, &all_urls[0], NSSI_URL_LEN, MPI_CHAR, 0, comm); } // broadcast the full set of server urls to all processes MPI_Bcast(&all_urls[0], all_urls.size(), MPI_CHAR, 0, MPI_COMM_WORLD); log_debug(debug_level, "%d: Bcast urls, urls.size=%d", rank, all_urls.size()); if (color == 1) { // For block distribution scheme use the utility function (in xfer_util.cpp) if (args.block_distribution) { // Use this utility function to calculate the server_index xfer_block_partition(args.num_servers, splitsize, splitrank, &server_index, &rank_in_server); } // Use a simple round robin distribution scheme else { server_index = splitrank % args.num_servers; rank_in_server = splitrank / args.num_servers; } // Copy the server url out of the list of urls int offset = server_index * NSSI_URL_LEN; args.server_url = all_urls.substr(offset, NSSI_URL_LEN); log_debug(debug_level, "client %d assigned to server \"%s\"", splitrank, args.server_url.c_str()); } log_debug(debug_level, "%d: Finished distributing server urls, server_url=%s", rank, args.server_url.c_str()); } // If running as a client only, have to get the list of servers from the urlfile. else if (!args.server_flag && args.client_flag){ sleep(args.delay); // give server time to get started std::vector< std::string > urlbuf; xfer_read_server_url_file(args.url_file.c_str(), urlbuf, comm); args.num_servers = urlbuf.size(); // For block distribution scheme use the utility function (in xfer_util.cpp) if (args.block_distribution) { // Use this utility function to calculate the server_index xfer_block_partition(args.num_servers, splitsize, splitrank, &server_index, &rank_in_server); } // Use a simple round robin distribution scheme else { server_index = splitrank % args.num_servers; rank_in_server = splitrank / args.num_servers; } args.server_url = urlbuf[server_index]; log_debug(debug_level, "client %d assigned to server \"%s\"", splitrank, args.server_url.c_str()); } else if (args.server_flag && !args.client_flag) { args.server_url = my_url; if (args.url_file.empty()) { log_error(debug_level, "Must set --url-file"); MPI_Abort(MPI_COMM_WORLD, -1); } xfer_write_server_url_file(args.url_file.c_str(), my_url.c_str(), comm); } // Set the debug level for the xfer service. xfer_debug_level = args.debug_level; // Print the arguments after they've all been set. log_debug(debug_level, "%d: server_url=%s", rank, args.server_url.c_str()); print_args(out, args, "%"); log_debug(debug_level, "server_url=%s", args.server_url.c_str()); //------------------------------------------------------------------------------ /** If we're running this job with a server, the server always executes on node 0. * In this example, the server is a single process. */ if (color == 0) { rc = xfer_server_main((nssi_rpc_transport)args.transport, args.num_threads, comm); log_debug(debug_level, "Server is finished"); } // ------------------------------------------------------------------------------ /** The parallel client will execute this branch. The root node, node 0, of the client connects * connects with the server, using the \ref nssi_get_service function. Then the root * broadcasts the service description to the other clients before starting the main * loop of the client code by calling \ref xfer_client_main. */ else { int i; int client_rank; // get rank within the client communicator MPI_Comm_rank(comm, &client_rank); nssi_init((nssi_rpc_transport)args.transport); // Only one process needs to connect to the service // TODO: Make get_service a collective call (some transports do not need a connection) //if (client_rank == 0) { { // connect to remote server for (i=0; i < args.num_retries; i++) { log_debug(debug_level, "Try to connect to server: attempt #%d, url=%s", i, args.server_url.c_str()); rc=nssi_get_service((nssi_rpc_transport)args.transport, args.server_url.c_str(), args.timeout, &xfer_svc); if (rc == NSSI_OK) break; else if (rc != NSSI_ETIMEDOUT) { log_error(xfer_debug_level, "could not get svc description: %s", nssi_err_str(rc)); break; } } } // wait for all the clients to connect MPI_Barrier(comm); //MPI_Bcast(&rc, 1, MPI_INT, 0, comm); if (rc == NSSI_OK) { if (client_rank == 0) log_debug(debug_level, "Connected to service on attempt %d\n", i); // Broadcast the service description to the other clients //log_debug(xfer_debug_level, "Bcasting svc to other clients"); //MPI_Bcast(&xfer_svc, sizeof(nssi_service), MPI_BYTE, 0, comm); log_debug(debug_level, "Starting client main"); // Start the client code xfer_client_main(args, xfer_svc, comm); MPI_Barrier(comm); // Tell one of the clients to kill the server if ((args.kill_server_flag) && (rank_in_server == 0)) { log_debug(debug_level, "%d: Halting xfer service", rank); rc = nssi_kill(&xfer_svc, 0, 5000); } rc=nssi_free_service((nssi_rpc_transport)args.transport, &xfer_svc); if (rc != NSSI_OK) { log_error(xfer_debug_level, "could not free svc description: %s", nssi_err_str(rc)); } } else { if (client_rank == 0) log_error(debug_level, "Failed to connect to service after %d attempts: ABORTING", i); success = false; //MPI_Abort(MPI_COMM_WORLD, -1); } nssi_fini((nssi_rpc_transport)args.transport); } log_debug(debug_level, "%d: clean up nssi", rank); MPI_Barrier(MPI_COMM_WORLD); // Clean up nssi_rpc rc = nssi_rpc_fini((nssi_rpc_transport)args.transport); if (rc != NSSI_OK) log_error(debug_level, "Error in nssi_rpc_fini"); log_debug(debug_level, "%d: MPI_Finalize()", rank); MPI_Finalize(); logger_fini(); if(success && (rc == NSSI_OK)) out << "\nEnd Result: TEST PASSED" << std::endl; else out << "\nEnd Result: TEST FAILED" << std::endl; return ((success && (rc==NSSI_OK)) ? 0 : 1 ); }
int main(int argc, char *argv[]) { bool success = true; bool verbose = false; try { const size_t num_sockets = Kokkos::hwloc::get_available_numa_count(); const size_t num_cores_per_socket = Kokkos::hwloc::get_available_cores_per_numa(); const size_t num_threads_per_core = Kokkos::hwloc::get_available_threads_per_core(); // Setup command line options Teuchos::CommandLineProcessor CLP; CLP.setDocString( "This test performance of MP::Vector multiply routines.\n"); int nGrid = 32; CLP.setOption("n", &nGrid, "Number of mesh points in the each direction"); int nIter = 10; CLP.setOption("ni", &nIter, "Number of multiply iterations"); #ifdef KOKKOS_HAVE_PTHREAD bool threads = true; CLP.setOption("threads", "no-threads", &threads, "Enable Threads device"); int num_cores = num_cores_per_socket * num_sockets; CLP.setOption("cores", &num_cores, "Number of CPU cores to use (defaults to all)"); int num_hyper_threads = num_threads_per_core; CLP.setOption("hyperthreads", &num_hyper_threads, "Number of hyper threads per core to use (defaults to all)"); int threads_per_vector = 1; CLP.setOption("threads_per_vector", &threads_per_vector, "Number of threads to use within each vector"); #endif #ifdef KOKKOS_HAVE_CUDA bool cuda = true; CLP.setOption("cuda", "no-cuda", &cuda, "Enable Cuda device"); int cuda_threads_per_vector = 16; CLP.setOption("cuda_threads_per_vector", &cuda_threads_per_vector, "Number of Cuda threads to use within each vector"); int cuda_block_size = 0; CLP.setOption("cuda_block_size", &cuda_block_size, "Cuda block size (0 implies the default choice)"); int num_cuda_blocks = 0; CLP.setOption("num_cuda_blocks", &num_cuda_blocks, "Number of Cuda blocks (0 implies the default choice)"); int device_id = 0; CLP.setOption("device", &device_id, "CUDA device ID"); #endif CLP.parse( argc, argv ); typedef int Ordinal; typedef double Scalar; #ifdef KOKKOS_HAVE_PTHREAD if (threads) { typedef Kokkos::Threads Device; typedef Stokhos::StaticFixedStorage<Ordinal,Scalar,1,Device> Storage; Kokkos::Threads::initialize(num_cores*num_hyper_threads); std::cout << std::endl << "Threads performance with " << num_cores*num_hyper_threads << " threads:" << std::endl; Kokkos::DeviceConfig dev_config(num_cores, threads_per_vector, num_hyper_threads / threads_per_vector); mainHost<Storage>(nGrid, nIter, dev_config); Kokkos::Threads::finalize(); } #endif #ifdef KOKKOS_HAVE_CUDA if (cuda) { typedef Kokkos::Cuda Device; typedef Stokhos::StaticFixedStorage<Ordinal,Scalar,1,Device> Storage; Kokkos::Cuda::host_mirror_device_type::initialize(); Kokkos::Cuda::initialize(Kokkos::Cuda::SelectDevice(device_id)); cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, device_id); std::cout << std::endl << "CUDA performance for device " << device_id << " (" << deviceProp.name << "):" << std::endl; Kokkos::DeviceConfig dev_config( num_cuda_blocks, cuda_threads_per_vector, cuda_threads_per_vector == 0 ? 0 : cuda_block_size / cuda_threads_per_vector); mainCuda<Storage>(nGrid,nIter,dev_config); Kokkos::Cuda::host_mirror_device_type::finalize(); Kokkos::Cuda::finalize(); } #endif } TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success); if (success) return 0; return -1; }
int main (int argc, char *argv[]) { Teuchos::CommandLineProcessor clp; clp.setDocString("This example program measure the performance of IChol algorithms on Kokkos::Threads execution space.\n"); int nthreads = 1; clp.setOption("nthreads", &nthreads, "Number of threads"); int max_task_dependence = 10; clp.setOption("max-task-dependence", &max_task_dependence, "Max number of task dependence"); int team_size = 1; clp.setOption("team-size", &team_size, "Team size"); bool team_interface = false; clp.setOption("enable-team-interface", "disable-team-interface", &team_interface, "Flag for team interface"); bool verbose = false; clp.setOption("enable-verbose", "disable-verbose", &verbose, "Flag for verbose printing"); string file_input = "test.mtx"; clp.setOption("file-input", &file_input, "Input file (MatrixMarket SPD matrix)"); int niter = 10; clp.setOption("niter", &niter, "Number of iterations for testing"); clp.recogniseAllOptions(true); clp.throwExceptions(false); Teuchos::CommandLineProcessor::EParseCommandLineReturn r_parse= clp.parse( argc, argv ); if (r_parse == Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED) return 0; if (r_parse != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL ) return -1; int r_val = 0; { const bool overwrite = true; const int nshepherds = (team_interface ? nthreads/team_size : nthreads); const int nworker_per_shepherd = nthreads/nshepherds; setenv("QT_HWPAR", to_string(nthreads).c_str(), overwrite); setenv("QT_NUM_SHEPHERDS", to_string(nshepherds).c_str(), overwrite); setenv("QT_NUM_WORKERS_PER_SHEPHERD", to_string(nworker_per_shepherd).c_str(), overwrite); exec_space::initialize(nthreads); exec_space::print_configuration(cout, true); // r_val = exampleICholPerformance // <value_type,ordinal_type,size_type,exec_space,void> // (file_input, niter, nthreads, max_task_dependence, team_size, team_interface, (nthreads != 1), verbose); exec_space::finalize(); unsetenv("QT_HWPAR"); unsetenv("QT_NUM_SHEPHERDS"); unsetenv("QT_NUM_WORKERS_PER_SHEPHERD"); } return r_val; }
/// \brief Parse command-line options for this test /// /// \param argc [in] As usual in C(++) /// \param argv [in] As usual in C(++) /// \param allowedToPrint [in] Whether this (MPI) process is allowed /// to print to stdout/stderr. Different per (MPI) process. /// \param printedHelp [out] Whether this (MPI) process printed the /// "help" display (summary of command-line options) /// /// \return Encapsulation of command-line options static DistTsqrTestParameters parseOptions (int argc, char* argv[], const bool allowedToPrint, bool& printedHelp) { using std::cerr; using std::endl; printedHelp = false; // Command-line parameters, set to their default values. DistTsqrTestParameters params; try { Teuchos::CommandLineProcessor cmdLineProc (/* throwExceptions=*/ true, /* recognizeAllOptions=*/ true); cmdLineProc.setDocString (docString); cmdLineProc.setOption ("verify", "noverify", ¶ms.verify, "Test accuracy"); cmdLineProc.setOption ("benchmark", "nobenchmark", ¶ms.benchmark, "Test performance"); cmdLineProc.setOption ("implicit", "noimplicit", ¶ms.testFactorImplicit, "Test DistTsqr\'s factor() and explicit_Q()"); cmdLineProc.setOption ("explicit", "noexplicit", ¶ms.testFactorExplicit, "Test DistTsqr\'s factorExplicit()"); cmdLineProc.setOption ("print-matrices", "no-print-matrices", ¶ms.printMatrices, "Print global test matrices and computed results to stderr"); cmdLineProc.setOption ("debug", "nodebug", ¶ms.debug, "Print debugging information"); cmdLineProc.setOption ("human-readable", "machine-readable", ¶ms.humanReadable, "If set, make output easy to read by humans " "(but hard to parse)"); cmdLineProc.setOption ("ncols", ¶ms.numCols, "Number of columns in the test matrix"); cmdLineProc.setOption ("ntrials", ¶ms.numTrials, "Number of trials (only used when \"--benchmark\""); cmdLineProc.setOption ("real", "noreal", ¶ms.testReal, "Test real arithmetic routines"); #ifdef HAVE_TSQR_COMPLEX cmdLineProc.setOption ("complex", "nocomplex", ¶ms.testComplex, "Test complex arithmetic routines"); #endif // HAVE_TSQR_COMPLEX cmdLineProc.parse (argc, argv); } catch (Teuchos::CommandLineProcessor::UnrecognizedOption& e) { if (allowedToPrint) cerr << "Unrecognized command-line option: " << e.what() << endl; throw e; } catch (Teuchos::CommandLineProcessor::HelpPrinted& e) { printedHelp = true; } // Validate command-line options. We provide default values // for unset options, so we don't have to validate those. if (params.numCols <= 0) throw std::invalid_argument ("Number of columns must be positive"); else if (params.benchmark && params.numTrials < 1) throw std::invalid_argument ("\"--benchmark\" option requires numTrials >= 1"); return params; }
int main(int argc, char **argv) { try { // Initialize MPI #ifdef HAVE_MPI MPI_Init(&argc,&argv); #endif // Setup command line options Teuchos::CommandLineProcessor CLP; CLP.setDocString( "This example generates the sparsity pattern for the block stochastic Galerkin matrix.\n"); int d = 3; CLP.setOption("dimension", &d, "Stochastic dimension"); int p = 5; CLP.setOption("order", &p, "Polynomial order"); double drop = 1.0e-15; CLP.setOption("drop", &drop, "Drop tolerance"); std::string file = "A.mm"; CLP.setOption("filename", &file, "Matrix Market filename"); BasisType basis_type = LEGENDRE; CLP.setOption("basis", &basis_type, num_basis_types, basis_type_values, basis_type_names, "Basis type"); bool full = true; CLP.setOption("full", "linear", &full, "Use full or linear expansion"); bool use_old = false; CLP.setOption("old", "new", &use_old, "Use old or new Cijk algorithm"); // Parse arguments CLP.parse( argc, argv ); // Basis Teuchos::Array< Teuchos::RCP<const Stokhos::OneDOrthogPolyBasis<int,double> > > bases(d); for (int i=0; i<d; i++) { if (basis_type == HERMITE) bases[i] = Teuchos::rcp(new Stokhos::HermiteBasis<int,double>(p)); else if (basis_type == LEGENDRE) bases[i] = Teuchos::rcp(new Stokhos::LegendreBasis<int,double>(p)); else if (basis_type == RYS) bases[i] = Teuchos::rcp(new Stokhos::RysBasis<int,double>(p, 1.0, false)); } Teuchos::RCP<const Stokhos::CompletePolynomialBasis<int,double> > basis = Teuchos::rcp(new Stokhos::CompletePolynomialBasis<int,double>(bases, drop, use_old)); // Triple product tensor Teuchos::RCP<Stokhos::Sparse3Tensor<int,double> > Cijk; if (full) Cijk = basis->computeTripleProductTensor(basis->size()); else Cijk = basis->computeTripleProductTensor(basis->dimension()+1); std::cout << "basis size = " << basis->size() << " num nonzero Cijk entries = " << Cijk->num_entries() << std::endl; #ifdef HAVE_MPI Epetra_MpiComm comm(MPI_COMM_WORLD); #else Epetra_SerialComm comm; #endif // Print triple product sparsity to matrix market file Stokhos::sparse3Tensor2MatrixMarket(*basis, *Cijk, comm, file); Teuchos::TimeMonitor::summarize(std::cout); } catch (std::exception& e) { std::cout << e.what() << std::endl; } return 0; }
int main (int argc, char *argv[]) { // command-line arguments log_level debug_level = LOG_ERROR; string logfile(""); int npes, me, i; int num_servers=1; int num_clients=1; int servers_per_node=1; int clients_per_node=1; int client_weight=10; int server_weight=10; int client_server_weight=5; string server_node_file("SNF.txt"); string client_node_file("CNF.txt"); const int num_graphs = 4; const int graph_vals[] = { GRAPH_COMPLETE, GRAPH_CLIENT_COMPLETE, GRAPH_SERVER_COMPLETE, GRAPH_CLIENT_SERVER_ONLY }; const char * graph_names[] = { "complete", "client-complete", "server-complete", "client-server-only" }; enum graph_connection_t graph_connection=GRAPH_COMPLETE; MPI_Init(&argc, &argv); try { Teuchos::CommandLineProcessor parser; // init parser parser.setDocString("Find node placement of server and client ranks"); parser.setOption("strategy", &strategy, "LibTopoMap strategy (greedy, greedy_route, recursive, rcm, scotch, ascending)"); parser.setOption("num-servers", (int *)(&num_servers), "Number of servers to place"); parser.setOption("num-clients", (int *)(&num_clients), "Number of clients to place"); parser.setOption("servers-per-node", (int *)(&servers_per_node), "Number of server ranks per compute node"); parser.setOption("clients-per-node", (int *)(&clients_per_node), "Number of client ranks per compute node"); parser.setOption("server-weight", (int *)(&server_weight), "Edge weight of server-to-server communication"); parser.setOption("client-weight", (int *)(&client_weight), "Edge weight of client-to-client communication"); parser.setOption("client-server-weight", (int *)(&client_server_weight), "Edge weight of client-to-server communication"); parser.setOption("server-node-file", &server_node_file, "Where to write the server placement results"); parser.setOption("client-node-file", &client_node_file, "Where to write the client placement results"); parser.setOption("verbose", (int *)(&debug_level), "Debug level"); parser.setOption("logfile", &logfile, "Path to file for debug statements"); // Set an enumeration command line option for the connection graph parser.setOption("graph-connection", (int*)&graph_connection, num_graphs, graph_vals, graph_names, "Graph Connections for the example: \n" "\t\t\tcomplete : client-client graph is complete, server-server graph is complete\n" "\t\t\tclient-complete: client-client graph is complete, server-server graph is empty\n" "\t\t\tserver-complete : client-client graph is empty, server-server graph is complete\n" "\t\t\tclient-server-only: client-client graph is empty, server-server graph is empty\n" "\t\t\tIn all cases, each client has an edge to one of the servers\n" ); parser.recogniseAllOptions(); parser.throwExceptions(); Teuchos::CommandLineProcessor::EParseCommandLineReturn parseReturn= parser.parse( argc, argv ); if( parseReturn == Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED ) { return 0; } if( parseReturn != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL ) { return 1; // Error! } } catch (...) { exit(-1); } /* initialize the logger */ logger_init(debug_level, logfile.c_str()); MPI_Comm_size(MPI_COMM_WORLD, &npes); MPI_Comm_rank(MPI_COMM_WORLD, &me); if (me==0) { cout << " ---------------- ARGUMENTS --------------- " << std::endl; cout << " \tstrategy = " << strategy << std::endl; cout << " \tgraph-connection = " << graph_names[graph_connection] << std::endl; cout << " \tnum-servers = " << num_servers << std::endl; cout << " \tnum-clients = " << num_clients << std::endl; cout << " \tservers-per-node = " << servers_per_node << std::endl; cout << " \tclients-per-node = " << clients_per_node << std::endl; cout << " \tserver-weight = " << server_weight << std::endl; cout << " \tclient-weight = " << client_weight << std::endl; cout << " \tclient-server-weight = " << client_server_weight << std::endl; cout << " \tserver-node-file = " << server_node_file << std::endl; cout << " \tclient-node-file = " << client_node_file << std::endl; cout << " \tverbose = " << debug_level << std::endl; cout << " \tlogfile = " << logfile << std::endl; cout << " ------------------------------------------- " << std::endl; } MPI_Barrier(MPI_COMM_WORLD); int *rank_map=(int*)malloc(sizeof(int) * npes); int *nid_map=(int*)malloc(sizeof(int) * npes); construct_graph( rank_map, nid_map, num_servers, num_clients, servers_per_node, clients_per_node, server_weight, client_weight, client_server_weight, graph_connection, 0); if (me == 0) { ofstream snf(server_node_file.c_str(), ios_base::out); ofstream cnf(client_node_file.c_str(), ios_base::out); for (i=0;i<npes;i++) { if (rank_map[i] < num_servers) snf << nid_map[i] << "\t" << i << "\t" << rank_map[i] << std::endl; } for (i=0;i<npes;i++) { if (rank_map[i] >= num_servers) cnf << nid_map[i] << "\t" << i << "\t" << rank_map[i] << std::endl; } snf.close(); cnf.close(); } MPI_Finalize(); return 0; }
int main(int argc, char **argv) { try { // Setup command line options Teuchos::CommandLineProcessor CLP; CLP.setDocString( "This example generates partitions the Cijk tensor for a lexicographic tree basis.\n"); int d = 3; CLP.setOption("dimension", &d, "Stochastic dimension"); int p = 5; CLP.setOption("order", &p, "Polynomial order"); double drop = 1.0e-12; CLP.setOption("drop", &drop, "Drop tolerance"); bool symmetric = true; CLP.setOption("symmetric", "asymmetric", &symmetric, "Use basis polynomials with symmetric PDF"); int level = 1; CLP.setOption("level", &level, "Level to partition"); bool save_3tensor = false; CLP.setOption("save_3tensor", "no-save_3tensor", &save_3tensor, "Save full 3tensor to file"); std::string file_3tensor = "Cijk.dat"; CLP.setOption("filename_3tensor", &file_3tensor, "Filename to store full 3-tensor"); // Parse arguments CLP.parse( argc, argv ); // Basis Array< RCP<const Stokhos::OneDOrthogPolyBasis<int,double> > > bases(d); const double alpha = 1.0; const double beta = symmetric ? 1.0 : 2.0 ; for (int i=0; i<d; i++) { bases[i] = Teuchos::rcp(new Stokhos::JacobiBasis<int,double>( p, alpha, beta, true)); } typedef Stokhos::LexographicLess< Stokhos::MultiIndex<int> > less_type; typedef Stokhos::TotalOrderBasis<int,double,less_type> basis_type; RCP<const basis_type> basis = Teuchos::rcp(new basis_type(bases, drop)); // Build LTB Cijk typedef Stokhos::LTBSparse3Tensor<int,double> Cijk_LTB_type; typedef Cijk_LTB_type::CijkNode node_type; Teuchos::RCP<Cijk_LTB_type> Cijk = computeTripleProductTensorLTB(*basis, symmetric); int sz = basis->size(); std::cout << "basis size = " << sz << " num nonzero Cijk entries = " << Cijk->num_entries() << std::endl; // Setup partitions Teuchos::Array< Teuchos::RCP<const node_type> > node_stack; Teuchos::Array< int > index_stack; node_stack.push_back(Cijk->getHeadNode()); index_stack.push_back(0); Teuchos::RCP<const node_type> node; int child_index; Teuchos::Array< Teuchos::RCP<const node_type> > partition_stack; int my_level = 0; while (node_stack.size() > 0) { node = node_stack.back(); child_index = index_stack.back(); // Leaf -- If we got here, just push this node into the partitions if (node->is_leaf) { partition_stack.push_back(node); node_stack.pop_back(); index_stack.pop_back(); --my_level; } // Put nodes into partition if level matches else if (my_level == level) { partition_stack.push_back(node); node_stack.pop_back(); index_stack.pop_back(); --my_level; } // More children to process -- process them first else if (child_index < node->children.size()) { ++index_stack.back(); node = node->children[child_index]; node_stack.push_back(node); index_stack.push_back(0); ++my_level; } // No more children else { node_stack.pop_back(); index_stack.pop_back(); --my_level; } } // Print statistics int max_i_size = 0, max_j_size = 0, max_k_size = 0; for (int part=0; part<partition_stack.size(); ++part) { node = partition_stack[part]; if (node->i_size > max_i_size) max_i_size = node->i_size; if (node->j_size > max_j_size) max_j_size = node->j_size; if (node->k_size > max_k_size) max_k_size = node->k_size; } std::cout << "num partitions = " << partition_stack.size() << std::endl << "max i size = " << max_i_size << std::endl << "max j size = " << max_j_size << std::endl << "max k size = " << max_k_size << std::endl; // Build flat list of (i,j,k,part) tuples typedef Stokhos::ProductBasisUtils::Cijk_1D_Iterator<int> Cijk_Iterator; Teuchos::Array< Teuchos::Array<int> > tuples; for (int part=0; part<partition_stack.size(); ++part) { node = partition_stack[part]; node_stack.push_back(node); index_stack.push_back(0); while (node_stack.size() > 0) { node = node_stack.back(); child_index = index_stack.back(); // Leaf -- store (i,j,k,part) tuples if (node->is_leaf) { Cijk_Iterator cijk_iterator(node->p_i, node->p_j, node->p_k, symmetric); bool more = true; while (more) { Teuchos::Array<int> t(4); int I = node->i_begin + cijk_iterator.i; int J = node->j_begin + cijk_iterator.j; int K = node->k_begin + cijk_iterator.k; t[0] = I; t[1] = J; t[2] = K; t[3] = part; tuples.push_back(t); more = cijk_iterator.increment(); } node_stack.pop_back(); index_stack.pop_back(); } // More children to process -- process them first else if (child_index < node->children.size()) { ++index_stack.back(); node = node->children[child_index]; node_stack.push_back(node); index_stack.push_back(0); } // No more children else { node_stack.pop_back(); index_stack.pop_back(); } } } // Print full 3-tensor to file if (save_3tensor) { std::ofstream cijk_file(file_3tensor.c_str()); cijk_file.precision(14); cijk_file.setf(std::ios::scientific); cijk_file << "i, j, k, part" << std::endl; for (int i=0; i<tuples.size(); ++i) { cijk_file << tuples[i][0] << ", " << tuples[i][1] << ", " << tuples[i][2] << ", " << tuples[i][3] << std::endl; } cijk_file.close(); } } catch (std::exception& e) { std::cout << e.what() << std::endl; } return 0; }
int main (int argc, char *argv[]) { int rc; // command-line arguments int retries = 0; int sig = 0; int timeout = 1000; log_level debug_level = LOG_ERROR; string logfile(""); nssi_service svc; char my_url[NSSI_URL_LEN]; std::string server_url(""); char server_str[NSSI_URL_LEN]; std::string contact_file(""); /* the file where the server's url should be written */ try { Teuchos::CommandLineProcessor parser; // init parser parser.setDocString("Kill an NSSI server"); parser.setOption("verbose", (int *)(&debug_level), "Debug level."); parser.setOption("logfile", &logfile, "Path to file for debug statements"); parser.setOption("server-url", &server_url, "URL of NSSI service"); parser.setOption("contact-file", &contact_file, "Where to read the server's URL"); parser.setOption("timeout", &timeout, "Timout for contacting services (ms)"); parser.setOption("retries", &retries, "Number of times to retry before exiting"); parser.setOption("sig", &sig, "Signal to use for the kill command"); parser.recogniseAllOptions(); parser.throwExceptions(); Teuchos::CommandLineProcessor::EParseCommandLineReturn parseReturn= parser.parse( argc, argv ); if( parseReturn == Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED ) { return 0; } if( parseReturn != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL ) { return 1; // Error! } } catch (...) { exit(-1); } /* initialize the logger */ logger_init(debug_level, logfile.c_str()); if (server_url.c_str()[0]=='\0') { sleep(1); log_debug(debug_level, "reading URL from file"); read_contact_info(contact_file.c_str(), server_str, NSSI_URL_LEN); } else { log_debug(debug_level, "using URL from command-line"); strcpy(server_str, server_url.c_str()); } nssi_rpc_init(NSSI_DEFAULT_TRANSPORT, NSSI_DEFAULT_ENCODE, NULL); nssi_get_url(NSSI_DEFAULT_TRANSPORT, my_url, NSSI_URL_LEN); // sleep(1); log_info(debug_level, "\nTrying to get service at %s", server_str); rc=nssi_get_service(NSSI_DEFAULT_TRANSPORT, server_str, timeout, &svc); if (rc != NSSI_OK) { log_error(admin_debug_level, "could not get svc description: %s", nssi_err_str(rc)); return rc; } rc = kill_svc(&svc, sig, timeout); if (rc == NSSI_ETIMEDOUT) { fprintf(stderr, "Timed out trying to kill (%s)\n", server_url.c_str()); return rc; } else if (rc != NSSI_OK) { log_error(admin_debug_level, "failed to kill service: %s", nssi_err_str(rc)); return rc; } nssi_rpc_fini(NSSI_DEFAULT_TRANSPORT); return 0; }
int main (int argc, char *argv[]) { Teuchos::CommandLineProcessor clp; clp.setDocString("Tacho::DenseMatrixBase examples on Pthreads execution space.\n"); int nthreads = 0; clp.setOption("nthreads", &nthreads, "Number of threads"); // int numa = 0; // clp.setOption("numa", &numa, "Number of numa node"); // int core_per_numa = 0; // clp.setOption("core-per-numa", &core_per_numa, "Number of cores per numa node"); bool verbose = false; clp.setOption("enable-verbose", "disable-verbose", &verbose, "Flag for verbose printing"); std::string file_input = "test.mtx"; clp.setOption("file-input", &file_input, "Input file (MatrixMarket SPD matrix)"); int treecut = 0; clp.setOption("treecut", &treecut, "Level to cut tree from bottom"); int prunecut = 0; clp.setOption("prunecut", &prunecut, "Level to prune tree from bottom"); int fill_level = -1; clp.setOption("fill-level", &fill_level, "Fill level"); int rows_per_team = 4096; clp.setOption("rows-per-team", &rows_per_team, "Workset size"); int max_concurrency = 250000; clp.setOption("max-concurrency", &max_concurrency, "Max number of concurrent tasks"); int max_task_dependence = 3; clp.setOption("max-task-dependence", &max_task_dependence, "Max number of task dependence"); int team_size = 1; clp.setOption("team-size", &team_size, "Team size"); int nrhs = 1; clp.setOption("nrhs", &team_size, "# of right hand side"); int mb = 0; clp.setOption("mb", &mb, "Dense nested blocks size"); int nb = 1; clp.setOption("nb", &nb, "Column block size of right hand side"); clp.recogniseAllOptions(true); clp.throwExceptions(false); Teuchos::CommandLineProcessor::EParseCommandLineReturn r_parse= clp.parse( argc, argv ); if (r_parse == Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED) return 0; if (r_parse != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL ) return -1; int r_val = 0; { exec_space::initialize(nthreads); #if (defined(HAVE_SHYLUTACHO_SCOTCH) && defined(HAVE_SHYLUTACHO_CHOLMOD)) r_val = exampleCholSuperNodesByBlocks<exec_space> (file_input, treecut, prunecut, fill_level, rows_per_team, max_concurrency, max_task_dependence, team_size, nrhs, mb, nb, verbose); #else r_val = -1; std::cout << "Scotch or Cholmod is NOT configured in Trilinos" << std::endl; #endif exec_space::finalize(); } return r_val; }
int main (int argc, char *argv[]) { Teuchos::CommandLineProcessor clp; clp.setDocString("This example program show blockwise information on Kokkos::Serial execution space.\n"); bool verbose = false; clp.setOption("enable-verbose", "disable-verbose", &verbose, "Flag for verbose printing"); string file_input = "test.mtx"; clp.setOption("file-input", &file_input, "Input file (MatrixMarket SPD matrix)"); int fill_level = 0; clp.setOption("fill-level", &fill_level, "Fill level"); int league_size = 1; clp.setOption("league-size", &league_size, "League size"); int treecut = 15; clp.setOption("treecut", &treecut, "Level to cut tree from bottom"); int minblksize = 0; clp.setOption("minblksize", &minblksize, "Minimum block size for internal reordering"); int prunecut = 0; clp.setOption("prunecut", &prunecut, "Level to prune the tree from bottom"); int seed = 0; clp.setOption("seed", &seed, "Seed for random number generator in graph partition"); int histogram_size = 0; clp.setOption("histogram-size", &histogram_size, "Histogram size"); clp.recogniseAllOptions(true); clp.throwExceptions(false); Teuchos::CommandLineProcessor::EParseCommandLineReturn r_parse= clp.parse( argc, argv ); if (r_parse == Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED) return 0; if (r_parse != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL ) return -1; int r_val = 0; { Kokkos::initialize(); r_val = exampleStatByBlocks <value_type,ordinal_type,size_type,exec_space,void> (file_input, treecut, minblksize, prunecut, seed, fill_level, league_size, histogram_size, verbose); Kokkos::finalize(); } return r_val; }
int main(int argc, char *argv[]) { typedef int IndexType; typedef double ValueType; typedef cusp::device_memory MemorySpace; //typedef cusp::row_major Orientation; bool success = true; bool verbose = false; try { // Setup command line options Teuchos::CommandLineProcessor CLP; CLP.setDocString("This test performance of block multiply routines.\n"); IndexType n = 32; CLP.setOption("n", &n, "Number of mesh points in the each direction"); IndexType nrhs_begin = 32; CLP.setOption("begin", &nrhs_begin, "Staring number of right-hand-sides"); IndexType nrhs_end = 512; CLP.setOption("end", &nrhs_end, "Ending number of right-hand-sides"); IndexType nrhs_step = 32; CLP.setOption("step", &nrhs_step, "Increment in number of right-hand-sides"); IndexType nits = 10; CLP.setOption("nits", &nits, "Number of multiply iterations"); int device_id = 0; CLP.setOption("device", &device_id, "CUDA device ID"); CLP.parse( argc, argv ); // Set CUDA device cudaSetDevice(device_id); cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte); // create 3D Poisson problem cusp::csr_matrix<IndexType, ValueType, MemorySpace> A; cusp::gallery::poisson27pt(A, n, n, n); std::cout << "nrhs , num_rows , num_entries , row_time , row_gflops , " << "col_time , col_gflops" << std::endl; for (IndexType nrhs = nrhs_begin; nrhs <= nrhs_end; nrhs += nrhs_step) { double flops = 2.0 * static_cast<double>(A.num_entries) * static_cast<double>(nrhs); // test row-major storage cusp::array2d<ValueType, MemorySpace, cusp::row_major> x_row( A.num_rows, nrhs, 1); cusp::array2d<ValueType, MemorySpace, cusp::row_major> y_row( A.num_rows, nrhs, 0); cusp::detail::timer row_timer; row_timer.start(); for (IndexType iter=0; iter<nits; ++iter) { cusp::MVmultiply(A, x_row, y_row); } cudaDeviceSynchronize(); double row_time = row_timer.seconds_elapsed() / nits; double row_gflops = 1.0e-9 * flops / row_time; // test column-major storage cusp::array2d<ValueType, MemorySpace, cusp::column_major> x_col( A.num_rows, nrhs, 1); cusp::array2d<ValueType, MemorySpace, cusp::column_major> y_col( A.num_rows, nrhs, 0); cusp::detail::timer col_timer; col_timer.start(); for (IndexType iter=0; iter<nits; ++iter) { cusp::MVmultiply(A, x_col, y_col); } cudaDeviceSynchronize(); double col_time = col_timer.seconds_elapsed() / nits; double col_gflops = 1.0e-9 * flops / col_time; std::cout << nrhs << " , " << A.num_rows << " , " << A.num_entries << " , " << row_time << " , " << row_gflops << " , " << col_time << " , " << col_gflops << std::endl; } } TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success); if (success) return 0; return -1; }
int main(int argc, char *argv[]) { int np=1, rank=0; int splitrank, splitsize; int rc = 0; nssi_service multicast_svc[2]; int transport_index=-1; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &np); MPI_Barrier(MPI_COMM_WORLD); Teuchos::oblackholestream blackhole; std::ostream &out = ( rank == 0 ? std::cout : blackhole ); struct multicast_args args; const int num_io_methods = 6; const int io_method_vals[] = { MULTICAST_EMPTY_REQUEST_SYNC, MULTICAST_EMPTY_REQUEST_ASYNC, MULTICAST_GET_SYNC, MULTICAST_GET_ASYNC, MULTICAST_PUT_SYNC, MULTICAST_PUT_ASYNC}; const char * io_method_names[] = { "empty-request-sync", "empty-request-async", "get-sync", "get-async", "put-sync", "put-async"}; const int nssi_transport_list[] = { NSSI_RPC_PTL, NSSI_RPC_PTL, NSSI_RPC_IB, NSSI_RPC_IB, NSSI_RPC_GEMINI, NSSI_RPC_GEMINI, NSSI_RPC_BGPDCMF, NSSI_RPC_BGPDCMF, NSSI_RPC_BGQPAMI, NSSI_RPC_BGQPAMI, NSSI_RPC_MPI}; const int num_nssi_transports = 11; const int nssi_transport_vals[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 }; const char * nssi_transport_names[] = { "portals", "ptl", "infiniband", "ib", "gemini", "gni", "bgpdcmf", "dcmf", "bgqpami", "pami", "mpi" }; // Initialize arguments args.transport=NSSI_DEFAULT_TRANSPORT; args.delay = 1; args.io_method = MULTICAST_EMPTY_REQUEST_SYNC; args.debug_level = LOG_WARN; args.num_trials = 1; args.num_reqs = 1; args.len = 1; args.result_file_mode = "a"; args.result_file = ""; args.url_file[0] = ""; args.url_file[1] = ""; args.logfile = ""; args.client_flag = true; args.server_flag = true; args.timeout = 500; args.num_retries = 5; args.validate_flag = true; args.server_url[0] = ""; args.server_url[1] = ""; bool success = true; /** * We make extensive use of the \ref Teuchos::CommandLineProcessor for command-line * options to control the behavior of the test code. To evaluate performance, * the "num-trials", "num-reqs", and "len" options control the amount of data transferred * between client and server. The "io-method" selects the type of data transfer. The * server-url specifies the URL of the server. If running as a server, the server-url * provides a recommended URL when initializing the network transport. */ try { //out << Teuchos::Teuchos_Version() << std::endl << std::endl; // Creating an empty command line processor looks like: Teuchos::CommandLineProcessor parser; parser.setDocString( "This example program demonstrates a simple data-transfer service " "built using the NEtwork Scalable Service Interface (Nessie)." ); /* To set and option, it must be given a name and default value. Additionally, each option can be given a help std::string. Although it is not necessary, a help std::string aids a users comprehension of the acceptable command line arguments. Some examples of setting command line options are: */ parser.setOption("delay", &args.delay, "time(s) for client to wait for server to start" ); parser.setOption("timeout", &args.timeout, "time(ms) to wait for server to respond" ); parser.setOption("server", "no-server", &args.server_flag, "Run the server" ); parser.setOption("client", "no-client", &args.client_flag, "Run the client"); parser.setOption("len", &args.len, "The number of structures in an input buffer"); parser.setOption("debug",(int*)(&args.debug_level), "Debug level"); parser.setOption("logfile", &args.logfile, "log file"); parser.setOption("num-trials", &args.num_trials, "Number of trials (experiments)"); parser.setOption("num-reqs", &args.num_reqs, "Number of reqs/trial"); parser.setOption("result-file", &args.result_file, "Where to store results"); parser.setOption("result-file-mode", &args.result_file_mode, "Write mode for the result"); parser.setOption("server-url-1", &args.server_url[0], "URL client uses to find the server 1"); parser.setOption("server-url-2", &args.server_url[1], "URL client uses to find the server 2"); parser.setOption("server-url-file-1", &args.url_file[0], "File that has URL client uses to find server 1"); parser.setOption("server-url-file-2", &args.url_file[1], "File that has URL client uses to find server 2"); parser.setOption("validate", "no-validate", &args.validate_flag, "Validate the data"); // Set an enumeration command line option for the io_method parser.setOption("io-method", &args.io_method, num_io_methods, io_method_vals, io_method_names, "I/O Methods for the example: \n" "\t\t\tempty-request-sync : Send an empty request - synchronous\n" "\t\t\tempty-request-async: Send an empty request - asynchronous\n" "\t\t\tget-sync : Servers pull data from client - synchronous\n" "\t\t\tget-async: Servers pull data from client - asynchronous\n" "\t\t\tput-sync : Servers push data from client - synchronous\n" "\t\t\tput-async: Servers push data from client - asynchronous" ); // Set an enumeration command line option for the NNTI transport parser.setOption("transport", &transport_index, num_nssi_transports, nssi_transport_vals, nssi_transport_names, "NSSI transports (not all are available on every platform): \n" "\t\t\tportals|ptl : Cray or Schutt\n" "\t\t\tinfiniband|ib : libibverbs\n" "\t\t\tgemini|gni : Cray libugni (Gemini or Aries)\n" "\t\t\tbgpdcmf|dcmf : IBM BG/P DCMF\n" "\t\t\tbgqpami|pami : IBM BG/Q PAMI\n" "\t\t\tmpi : isend/irecv implementation\n" ); /* There are also two methods that control the behavior of the command line processor. First, for the command line processor to allow an unrecognized a command line option to be ignored (and only have a warning printed), use: */ parser.recogniseAllOptions(true); /* Second, by default, if the parser finds a command line option it doesn't recognize or finds the --help option, it will throw an std::exception. If you want prevent a command line processor from throwing an std::exception (which is important in this program since we don't have an try/catch around this) when it encounters a unrecognized option or help is printed, use: */ parser.throwExceptions(false); /* We now parse the command line where argc and argv are passed to the parse method. Note that since we have turned off std::exception throwing above we had better grab the return argument so that we can see what happened and act accordingly. */ Teuchos::CommandLineProcessor::EParseCommandLineReturn parseReturn= parser.parse( argc, argv ); if( parseReturn == Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED ) { return 0; } if( parseReturn != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL ) { return 1; // Error! } // Here is where you would use these command line arguments but for this example program // we will just print the help message with the new values of the command-line arguments. //if (rank == 0) // out << "\nPrinting help message with new values of command-line arguments ...\n\n"; //parser.printHelpMessage(argv[0],out); } TEUCHOS_STANDARD_CATCH_STATEMENTS(true,std::cerr,success); log_debug(LOG_ALL, "transport_index=%d", transport_index); if (transport_index > -1) { args.transport =nssi_transport_list[transport_index]; args.transport_name=std::string(nssi_transport_names[transport_index]); } args.io_method_name=io_method_names[args.io_method]; log_debug(args.debug_level, "%d: Finished processing arguments", rank); if (!success) { MPI_Abort(MPI_COMM_WORLD, 1); } if (!args.server_flag && args.client_flag) { /* initialize logger */ if (args.logfile.empty()) { logger_init(args.debug_level, NULL); } else { char fn[1024]; sprintf(fn, "%s.client.%03d.log", args.logfile.c_str(), rank); logger_init(args.debug_level, fn); } } else if (args.server_flag && !args.client_flag) { /* initialize logger */ if (args.logfile.empty()) { logger_init(args.debug_level, NULL); } else { char fn[1024]; sprintf(fn, "%s.server.%03d.log", args.logfile.c_str(), rank); logger_init(args.debug_level, fn); } } else if (args.server_flag && args.client_flag) { /* initialize logger */ if (args.logfile.empty()) { logger_init(args.debug_level, NULL); } else { char fn[1024]; sprintf(fn, "%s.%03d.log", args.logfile.c_str(), rank); logger_init(args.debug_level, fn); } } log_level debug_level = args.debug_level; // Communicator used for both client and server (may split if using client and server) MPI_Comm comm; log_debug(debug_level, "%d: Starting multicast-service test", rank); /** * Since this test can be run as a server, client, or both, we need to play some fancy * MPI games to get the communicators working correctly. If we're executing as both * a client and a server, we split the communicator so that the client thinks its * running by itself. */ if (args.client_flag && args.server_flag) { if (np < 3) { log_error(debug_level, "Must use at least 3 MPI processes for client and server mode"); MPI_Abort(MPI_COMM_WORLD, -1); } // Split the communicators. Processors with color=0 are servers. int color = ((rank == 0)||(rank == 1)) ? 0 : 1; // two server MPI_Comm_split(MPI_COMM_WORLD, color, rank, &comm); MPI_Comm_rank(comm, &splitrank); MPI_Comm_size(comm, &splitsize); // std::cout << "rank=" << rank << "/" << np << ", color=" << color << // ", new_rank=" << newrank << "/" << newsize << std::endl << std::endl; // // std::cout << "my_url=" << my_url << ", server_url=" << args.server_url << std::endl; } else { MPI_Comm_dup(MPI_COMM_WORLD, &comm); } /** * Initialize the Nessie interface by specifying a transport, encoding scheme, and a * recommended URL. \ref NSSI_DEFAULT_TRANSPORT is usually the best choice, since it * is often the case that only one type of transport exists on a particular platform. * Currently supported transports are \ref NSSI_RPC_PTL, \ref NSSI_RPC_GNI, and * \ref NSSI_RPC_IB. We only support one type of encoding scheme so NSSI_DEFAULT_ENCODE * should always be used for the second argument. The URL can be specified (as we did for * the server, or NULL (as we did for the client). This is a recommended value. Use the * \ref nssi_get_url function to find the actual value. */ if (args.server_flag && !args.server_url[rank].empty()) { // use the server URL as suggested URL nssi_rpc_init((nssi_rpc_transport)args.transport, NSSI_DEFAULT_ENCODE, args.server_url[rank].c_str()); } else { nssi_rpc_init((nssi_rpc_transport)args.transport, NSSI_DEFAULT_ENCODE, NULL); } // Get the Server URL std::string my_url(NSSI_URL_LEN, '\0'); nssi_get_url((nssi_rpc_transport)args.transport, &my_url[0], NSSI_URL_LEN); // Broadcast the server URL to all the clients args.server_url[0].resize(NSSI_URL_LEN, '\0'); args.server_url[1].resize(NSSI_URL_LEN, '\0'); if (args.server_flag && args.client_flag) { args.server_url[0] = my_url; MPI_Bcast(&args.server_url[0][0], args.server_url[0].size(), MPI_CHAR, 0, MPI_COMM_WORLD); args.server_url[1] = my_url; MPI_Bcast(&args.server_url[1][0], args.server_url[1].size(), MPI_CHAR, 1, MPI_COMM_WORLD); } else if (!args.server_flag && args.client_flag){ if (args.server_url[0].empty()) { // check to see if we're supposed to get the URL from a file if (!args.url_file[0].empty()) { // Fetch the server URL from a file sleep(1); log_debug(debug_level, "Reading from file %s", args.url_file[0].c_str()); std::ifstream urlfile (args.url_file[0].c_str()); if (urlfile.is_open()) { if (urlfile.good()) getline(urlfile, args.server_url[0]); } else { log_error(debug_level, "Failed to open server_url_file=%s", args.url_file[0].c_str()); exit(1); } urlfile.close(); log_debug(debug_level, "URL = %s", args.server_url[0].c_str()); } else { log_error(debug_level, "Need to set --server-url-1=[ADDR] or --server-url-file-1=[PATH]"); } } if (args.server_url[1].empty()) { // check to see if we're supposed to get the URL from a file if (!args.url_file[1].empty()) { // Fetch the server URL from a file sleep(1); log_debug(debug_level, "Reading from file %s", args.url_file[1].c_str()); std::ifstream urlfile (args.url_file[1].c_str()); if (urlfile.is_open()) { if (urlfile.good()) getline(urlfile, args.server_url[1]); } else { log_error(debug_level, "Failed to open server_url_file=%s", args.url_file[1].c_str()); exit(1); } urlfile.close(); log_debug(debug_level, "URL = %s", args.server_url[1].c_str()); } else { log_error(debug_level, "Need to set --server-url-1=[ADDR] or --server-url-file-1=[PATH]"); } } } else if (args.server_flag && !args.client_flag) { args.server_url[0] = my_url; // If the url_file value is set, write the url to a file if (!args.url_file[0].empty()) { std::ofstream urlfile (args.url_file[0].c_str()); if (urlfile.is_open()) { urlfile << args.server_url[0].c_str() << std::endl; } urlfile.close(); log_debug(debug_level, "Wrote url to file %s", args.url_file[0].c_str()); } args.server_url[1] = my_url; // If the url_file value is set, write the url to a file if (!args.url_file[1].empty()) { std::ofstream urlfile (args.url_file[1].c_str()); if (urlfile.is_open()) { urlfile << args.server_url[1].c_str() << std::endl; } urlfile.close(); log_debug(debug_level, "Wrote url to file %s", args.url_file[1].c_str()); } } // Set the debug level for the multicast service. multicast_debug_level = args.debug_level; // Print the arguments after they've all been set. print_args(out, args, "%"); //------------------------------------------------------------------------------ /** If we're running this job with a server, the server always executes on nodes 0 and 1. * In this example, the server is two process. */ if (args.server_flag && ((rank == 0)|(rank == 1))) { rc = multicast_server_main(args, comm); log_debug(debug_level, "Server is finished"); } // ------------------------------------------------------------------------------ /** The parallel client will execute this branch. The root node, nodes 0 and 1, of the client connects * connects with the server, using the \ref nssi_get_service function. Then the root * broadcasts the service description to the other clients before starting the main * loop of the client code by calling \ref multicast_client_main. */ else { int i; int client_rank; // get rank within the client communicator MPI_Comm_rank(comm, &client_rank); nssi_init((nssi_rpc_transport)args.transport); // Only one process needs to connect to the service // TODO: Make get_service a collective call (some transports do not need a connection) //if (client_rank == 0) { { sleep(args.delay); // give server time to get started // connect to remote server for (i=0; i < args.num_retries; i++) { log_debug(debug_level, "Try to connect to server: attempt #%d", i); rc=nssi_get_service((nssi_rpc_transport)args.transport, args.server_url[0].c_str(), args.timeout, &multicast_svc[0]); if (rc == NSSI_OK) break; else if (rc != NSSI_ETIMEDOUT) { log_error(multicast_debug_level, "could not get svc description: %s", nssi_err_str(rc)); break; } } // connect to remote server for (i=0; i < args.num_retries; i++) { log_debug(debug_level, "Try to connect to server: attempt #%d", i); rc=nssi_get_service((nssi_rpc_transport)args.transport, args.server_url[1].c_str(), args.timeout, &multicast_svc[1]); if (rc == NSSI_OK) break; else if (rc != NSSI_ETIMEDOUT) { log_error(multicast_debug_level, "could not get svc description: %s", nssi_err_str(rc)); break; } } } //MPI_Bcast(&rc, 1, MPI_INT, 0, comm); if (rc == NSSI_OK) { if (client_rank == 0) log_debug(debug_level, "Connected to service on attempt %d\n", i); // Broadcast the service description to the other clients //log_debug(multicast_debug_level, "Bcasting svc to other clients"); //MPI_Bcast(&multicast_svc, sizeof(nssi_service), MPI_BYTE, 0, comm); log_debug(debug_level, "Starting client main"); // Start the client code multicast_client_main(args, &multicast_svc[0], comm); MPI_Barrier(comm); // Tell one of the clients to kill the server if (client_rank == 0) { log_debug(debug_level, "%d: Halting multicast service", rank); rc = nssi_kill(&multicast_svc[0], 0, 5000); rc = nssi_kill(&multicast_svc[1], 0, 5000); } } else { if (client_rank == 0) log_error(debug_level, "Failed to connect to service after %d attempts: ABORTING", i); success = false; //MPI_Abort(MPI_COMM_WORLD, -1); } nssi_fini((nssi_rpc_transport)args.transport); } log_debug(debug_level, "%d: clean up nssi", rank); MPI_Barrier(MPI_COMM_WORLD); // Clean up nssi_rpc rc = nssi_rpc_fini((nssi_rpc_transport)args.transport); if (rc != NSSI_OK) log_error(debug_level, "Error in nssi_rpc_fini"); log_debug(debug_level, "%d: MPI_Finalize()", rank); MPI_Finalize(); logger_fini(); if(success && (rc == NSSI_OK)) out << "\nEnd Result: TEST PASSED" << std::endl; else out << "\nEnd Result: TEST FAILED" << std::endl; return ((success && (rc==NSSI_OK)) ? 0 : 1 ); }
int main(int argc, char *argv[]) { // Create output stream. (Handy for multicore output.) auto out = Teuchos::VerboseObjectBase::getDefaultOStream(); Teuchos::GlobalMPISession session(&argc, &argv, NULL); auto comm = Teuchos::DefaultComm<int>::getComm(); // Wrap the whole code in a big try-catch-statement. bool success = true; try { // ========================================================================= // Handle command line arguments. // Boost::program_options is somewhat more complete here (e.g. you can // specify options without the "--" syntax), but it isn't less complicated // to use. Stick with Teuchos for now. Teuchos::CommandLineProcessor myClp; myClp.setDocString( "Numerical parameter continuation for nonlinear Schr\"odinger equations.\n" ); std::string xmlInputPath = ""; myClp.setOption("xml-input-file", &xmlInputPath, "XML file containing the parameter list", true ); // Print warning for unrecognized arguments and make sure to throw an // exception if something went wrong. //myClp.throwExceptions(false); //myClp.recogniseAllOptions ( true ); // Finally, parse the command line. myClp.parse(argc, argv); // Retrieve Piro parameter list from given file. std::shared_ptr<Teuchos::ParameterList> piroParams( new Teuchos::ParameterList() ); Teuchos::updateParametersFromXmlFile( xmlInputPath, Teuchos::rcp(piroParams).ptr() ); // ======================================================================= // Extract the location of input and output files. const Teuchos::ParameterList outputList = piroParams->sublist("Output", true); // Set default directory to be the directory of the XML file itself const std::string xmlDirectory = xmlInputPath.substr(0, xmlInputPath.find_last_of( "/" ) + 1); // By default, take the current directory. std::string prefix = "./"; if (!xmlDirectory.empty()) prefix = xmlDirectory + "/"; const std::string outputDirectory = prefix; const std::string contFilePath = prefix + outputList.get<std::string>("Continuation data file name"); Teuchos::ParameterList & inputDataList = piroParams->sublist("Input", true); const std::string inputExodusFile = prefix + inputDataList.get<std::string>("File"); const int step = inputDataList.get<int>("Initial Psi Step"); //const bool useBordering = piroParams->get<bool>("Bordering"); // ======================================================================= // Read the data from the file. auto mesh = std::make_shared<Nosh::StkMesh>( Teuchos::get_shared_ptr(comm), inputExodusFile, step ); // Cast the data into something more accessible. auto psi = mesh->getComplexVector("psi"); //psi->Random(); // Set the output directory for later plotting with this. std::stringstream outputFile; outputFile << outputDirectory << "/solution.e"; mesh->openOutputChannel(outputFile.str()); // Create a parameter map from the initial parameter values. Teuchos::ParameterList initialParameterValues = piroParams->sublist("Initial parameter values", true); // Check if we need to interpret the time value stored in the file // as a parameter. const std::string & timeName = piroParams->get<std::string>("Interpret time as", ""); if (!timeName.empty()) { initialParameterValues.set(timeName, mesh->getTime()); } // Explicitly set the initial parameter value for this list. const std::string & paramName = piroParams->sublist( "LOCA" ) .sublist( "Stepper" ) .get<std::string>("Continuation Parameter"); *out << "Setting the initial parameter value of \"" << paramName << "\" to " << initialParameterValues.get<double>(paramName) << "." << std::endl; piroParams->sublist( "LOCA" ) .sublist( "Stepper" ) .set("Initial Value", initialParameterValues.get<double>(paramName)); // Set the thickness field. auto thickness = std::make_shared<Nosh::ScalarField::Constant>(*mesh, 1.0); // Some alternatives for the positive-definite operator. // (a) -\Delta (Laplace operator with Neumann boundary) //const std::shared_ptr<Nosh::ParameterMatrix::Virtual> matrixBuilder = // rcp(new Nosh::ParameterMatrix::Laplace(mesh, thickness)); // (b) (-i\nabla-A)^2 (Kinetic energy of a particle in magnetic field) // (b1) 'A' explicitly given in file. const double mu = initialParameterValues.get<double>("mu"); auto mvp = std::make_shared<Nosh::VectorField::ExplicitValues>(*mesh, "A", mu); //const std::shared_ptr<Nosh::ParameterMatrix::Virtual> keoBuilder( // new Nosh::ParameterMatrix::Keo(mesh, thickness, mvp) // ); //const std::shared_ptr<Nosh::ParameterMatrix::Virtual> DKeoDPBuilder( // new Nosh::ParameterMatrix::DKeoDP(mesh, thickness, mvp, "mu") // ); // (b2) 'A' analytically given (here with constant curl). // Optionally add a rotation axis u. This is important // if continuation happens as a rotation of the vector // field around an axis. //const std::shared_ptr<DoubleVector> b = rcp(new DoubleVector(3)); //std::shared_ptr<Teuchos::SerialDenseVector<int,double> > u = Teuchos::null; //if ( piroParams->isSublist("Rotation vector") ) //{ // u = rcp(new Teuchos::SerialDenseVector<int,double>(3)); // Teuchos::ParameterList & rotationVectorList = // piroParams->sublist( "Rotation vector", false ); // (*u)[0] = rotationVectorList.get<double>("x"); // (*u)[1] = rotationVectorList.get<double>("y"); // (*u)[2] = rotationVectorList.get<double>("z"); //} //std::shared_ptr<Nosh::VectorField::Virtual> mvp = // rcp(new Nosh::VectorField::ConstantCurl(mesh, b, u)); //const std::shared_ptr<Nosh::ParameterMatrix::Virtual> matrixBuilder = // rcp(new Nosh::ParameterMatrix::Keo(mesh, thickness, mvp)); // (b3) 'A' analytically given in a class you write yourself, derived // from Nosh::ParameterMatrix::Virtual. // [...] // // Setup the scalar potential V. // (a) A constant potential. //std::shared_ptr<Nosh::ScalarField::Virtual> sp = //rcp(new Nosh::ScalarField::Constant(*mesh, -1.0)); //const double T = initialParameterValues.get<double>("T"); // (b) With explicit values. //std::shared_ptr<Nosh::ScalarField::Virtual> sp = //rcp(new Nosh::ScalarField::ExplicitValues(*mesh, "V")); // (c) One you built yourself by deriving from Nosh::ScalarField::Virtual. auto sp = std::make_shared<MyScalarField>(mesh); const double g = initialParameterValues.get<double>("g"); // Finally, create the model evaluator. // This is the most important object in the whole stack. auto modelEvaluator = std::make_shared<Nosh::ModelEvaluator::Nls>( mesh, mvp, sp, g, thickness, psi, "mu" ); // Build the Piro model evaluator. It's used to hook up with // several different backends (NOX, LOCA, Rhythmos,...). std::shared_ptr<Thyra::ModelEvaluator<double>> piro; // Declare the eigensaver; it will be used only for LOCA solvers, though. std::shared_ptr<Nosh::SaveEigenData> glEigenSaver; // Switch by solver type. std::string & solver = piroParams->get<std::string>("Piro Solver"); if (solver == "NOX") { auto observer = std::make_shared<Nosh::Observer>(modelEvaluator); piro = std::make_shared<Piro::NOXSolver<double>>( Teuchos::rcp(piroParams), Teuchos::rcp(modelEvaluator), Teuchos::rcp(observer) ); } else if (solver == "LOCA") { auto observer = std::make_shared<Nosh::Observer>( modelEvaluator, contFilePath, piroParams->sublist("LOCA") .sublist("Stepper") .get<std::string>("Continuation Parameter") ); // Setup eigen saver. #ifdef HAVE_LOCA_ANASAZI bool computeEigenvalues = piroParams->sublist( "LOCA" ) .sublist( "Stepper" ) .get<bool>("Compute Eigenvalues"); if (computeEigenvalues) { Teuchos::ParameterList & eigenList = piroParams->sublist("LOCA") .sublist("Stepper") .sublist("Eigensolver"); std::string eigenvaluesFilePath = xmlDirectory + "/" + outputList.get<std::string> ( "Eigenvalues file name" ); glEigenSaver = std::make_shared<Nosh::SaveEigenData>( eigenList, modelEvaluator, eigenvaluesFilePath ); std::shared_ptr<LOCA::SaveEigenData::AbstractStrategy> glSaveEigenDataStrategy = glEigenSaver; eigenList.set("Save Eigen Data Method", "User-Defined"); eigenList.set("User-Defined Save Eigen Data Name", "glSaveEigenDataStrategy"); eigenList.set("glSaveEigenDataStrategy", glSaveEigenDataStrategy); } #endif // Get the solver. std::shared_ptr<Piro::LOCASolver<double>> piroLOCASolver( new Piro::LOCASolver<double>( Teuchos::rcp(piroParams), Teuchos::rcp(modelEvaluator), Teuchos::null //Teuchos::rcp(observer) ) ); // // Get stepper and inject it into the eigensaver. // std::shared_ptr<LOCA::Stepper> stepper = Teuchos::get_shared_ptr( // piroLOCASolver->getLOCAStepperNonConst() // ); //#ifdef HAVE_LOCA_ANASAZI // if (computeEigenvalues) // glEigenSaver->setLocaStepper(stepper); //#endif piro = piroLOCASolver; } #if 0 else if ( solver == "Turning Point" ) { std::shared_ptr<Nosh::Observer> observer; Teuchos::ParameterList & bifList = piroParams->sublist("LOCA").sublist("Bifurcation"); // Fetch the (approximate) null state. auto nullstateZ = mesh->getVector("null"); // Set the length normalization vector to be the initial null vector. TEUCHOS_ASSERT(nullstateZ); auto lengthNormVec = Teuchos::rcp(new NOX::Thyra::Vector(*nullstateZ)); //lengthNormVec->init(1.0); bifList.set("Length Normalization Vector", lengthNormVec); // Set the initial null vector. auto initialNullAbstractVec = Teuchos::rcp(new NOX::Thyra::Vector(*nullstateZ)); // initialNullAbstractVec->init(1.0); bifList.set("Initial Null Vector", initialNullAbstractVec); piro = std::make_shared<Piro::LOCASolver<double>>( Teuchos::rcp(piroParams), Teuchos::rcp(modelEvaluator), Teuchos::null //Teuchos::rcp(observer) ); } #endif else { TEUCHOS_TEST_FOR_EXCEPT_MSG( true, "Unknown solver type \"" << solver << "\"." ); } // ---------------------------------------------------------------------- // Now the setting of inputs and outputs. Thyra::ModelEvaluatorBase::InArgs<double> inArgs = piro->createInArgs(); inArgs.set_p( 0, piro->getNominalValues().get_p(0) ); // Set output arguments to evalModel call. Thyra::ModelEvaluatorBase::OutArgs<double> outArgs = piro->createOutArgs(); // Now solve the problem and return the responses. const Teuchos::RCP<Teuchos::Time> piroSolveTime = Teuchos::TimeMonitor::getNewTimer("Piro total solve time");; { Teuchos::TimeMonitor tm(*piroSolveTime); piro->evalModel(inArgs, outArgs); } // Manually release LOCA stepper. #ifdef HAVE_LOCA_ANASAZI if (glEigenSaver) glEigenSaver->releaseLocaStepper(); #endif // Print timing data. Teuchos::TimeMonitor::summarize(); } catch (Teuchos::CommandLineProcessor::HelpPrinted) { } catch (Teuchos::CommandLineProcessor::ParseError) { } TEUCHOS_STANDARD_CATCH_STATEMENTS(true, *out, success); return success ? EXIT_SUCCESS : EXIT_FAILURE; }
int main(int argc, char* argv[]) { int ierr = 0; try { double t, ta; int p = 2; int w = p+7; // Set up command line options Teuchos::CommandLineProcessor clp; clp.setDocString("This program tests the speed of various forward mode AD implementations for a single multiplication operation"); int nderiv = 10; clp.setOption("nderiv", &nderiv, "Number of derivative components"); int nloop = 1000000; clp.setOption("nloop", &nloop, "Number of loops"); // Parse options Teuchos::CommandLineProcessor::EParseCommandLineReturn parseReturn= clp.parse(argc, argv); if(parseReturn != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL) return 1; // Memory pool & manager Sacado::Fad::MemPoolManager<double> poolManager(10); Sacado::Fad::MemPool* pool = poolManager.getMemoryPool(nderiv); Sacado::Fad::DMFad<double>::setDefaultPool(pool); std::cout.setf(std::ios::scientific); std::cout.precision(p); std::cout << "Times (sec) for nderiv = " << nderiv << " nloop = " << nloop << ": " << std::endl; ta = do_time_analytic(nderiv, nloop); std::cout << "Analytic: " << std::setw(w) << ta << std::endl; t = do_time< FAD::TFad<10,double> >(nderiv, nloop); std::cout << "TFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << std::endl; t = do_time< FAD::Fad<double> >(nderiv, nloop); std::cout << "Fad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << std::endl; t = do_time< Sacado::Fad::SFad<double,10> >(nderiv, nloop); std::cout << "SFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << std::endl; t = do_time< Sacado::Fad::SLFad<double,10> >(nderiv, nloop); std::cout << "SLFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << std::endl; t = do_time< Sacado::Fad::DFad<double> >(nderiv, nloop); std::cout << "DFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << std::endl; t = do_time< Sacado::Fad::DMFad<double> >(nderiv, nloop); std::cout << "DMFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << std::endl; t = do_time< Sacado::ELRFad::SFad<double,10> >(nderiv, nloop); std::cout << "ELRSFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << std::endl; t = do_time< Sacado::ELRFad::SLFad<double,10> >(nderiv, nloop); std::cout << "ELRSLFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << std::endl; t = do_time< Sacado::ELRFad::DFad<double> >(nderiv, nloop); std::cout << "ELRDFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << std::endl; t = do_time< Sacado::CacheFad::DFad<double> >(nderiv, nloop); std::cout << "CacheFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << std::endl; t = do_time< Sacado::Fad::DVFad<double> >(nderiv, nloop); std::cout << "DVFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << std::endl; } catch (std::exception& e) { std::cout << e.what() << std::endl; ierr = 1; } catch (const char *s) { std::cout << s << std::endl; ierr = 1; } catch (...) { std::cout << "Caught unknown exception!" << std::endl; ierr = 1; } return ierr; }
int main(int argc, char *argv[]) { bool success = true; bool verbose = false; try { const size_t num_sockets = Kokkos::hwloc::get_available_numa_count(); const size_t num_cores_per_socket = Kokkos::hwloc::get_available_cores_per_numa(); const size_t num_threads_per_core = Kokkos::hwloc::get_available_threads_per_core(); // Setup command line options Teuchos::CommandLineProcessor CLP; CLP.setDocString( "This test performance of MP::Vector multiply routines.\n"); int nGrid = 32; CLP.setOption("n", &nGrid, "Number of mesh points in the each direction"); int nIter = 10; CLP.setOption("ni", &nIter, "Number of multiply iterations"); int ensemble_min = 4; CLP.setOption("emin", &ensemble_min, "Staring ensemble size"); int ensemble_max = 24; CLP.setOption("emax", &ensemble_max, "Stoping ensemble size"); int ensemble_step = 4; CLP.setOption("estep", &ensemble_step, "Ensemble increment"); #ifdef KOKKOS_HAVE_PTHREAD bool threads = true; CLP.setOption("threads", "no-threads", &threads, "Enable Threads device"); int num_cores = num_cores_per_socket * num_sockets; CLP.setOption("cores", &num_cores, "Number of CPU cores to use (defaults to all)"); int num_hyper_threads = num_threads_per_core; CLP.setOption("hyperthreads", &num_hyper_threads, "Number of hyper threads per core to use (defaults to all)"); #endif #ifdef KOKKOS_HAVE_CUDA bool cuda = true; CLP.setOption("cuda", "no-cuda", &cuda, "Enable Cuda device"); int device_id = 0; CLP.setOption("device", &device_id, "CUDA device ID"); #endif CLP.parse( argc, argv ); typedef int Ordinal; typedef double Scalar; #ifdef KOKKOS_HAVE_PTHREAD if (threads) { typedef Kokkos::Threads Device; Kokkos::Threads::initialize(num_cores*num_hyper_threads); std::cout << std::endl << "Threads performance with " << num_cores*num_hyper_threads << " threads:" << std::endl; performance_test_driver<Scalar,Ordinal,Device>( nGrid, nIter, ensemble_min, ensemble_max, ensemble_step); Kokkos::Threads::finalize(); } #endif #ifdef KOKKOS_HAVE_CUDA if (cuda) { typedef Kokkos::Cuda Device; Kokkos::HostSpace::execution_space::initialize(); Kokkos::Cuda::initialize(Kokkos::Cuda::SelectDevice(device_id)); cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, device_id); std::cout << std::endl << "CUDA performance for device " << device_id << " (" << deviceProp.name << "):" << std::endl; performance_test_driver<Scalar,Ordinal,Device>( nGrid, nIter, ensemble_min, ensemble_max, ensemble_step); Kokkos::HostSpace::execution_space::finalize(); Kokkos::Cuda::finalize(); } #endif } TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success); if (success) return 0; return -1; }
int main (int argc, char *argv[]) { Teuchos::CommandLineProcessor clp; clp.setDocString("This example program measure the performance of dense Herk on Kokkos::Threads execution space.\n"); int nthreads = 0; clp.setOption("nthreads", &nthreads, "Number of threads"); int numa = 0; clp.setOption("numa", &numa, "Number of numa node"); int core_per_numa = 0; clp.setOption("core-per-numa", &core_per_numa, "Number of cores per numa node"); int max_concurrency = 250000; clp.setOption("max-concurrency", &max_concurrency, "Max number of concurrent tasks"); int memory_pool_grain_size = 16; clp.setOption("memory-pool-grain-size", &memory_pool_grain_size, "Memorypool chunk size (12 - 16)"); int mkl_nthreads = 1; clp.setOption("mkl-nthreads", &mkl_nthreads, "MKL threads for nested parallelism"); bool verbose = false; clp.setOption("enable-verbose", "disable-verbose", &verbose, "Flag for verbose printing"); int mmin = 1000; clp.setOption("mmin", &mmin, "C(mmin,mmin)"); int mmax = 8000; clp.setOption("mmax", &mmax, "C(mmax,mmax)"); int minc = 1000; clp.setOption("minc", &minc, "Increment of m"); int k = 1024; clp.setOption("k", &k, "A(mmax,k) or A(k,mmax) according to transpose flags"); int mb = 256; clp.setOption("mb", &mb, "Blocksize"); bool check = true; clp.setOption("enable-check", "disable-check", &check, "Flag for check solution"); clp.recogniseAllOptions(true); clp.throwExceptions(false); Teuchos::CommandLineProcessor::EParseCommandLineReturn r_parse= clp.parse( argc, argv ); if (r_parse == Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED) return 0; if (r_parse != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL ) return -1; int r_val = 0; { exec_space::initialize(nthreads, numa, core_per_numa); std::cout << std::endl << "DenseHerkByBlocks:: Upper, ConjTranspose, Variant::One (external)" << std::endl; r_val = exampleDenseHerkByBlocks <Uplo::Upper,Trans::ConjTranspose,Variant::One,exec_space> (mmin, mmax, minc, k, mb, max_concurrency, memory_pool_grain_size, mkl_nthreads, check, verbose); exec_space::finalize(); } return r_val; }
int main(int argc, char **argv) { try { // Initialize MPI #ifdef HAVE_MPI MPI_Init(&argc,&argv); #endif // Setup command line options Teuchos::CommandLineProcessor CLP; CLP.setDocString( "This example generates the sparsity pattern for the block stochastic Galerkin matrix.\n"); int d = 3; CLP.setOption("dimension", &d, "Stochastic dimension"); int p = 5; CLP.setOption("order", &p, "Polynomial order"); double drop = 1.0e-12; CLP.setOption("drop", &drop, "Drop tolerance"); std::string file = "A.mm"; CLP.setOption("filename", &file, "Matrix Market filename"); BasisType basis_type = LEGENDRE; CLP.setOption("basis", &basis_type, num_basis_types, basis_type_values, basis_type_names, "Basis type"); Stokhos::GrowthPolicy growth_type = Stokhos::SLOW_GROWTH; CLP.setOption("growth", &growth_type, num_growth_types, growth_type_values, growth_type_names, "Growth type"); ProductBasisType prod_basis_type = COMPLETE; CLP.setOption("product_basis", &prod_basis_type, num_prod_basis_types, prod_basis_type_values, prod_basis_type_names, "Product basis type"); double alpha = 1.0; CLP.setOption("alpha", &alpha, "Jacobi alpha index"); double beta = 1.0; CLP.setOption("beta", &beta, "Jacobi beta index"); bool full = true; CLP.setOption("full", "linear", &full, "Use full or linear expansion"); bool use_old = false; CLP.setOption("old", "new", &use_old, "Use old or new Cijk algorithm"); int tile_size = 100; CLP.setOption("tile_size", &tile_size, "Tile size"); // Parse arguments CLP.parse( argc, argv ); // Basis Array< RCP<const Stokhos::OneDOrthogPolyBasis<int,double> > > bases(d); for (int i=0; i<d; i++) { if (basis_type == HERMITE) bases[i] = Teuchos::rcp(new Stokhos::HermiteBasis<int,double>( p, true, growth_type)); else if (basis_type == LEGENDRE) bases[i] = Teuchos::rcp(new Stokhos::LegendreBasis<int,double>( p, true, growth_type)); else if (basis_type == CC_LEGENDRE) bases[i] = Teuchos::rcp(new Stokhos::ClenshawCurtisLegendreBasis<int,double>( p, true)); else if (basis_type == GP_LEGENDRE) bases[i] = Teuchos::rcp(new Stokhos::GaussPattersonLegendreBasis<int,double>( p, true)); else if (basis_type == RYS) bases[i] = Teuchos::rcp(new Stokhos::RysBasis<int,double>( p, 1.0, true, growth_type)); else if (basis_type == JACOBI) bases[i] = Teuchos::rcp(new Stokhos::JacobiBasis<int,double>( p, alpha, beta, true, growth_type)); } RCP<const Stokhos::ProductBasis<int,double> > basis; if (prod_basis_type == COMPLETE) basis = Teuchos::rcp(new Stokhos::CompletePolynomialBasis<int,double>( bases, drop, use_old)); else if (prod_basis_type == TENSOR) basis = Teuchos::rcp(new Stokhos::TensorProductBasis<int,double>( bases, drop)); else if (prod_basis_type == TOTAL) basis = Teuchos::rcp(new Stokhos::TotalOrderBasis<int,double>( bases, drop)); else if (prod_basis_type == SMOLYAK) { Stokhos::TotalOrderIndexSet<int> index_set(d, p); basis = Teuchos::rcp(new Stokhos::SmolyakBasis<int,double>( bases, index_set, drop)); } // Triple product tensor typedef Stokhos::Sparse3Tensor<int,double> Cijk_type; RCP<Cijk_type> Cijk; if (full) Cijk = basis->computeTripleProductTensor(); else Cijk = basis->computeLinearTripleProductTensor(); int sz = basis->size(); std::cout << "basis size = " << sz << " num nonzero Cijk entries = " << Cijk->num_entries() << std::endl; // Setup tiles if (tile_size > sz) tile_size = sz; int j_sz = sz; int k_sz = sz; if (!full) k_sz = basis->dimension()+1; int nj_tiles = j_sz / tile_size; int nk_tiles = k_sz / tile_size; if (j_sz - nj_tiles*tile_size > 0) ++nj_tiles; if (k_sz - nk_tiles*tile_size > 0) ++nk_tiles; Array<CijkNonzeros> nz(sz); for (int i=0; i<sz; ++i) { nz[i].i = i; nz[i].nz_tiles.resize(nj_tiles); for (int j=0; j<nj_tiles; ++j) nz[i].nz_tiles[j].resize(nk_tiles); } // Get number of nonzeros in Cijk for each i Cijk_type::k_iterator k_begin = Cijk->k_begin(); Cijk_type::k_iterator k_end = Cijk->k_end(); for (Cijk_type::k_iterator k_it=k_begin; k_it!=k_end; ++k_it) { int k = index(k_it); int k_tile = k / tile_size; Cijk_type::kj_iterator j_begin = Cijk->j_begin(k_it); Cijk_type::kj_iterator j_end = Cijk->j_end(k_it); for (Cijk_type::kj_iterator j_it = j_begin; j_it != j_end; ++j_it) { int j = index(j_it); int j_tile = j / tile_size; Cijk_type::kji_iterator i_begin = Cijk->i_begin(j_it); Cijk_type::kji_iterator i_end = Cijk->i_end(j_it); for (Cijk_type::kji_iterator i_it = i_begin; i_it != i_end; ++i_it) { int i = index(i_it); ++nz[i].total_nz; ++nz[i].nz_tiles[j_tile][k_tile]; } } } // Sort based on total number of nonzeros std::sort(nz.begin(), nz.end(), NZCompare()); // Print nonzeros int w_index = 3; int w_nz = 5; int w_tile = 4; for (int i=0; i<nz.size(); ++i) { int idx = nz[i].i; std::cout << std::setw(w_index) << idx << " " << basis->term(idx) << ": " << std::setw(w_nz) << nz[i].total_nz << ", "; for (int j=0; j<nj_tiles; ++j) for (int k=0; k<nk_tiles; ++k) std::cout << std::setw(w_tile) << nz[i].nz_tiles[j][k] << " "; std::cout << std::endl; } // Add up the nonzeros for each (j,k) tile Array< Array<int> > total_nz_tiles(nj_tiles); int total_nz = 0; for (int j=0; j<nj_tiles; ++j) total_nz_tiles[j].resize(nk_tiles); for (int i=0; i<nz.size(); ++i) { total_nz += nz[i].total_nz; for (int j=0; j<nj_tiles; ++j) for (int k=0; k<nk_tiles; ++k) total_nz_tiles[j][k] += nz[i].nz_tiles[j][k]; } int w_total = (w_index+1) + (2*basis->dimension()+5) + w_nz; std::cout << std::endl << std::setw(w_total) << total_nz << ", "; for (int j=0; j<nj_tiles; ++j) for (int k=0; k<nk_tiles; ++k) std::cout << std::setw(w_tile) << total_nz_tiles[j][k] << " "; std::cout << std::endl; // Now partition Cijk for each tile Array< Array< RCP<Cijk_type> > > Cijk_tile(nj_tiles); for (int j=0; j<nj_tiles; ++j) { Cijk_tile[j].resize(nk_tiles); for (int k=0; k<nk_tiles; ++k) Cijk_tile[j][k] = rcp(new Cijk_type); } for (Cijk_type::k_iterator k_it=k_begin; k_it!=k_end; ++k_it) { int k = index(k_it); int k_tile = k / tile_size; Cijk_type::kj_iterator j_begin = Cijk->j_begin(k_it); Cijk_type::kj_iterator j_end = Cijk->j_end(k_it); for (Cijk_type::kj_iterator j_it = j_begin; j_it != j_end; ++j_it) { int j = index(j_it); int j_tile = j / tile_size; Cijk_type::kji_iterator i_begin = Cijk->i_begin(j_it); Cijk_type::kji_iterator i_end = Cijk->i_end(j_it); for (Cijk_type::kji_iterator i_it = i_begin; i_it != i_end; ++i_it) { int i = index(i_it); double c = value(i_it); Cijk_tile[j_tile][k_tile]->add_term(i,j,k,c); } } } for (int j=0; j<nj_tiles; ++j) for (int k=0; k<nk_tiles; ++k) Cijk_tile[j][k]->fillComplete(); Array< Array< std::map<int,int> > > nz_tile(nj_tiles); Array< Array< Array< std::pair<int,int> > > > sorted_nz_tile(nj_tiles); for (int j_tile=0; j_tile<nj_tiles; ++j_tile) { nz_tile[j_tile].resize(nk_tiles); sorted_nz_tile[j_tile].resize(nk_tiles); for (int k_tile=0; k_tile<nk_tiles; ++k_tile) { // Count nonzeros for each i, for each tile Cijk_type::k_iterator k_begin = Cijk_tile[j_tile][k_tile]->k_begin(); Cijk_type::k_iterator k_end = Cijk_tile[j_tile][k_tile]->k_end(); for (Cijk_type::k_iterator k_it=k_begin; k_it!=k_end; ++k_it) { //int k = index(k_it); Cijk_type::kj_iterator j_begin = Cijk_tile[j_tile][k_tile]->j_begin(k_it); Cijk_type::kj_iterator j_end = Cijk_tile[j_tile][k_tile]->j_end(k_it); for (Cijk_type::kj_iterator j_it = j_begin; j_it != j_end; ++j_it) { //int j = index(j_it); Cijk_type::kji_iterator i_begin = Cijk_tile[j_tile][k_tile]->i_begin(j_it); Cijk_type::kji_iterator i_end = Cijk_tile[j_tile][k_tile]->i_end(j_it); for (Cijk_type::kji_iterator i_it = i_begin; i_it != i_end; ++i_it){ int i = index(i_it); if (nz_tile[j_tile][k_tile].count(i) == 0) nz_tile[j_tile][k_tile][i] = 1; else ++(nz_tile[j_tile][k_tile][i]); } } } // Sort based on non-zeros for each i, for each tile sorted_nz_tile[j_tile][k_tile].resize(nz_tile[j_tile][k_tile].size()); int idx=0; for (std::map<int,int>::iterator it = nz_tile[j_tile][k_tile].begin(); it != nz_tile[j_tile][k_tile].end(); ++it) { sorted_nz_tile[j_tile][k_tile][idx] = std::make_pair(it->first, it->second); ++idx; } std::sort( sorted_nz_tile[j_tile][k_tile].begin(), sorted_nz_tile[j_tile][k_tile].end(), NZPairCompare() ); // Print number of non-zeros for each i, for each tile std::cout << std::endl << "Tile (" << j_tile << ", " << k_tile << "):" << std::endl; for (int i=0; i<sorted_nz_tile[j_tile][k_tile].size(); ++i) { int idx = sorted_nz_tile[j_tile][k_tile][i].first; std::cout << std::setw(w_index) << idx << " " << basis->term(idx) << ": " << std::setw(w_nz) << sorted_nz_tile[j_tile][k_tile][i].second << std::endl; if (i % 32 == 31) std::cout << std::endl; } } } } catch (std::exception& e) { std::cout << e.what() << std::endl; } return 0; }
int main (int argc, char *argv[]) { Teuchos::CommandLineProcessor clp; clp.setDocString("This example program measure the performance of Chol algorithms on Kokkos::Threads execution space.\n"); int nthreads = 1; clp.setOption("nthreads", &nthreads, "Number of threads"); int max_task_dependence = 10; clp.setOption("max-task-dependence", &max_task_dependence, "Max number of task dependence"); int team_size = 1; clp.setOption("team-size", &team_size, "Team size"); int fill_level = 0; clp.setOption("fill-level", &fill_level, "Fill level"); bool team_interface = true; clp.setOption("enable-team-interface", "disable-team-interface", &team_interface, "Flag for team interface"); bool mkl_interface = false; clp.setOption("enable-mkl-interface", "disable-mkl-interface", &mkl_interface, "Flag for MKL interface"); int stack_size = 8192; clp.setOption("stack-size", &stack_size, "Stack size"); bool verbose = false; clp.setOption("enable-verbose", "disable-verbose", &verbose, "Flag for verbose printing"); string file_input = "test.mtx"; clp.setOption("file-input", &file_input, "Input file (MatrixMarket SPD matrix)"); int treecut = 15; clp.setOption("treecut", &treecut, "Level to cut tree from bottom"); int minblksize = 0; clp.setOption("minblksize", &minblksize, "Minimum block size for internal reordering"); int prunecut = 0; clp.setOption("prunecut", &prunecut, "Leve to prune tree from bottom"); int seed = 0; clp.setOption("seed", &seed, "Seed for random number generator in graph partition"); int niter = 10; clp.setOption("niter", &niter, "Number of iterations for testing"); clp.recogniseAllOptions(true); clp.throwExceptions(false); Teuchos::CommandLineProcessor::EParseCommandLineReturn r_parse= clp.parse( argc, argv ); if (r_parse == Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED) return 0; if (r_parse != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL ) return -1; int r_val = 0; { const bool overwrite = true; const int nshepherds = (team_interface ? nthreads/team_size : nthreads); const int nworker_per_shepherd = nthreads/nshepherds; setenv("QT_HWPAR", to_string(nthreads).c_str(), overwrite); setenv("QT_NUM_SHEPHERDS", to_string(nshepherds).c_str(), overwrite); setenv("QT_NUM_WORKERS_PER_SHEPHERD", to_string(nworker_per_shepherd).c_str(), overwrite); setenv("QT_STACK_SIZE", to_string(stack_size).c_str(), overwrite); exec_space::initialize(nthreads); exec_space::print_configuration(cout, true); r_val = exampleCholPerformance <value_type,ordinal_type,size_type,exec_space,void> (file_input, treecut, minblksize, prunecut, seed, niter, nthreads, max_task_dependence, team_size, fill_level, nshepherds, team_interface, (nthreads != 1), mkl_interface, verbose); exec_space::finalize(); unsetenv("QT_HWPAR"); unsetenv("QT_NUM_SHEPHERDS"); unsetenv("QT_NUM_WORKERS_PER_SHEPHERD"); unsetenv("QT_STACK_SIZE"); } return r_val; }
int main(int argc, char* argv[]) { int ierr = 0; try { double t, tb; int p = 2; int w = p+7; // Set up command line options Teuchos::CommandLineProcessor clp; clp.setDocString("This program tests the speed of differentiating BLAS routines using Fad"); int m = 10; clp.setOption("m", &m, "Number of rows"); int n = 10; clp.setOption("n", &n, "Number of columns"); int k = 10; clp.setOption("k", &k, "Number of columns for GEMM"); int ndot = 10; clp.setOption("ndot", &ndot, "Number of derivative components"); int nloop = 100000; clp.setOption("nloop", &nloop, "Number of loops"); int dynamic = 1; clp.setOption("dynamic", &dynamic, "Use dynamic allocation"); // Parse options Teuchos::CommandLineProcessor::EParseCommandLineReturn parseReturn= clp.parse(argc, argv); if(parseReturn != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL) return 1; bool use_dynamic = (dynamic != 0); std::cout.setf(std::ios::scientific); std::cout.precision(p); std::cout << "Times (sec) for m = " << m << ", n = " << n << ", ndot = " << ndot << ", nloop = " << nloop << ", dynamic = " << use_dynamic << ": " << std::endl; tb = do_time_teuchos_double_gemm(m,n,k,nloop); std::cout << "GEMM: " << std::setw(w) << tb << std::endl; t = do_time_sacado_fad_gemm< Sacado::Fad::DVFad<double> >(m,n,k,ndot,nloop,use_dynamic); std::cout << "Sacado DVFad GEMM: " << std::setw(w) << t << "\t" << std::setw(w) << t/tb << std::endl; t = do_time_sacado_fad_gemm< Sacado::Fad::DFad<double> >(m,n,k,ndot,nloop,use_dynamic); std::cout << "Sacado DFad GEMM: " << std::setw(w) << t << "\t" << std::setw(w) << t/tb << std::endl; t = do_time_teuchos_fad_gemm< Sacado::Fad::DFad<double> >(m,n,k,ndot,nloop); std::cout << "Teuchos DFad GEMM: " << std::setw(w) << t << "\t" << std::setw(w) << t/tb << std::endl; // t = do_time_teuchos_fad_gemm< Sacado::ELRFad::DFad<double> >(m,n,k,ndot,nloop); // std::cout << "Teuchos ELRDFad GEMM: " << std::setw(w) << t << "\t" // << std::setw(w) << t/tb << std::endl; t = do_time_teuchos_fad_gemm< Sacado::Fad::DVFad<double> >(m,n,k,ndot,nloop); std::cout << "Teuchos DVFad GEMM: " << std::setw(w) << t << "\t" << std::setw(w) << t/tb << std::endl; std::cout << std::endl; tb = do_time_teuchos_double_gemv(m,n,nloop); std::cout << "GEMV: " << std::setw(w) << tb << std::endl; t = do_time_sacado_fad_gemv< Sacado::Fad::DVFad<double> >(m,n,ndot,nloop*10,use_dynamic); std::cout << "Sacado DVFad GEMV: " << std::setw(w) << t << "\t" << std::setw(w) << t/tb << std::endl; t = do_time_sacado_fad_gemv< Sacado::Fad::DFad<double> >(m,n,ndot,nloop*10,use_dynamic); std::cout << "Sacado DFad GEMV: " << std::setw(w) << t << "\t" << std::setw(w) << t/tb << std::endl; t = do_time_teuchos_fad_gemv< Sacado::Fad::DFad<double> >(m,n,ndot,nloop*10); std::cout << "Teuchos DFad GEMV: " << std::setw(w) << t << "\t" << std::setw(w) << t/tb << std::endl; // t = do_time_teuchos_fad_gemv< Sacado::ELRFad::DFad<double> >(m,n,ndot,nloop*10); // std::cout << "Teuchos ELRDFad GEMV: " << std::setw(w) << t << "\t" // << std::setw(w) << t/tb << std::endl; t = do_time_teuchos_fad_gemv< Sacado::Fad::DVFad<double> >(m,n,ndot,nloop*10); std::cout << "Teuchos DVFad GEMV: " << std::setw(w) << t << "\t" << std::setw(w) << t/tb << std::endl; std::cout << std::endl; tb = do_time_teuchos_double_dot(m,nloop*100); std::cout << "DOT: " << std::setw(w) << tb << std::endl; t = do_time_sacado_fad_dot< Sacado::Fad::DVFad<double> >(m,ndot,nloop*100,use_dynamic); std::cout << "Sacado DVFad DOT: " << std::setw(w) << t << "\t" << std::setw(w) << t/tb << std::endl; t = do_time_sacado_fad_dot< Sacado::Fad::DFad<double> >(m,ndot,nloop*100,use_dynamic); std::cout << "Sacado DFad DOT: " << std::setw(w) << t << "\t" << std::setw(w) << t/tb << std::endl; t = do_time_teuchos_fad_dot< Sacado::Fad::DFad<double> >(m,ndot,nloop*100); std::cout << "Teuchos DFad DOT: " << std::setw(w) << t << "\t" << std::setw(w) << t/tb << std::endl; // t = do_time_teuchos_fad_dot< Sacado::ELRFad::DFad<double> >(m,ndot,nloop*100); // std::cout << "Teuchos ELRDFad DOT: " << std::setw(w) << t << "\t" // << std::setw(w) << t/tb << std::endl; t = do_time_teuchos_fad_dot< Sacado::Fad::DVFad<double> >(m,ndot,nloop*100); std::cout << "Teuchos DVFad DOT: " << std::setw(w) << t << "\t" << std::setw(w) << t/tb << std::endl; } catch (std::exception& e) { std::cout << e.what() << std::endl; ierr = 1; } catch (const char *s) { std::cout << s << std::endl; ierr = 1; } catch (...) { std::cout << "Caught unknown exception!" << std::endl; ierr = 1; } return ierr; }
int main(int argc, char* argv[]) { int ierr = 0; try { double t, ta, tr; int p = 2; int w = p+7; // Maximum number of derivative components for SLFad const int slfad_max = 130; // Set up command line options Teuchos::CommandLineProcessor clp; clp.setDocString("This program tests the speed of various forward mode AD implementations for a finite-element-like Jacobian fill"); int num_nodes = 100000; int num_eqns = 2; int rt = 0; clp.setOption("n", &num_nodes, "Number of nodes"); clp.setOption("p", &num_eqns, "Number of equations"); clp.setOption("rt", &rt, "Include ADOL-C retaping test"); // Parse options Teuchos::CommandLineProcessor::EParseCommandLineReturn parseReturn= clp.parse(argc, argv); if(parseReturn != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL) return 1; double mesh_spacing = 1.0 / static_cast<double>(num_nodes - 1); // Memory pool & manager Sacado::Fad::MemPoolManager<double> poolManager(num_nodes*num_eqns); Sacado::Fad::MemPool* pool = poolManager.getMemoryPool(num_nodes*num_eqns); Sacado::Fad::DMFad<double>::setDefaultPool(pool); std::cout.setf(std::ios::scientific); std::cout.precision(p); std::cout << "num_nodes = " << num_nodes << ", num_eqns = " << num_eqns << ": " << std::endl << " " << " Time " << "\t"<< "Time/Analytic" << "\t" << "Time/(2*p*Residual)" << std::endl; ta = 1.0; tr = 1.0; tr = residual_fill(num_nodes, num_eqns, mesh_spacing); ta = analytic_jac_fill(num_nodes, num_eqns, mesh_spacing); std::cout << "Analytic: " << std::setw(w) << ta << "\t" << std::setw(w) << ta/ta << "\t" << std::setw(w) << ta/(2.0*num_eqns*tr) << std::endl; #ifdef HAVE_ADOLC #ifndef ADOLC_TAPELESS t = adolc_jac_fill(num_nodes, num_eqns, mesh_spacing); std::cout << "ADOL-C: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl; if (rt != 0) { t = adolc_retape_jac_fill(num_nodes, num_eqns, mesh_spacing); std::cout << "ADOL-C(rt): " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl; } #else t = adolc_tapeless_jac_fill(num_nodes, num_eqns, mesh_spacing); std::cout << "ADOL-C(tl): " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl; #endif #endif #ifdef HAVE_ADIC t = adic_jac_fill(num_nodes, num_eqns, mesh_spacing); std::cout << "ADIC: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl; #endif if (num_eqns*2 == 4) { t = fad_jac_fill< FAD::TFad<16,double> >(num_nodes, num_eqns, mesh_spacing); std::cout << "TFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl; } else if (num_eqns*2 == 16) { t = fad_jac_fill< FAD::TFad<16,double> >(num_nodes, num_eqns, mesh_spacing); std::cout << "TFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl; } else if (num_eqns*2 == 32) { t = fad_jac_fill< FAD::TFad<32,double> >(num_nodes, num_eqns, mesh_spacing); std::cout << "TFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl; } else if (num_eqns*2 == 64) { t = fad_jac_fill< FAD::TFad<64,double> >(num_nodes, num_eqns, mesh_spacing); std::cout << "TFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl; } t = fad_jac_fill< FAD::Fad<double> >(num_nodes, num_eqns, mesh_spacing); std::cout << "Fad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl; if (num_eqns*2 == 4) { t = fad_jac_fill< Sacado::Fad::SFad<double,4> >(num_nodes, num_eqns, mesh_spacing); std::cout << "SFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl; } else if (num_eqns*2 == 16) { t = fad_jac_fill< Sacado::Fad::SFad<double,16> >(num_nodes, num_eqns, mesh_spacing); std::cout << "SFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl; } else if (num_eqns*2 == 32) { t = fad_jac_fill< Sacado::Fad::SFad<double,32> >(num_nodes, num_eqns, mesh_spacing); std::cout << "SFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl; } else if (num_eqns*2 == 64) { t = fad_jac_fill< Sacado::Fad::SFad<double,64> >(num_nodes, num_eqns, mesh_spacing); std::cout << "SFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl; } if (num_eqns*2 < slfad_max) { t = fad_jac_fill< Sacado::Fad::SLFad<double,slfad_max> >(num_nodes, num_eqns, mesh_spacing); std::cout << "SLFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl; } t = fad_jac_fill< Sacado::Fad::DFad<double> >(num_nodes, num_eqns, mesh_spacing); std::cout << "DFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl; t = fad_jac_fill< Sacado::Fad::SimpleFad<double> >(num_nodes, num_eqns, mesh_spacing); std::cout << "SimpleFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl; t = fad_jac_fill< Sacado::Fad::DMFad<double> >(num_nodes, num_eqns, mesh_spacing); std::cout << "DMFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl; if (num_eqns*2 == 4) { t = fad_jac_fill< Sacado::ELRFad::SFad<double,4> >(num_nodes, num_eqns, mesh_spacing); std::cout << "ELRSFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl; } else if (num_eqns*2 == 16) { t = fad_jac_fill< Sacado::ELRFad::SFad<double,16> >(num_nodes, num_eqns, mesh_spacing); std::cout << "ELRSFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl; } else if (num_eqns*2 == 32) { t = fad_jac_fill< Sacado::ELRFad::SFad<double,32> >(num_nodes, num_eqns, mesh_spacing); std::cout << "ELRSFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl; } else if (num_eqns*2 == 64) { t = fad_jac_fill< Sacado::ELRFad::SFad<double,64> >(num_nodes, num_eqns, mesh_spacing); std::cout << "ELRSFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl; } if (num_eqns*2 < slfad_max) { t = fad_jac_fill< Sacado::ELRFad::SLFad<double,slfad_max> >(num_nodes, num_eqns, mesh_spacing); std::cout << "ELRSLFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl; } t = fad_jac_fill< Sacado::ELRFad::DFad<double> >(num_nodes, num_eqns, mesh_spacing); std::cout << "ELRDFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl; if (num_eqns*2 == 4) { t = fad_jac_fill< Sacado::CacheFad::SFad<double,4> >(num_nodes, num_eqns, mesh_spacing); std::cout << "CacheSFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl; } else if (num_eqns*2 == 16) { t = fad_jac_fill< Sacado::CacheFad::SFad<double,16> >(num_nodes, num_eqns, mesh_spacing); std::cout << "CacheSFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl; } else if (num_eqns*2 == 32) { t = fad_jac_fill< Sacado::CacheFad::SFad<double,32> >(num_nodes, num_eqns, mesh_spacing); std::cout << "CacheSFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl; } else if (num_eqns*2 == 64) { t = fad_jac_fill< Sacado::CacheFad::SFad<double,64> >(num_nodes, num_eqns, mesh_spacing); std::cout << "CacheSFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl; } if (num_eqns*2 < slfad_max) { t = fad_jac_fill< Sacado::CacheFad::SLFad<double,slfad_max> >(num_nodes, num_eqns, mesh_spacing); std::cout << "CacheSLFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl; } t = fad_jac_fill< Sacado::CacheFad::DFad<double> >(num_nodes, num_eqns, mesh_spacing); std::cout << "CacheFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl; if (num_eqns*2 == 4) { t = fad_jac_fill< Sacado::ELRCacheFad::SFad<double,4> >(num_nodes, num_eqns, mesh_spacing); std::cout << "ELRCacheSFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl; } else if (num_eqns*2 == 16) { t = fad_jac_fill< Sacado::ELRCacheFad::SFad<double,16> >(num_nodes, num_eqns, mesh_spacing); std::cout << "ELRCacheSFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl; } else if (num_eqns*2 == 32) { t = fad_jac_fill< Sacado::ELRCacheFad::SFad<double,32> >(num_nodes, num_eqns, mesh_spacing); std::cout << "ELRCacheSFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl; } else if (num_eqns*2 == 64) { t = fad_jac_fill< Sacado::ELRCacheFad::SFad<double,64> >(num_nodes, num_eqns, mesh_spacing); std::cout << "ELRCacheSFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl; } if (num_eqns*2 < slfad_max) { t = fad_jac_fill< Sacado::ELRCacheFad::SLFad<double,slfad_max> >(num_nodes, num_eqns, mesh_spacing); std::cout << "ELRCacheSLFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl; } t = fad_jac_fill< Sacado::ELRCacheFad::DFad<double> >(num_nodes, num_eqns, mesh_spacing); std::cout << "ELRCacheFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl; t = fad_jac_fill< Sacado::Fad::DVFad<double> >(num_nodes, num_eqns, mesh_spacing); std::cout << "DVFad: " << std::setw(w) << t << "\t" << std::setw(w) << t/ta << "\t" << std::setw(w) << t/(2.0*num_eqns*tr) << std::endl; } catch (std::exception& e) { std::cout << e.what() << std::endl; ierr = 1; } catch (const char *s) { std::cout << s << std::endl; ierr = 1; } catch (...) { std::cout << "Caught unknown exception!" << std::endl; ierr = 1; } return ierr; }
int main(int argc, char **argv) { // Typename of Polynomial Chaos scalar type typedef Stokhos::StandardStorage<int,double> pce_storage_type; typedef Sacado::ETPCE::OrthogPoly<double, pce_storage_type> pce_type; // Typename of ensemble scalar type const int EnsembleSize = 8; typedef Stokhos::StaticFixedStorage<int,double,EnsembleSize,Kokkos::DefaultExecutionSpace> ensemble_storage_type; typedef Sacado::MP::Vector<ensemble_storage_type> ensemble_type; // Short-hand for several classes used below using Teuchos::Array; using Teuchos::RCP; using Teuchos::rcp; using Stokhos::OneDOrthogPolyBasis; using Stokhos::HermiteBasis; using Stokhos::LegendreBasis; using Stokhos::CompletePolynomialBasis; using Stokhos::Quadrature; using Stokhos::TotalOrderIndexSet; using Stokhos::SmolyakSparseGridQuadrature; using Stokhos::TensorProductQuadrature; using Stokhos::Sparse3Tensor; using Stokhos::QuadOrthogPolyExpansion; try { // Setup command line options Teuchos::CommandLineProcessor CLP; CLP.setDocString( "This example computes the PC expansion of a simple function.\n"); int p = 4; CLP.setOption("order", &p, "Polynomial order"); bool sparse = false; CLP.setOption("sparse", "tensor", &sparse, "Use sparse grid or tensor product quadrature"); // Parse arguments CLP.parse( argc, argv ); // Basis of dimension 3, order given by command-line option const int d = 3; Array< RCP<const OneDOrthogPolyBasis<int,double> > > bases(d); for (int i=0; i<d; i++) { bases[i] = rcp(new HermiteBasis<int,double>(p, true)); } RCP<const CompletePolynomialBasis<int,double> > basis = rcp(new CompletePolynomialBasis<int,double>(bases)); const int pce_size = basis->size(); std::cout << "basis size = " << pce_size << std::endl; // Quadrature method RCP<const Quadrature<int,double> > quad; if (sparse) { const TotalOrderIndexSet<int> index_set(d, p); quad = rcp(new SmolyakSparseGridQuadrature<int,double>(basis, index_set)); } else { quad = rcp(new TensorProductQuadrature<int,double>(basis)); } std::cout << "quadrature size = " << quad->size() << std::endl; // Triple product tensor RCP<Sparse3Tensor<int,double> > Cijk = basis->computeTripleProductTensor(); // Expansion method RCP<QuadOrthogPolyExpansion<int,double> > expn = rcp(new QuadOrthogPolyExpansion<int,double>(basis, Cijk, quad)); // Polynomial expansion of u (note: these are coefficients in the // normalized basis) pce_type u(expn); u.term(0,0) = 1.0; // zeroth order term u.term(0,1) = 0.1; // first order term for dimension 0 u.term(1,1) = 0.05; // first order term for dimension 1 u.term(2,1) = 0.01; // first order term for dimension 2 // // Compute PCE expansion of function using NISP with ensemble propagation // // Extract quadrature data const int num_quad_points = quad->size(); const Array<double>& quad_weights = quad->getQuadWeights(); const Array< Array<double> >& quad_points = quad->getQuadPoints(); const Array< Array<double> >& quad_values = quad->getBasisAtQuadPoints(); // Loop over quadrature points in blocks of size EnsembleSize pce_type v(expn); ensemble_type u_ensemble; for (int qp_block=0; qp_block<num_quad_points; qp_block+=EnsembleSize) { const int qp_sz = qp_block+EnsembleSize <= num_quad_points ? EnsembleSize : num_quad_points-qp_block; // Evaluate u at each quadrature point for (int qp=0; qp<qp_sz; ++qp) u_ensemble.fastAccessCoeff(qp) = u.evaluate(quad_points[qp_block+qp], quad_values[qp_block+qp]); for (int qp=qp_sz; qp<EnsembleSize; ++qp) u_ensemble.fastAccessCoeff(qp) = u_ensemble.fastAccessCoeff(qp_sz-1); // Evaluate function at each quadrature point ensemble_type v_ensemble = simple_function(u_ensemble); // Sum results into PCE integral for (int pc=0; pc<pce_size; ++pc) for (int qp=0; qp<qp_sz; ++qp) v.fastAccessCoeff(pc) += v_ensemble.fastAccessCoeff(qp)*quad_weights[qp_block+qp]*quad_values[qp_block+qp][pc]; } /* for (int qp=0; qp<num_quad_points; ++qp) { double u_qp = u.evaluate(quad_points[qp]); double v_qp = simple_function(u_qp); double w = quad_weights[qp]; for (int pc=0; pc<pce_size; ++pc) v.fastAccessCoeff(pc) += v_qp*w*quad_values[qp][pc]; } */ // Print u and v std::cout << "\tu = "; u.print(std::cout); std::cout << "\tv = "; v.print(std::cout); // Compute moments double mean = v.mean(); double std_dev = v.standard_deviation(); // Evaluate PCE and function at a point = 0.25 in each dimension Teuchos::Array<double> pt(d); for (int i=0; i<d; i++) pt[i] = 0.25; double up = u.evaluate(pt); double vp = simple_function(up); double vp2 = v.evaluate(pt); // Print results std::cout << "\tv mean = " << mean << std::endl; std::cout << "\tv std. dev. = " << std_dev << std::endl; std::cout << "\tv(0.25) (true) = " << vp << std::endl; std::cout << "\tv(0.25) (pce) = " << vp2 << std::endl; // Check the answer if (std::abs(vp - vp2) < 1e-2) std::cout << "\nExample Passed!" << std::endl; } catch (std::exception& e) { std::cout << e.what() << std::endl; } }