int main (int argc, char *argv[]) {
  Teuchos::CommandLineProcessor clp;
  clp.setDocString("This example program measures the performance of IChol algorithms on the Kokkos::Threads execution space.\n");

  int nthreads = 1;
  clp.setOption("nthreads", &nthreads, "Number of threads");

  int max_task_dependence = 10;
  clp.setOption("max-task-dependence", &max_task_dependence, "Max number of task dependence");

  int team_size = 1;
  clp.setOption("team-size", &team_size, "Team size");

  bool team_interface = false;
  clp.setOption("enable-team-interface", "disable-team-interface", &team_interface, "Flag for team interface");

  bool verbose = false;
  clp.setOption("enable-verbose", "disable-verbose", &verbose, "Flag for verbose printing");

  string file_input = "test.mtx";
  clp.setOption("file-input", &file_input, "Input file (MatrixMarket SPD matrix)");

  int niter = 10;
  clp.setOption("niter", &niter, "Number of iterations for testing");

  clp.recogniseAllOptions(true);
  clp.throwExceptions(false);

  Teuchos::CommandLineProcessor::EParseCommandLineReturn r_parse = clp.parse( argc, argv );

  if (r_parse == Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED) return 0;
  if (r_parse != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL ) return -1;

  int r_val = 0;
  {
    const bool overwrite = true;
    const int nshepherds = (team_interface ? nthreads/team_size : nthreads);
    const int nworker_per_shepherd = nthreads/nshepherds;

    setenv("QT_HWPAR",                    to_string(nthreads).c_str(),             overwrite);
    setenv("QT_NUM_SHEPHERDS",            to_string(nshepherds).c_str(),           overwrite);
    setenv("QT_NUM_WORKERS_PER_SHEPHERD", to_string(nworker_per_shepherd).c_str(), overwrite);

    exec_space::initialize(nthreads);
    exec_space::print_configuration(cout, true);

    // r_val = exampleICholPerformance
    //   <value_type,ordinal_type,size_type,exec_space,void>
    //   (file_input, niter, nthreads, max_task_dependence, team_size, team_interface, (nthreads != 1), verbose);

    exec_space::finalize();

    unsetenv("QT_HWPAR");
    unsetenv("QT_NUM_SHEPHERDS");
    unsetenv("QT_NUM_WORKERS_PER_SHEPHERD");
  }

  return r_val;
}
int main(int argc, char **argv) {
  // Typename of Polynomial Chaos scalar type
  typedef Stokhos::StandardStorage<int,double> pce_storage_type;
  typedef Sacado::ETPCE::OrthogPoly<double, pce_storage_type> pce_type;

  // Typename of ensemble scalar type
  const int EnsembleSize = 8;
  typedef Stokhos::StaticFixedStorage<int,double,EnsembleSize,Kokkos::DefaultExecutionSpace> ensemble_storage_type;
  typedef Sacado::MP::Vector<ensemble_storage_type> ensemble_type;

  // Short-hand for several classes used below
  using Teuchos::Array;
  using Teuchos::RCP;
  using Teuchos::rcp;
  using Stokhos::OneDOrthogPolyBasis;
  using Stokhos::HermiteBasis;
  using Stokhos::LegendreBasis;
  using Stokhos::CompletePolynomialBasis;
  using Stokhos::Quadrature;
  using Stokhos::TotalOrderIndexSet;
  using Stokhos::SmolyakSparseGridQuadrature;
  using Stokhos::TensorProductQuadrature;
  using Stokhos::Sparse3Tensor;
  using Stokhos::QuadOrthogPolyExpansion;

  try {

    // Setup command line options
    Teuchos::CommandLineProcessor CLP;
    CLP.setDocString("This example computes the PC expansion of a simple function.\n");
    int p = 4;
    CLP.setOption("order", &p, "Polynomial order");
    bool sparse = false;
    CLP.setOption("sparse", "tensor", &sparse,
                  "Use sparse grid or tensor product quadrature");

    // Parse arguments
    CLP.parse( argc, argv );

    // Basis of dimension 3, order given by command-line option
    const int d = 3;
    Array< RCP<const OneDOrthogPolyBasis<int,double> > > bases(d);
    for (int i=0; i<d; i++) {
      bases[i] = rcp(new HermiteBasis<int,double>(p, true));
    }
    RCP<const CompletePolynomialBasis<int,double> > basis =
      rcp(new CompletePolynomialBasis<int,double>(bases));
    const int pce_size = basis->size();
    std::cout << "basis size = " << pce_size << std::endl;

    // Quadrature method
    RCP<const Quadrature<int,double> > quad;
    if (sparse) {
      const TotalOrderIndexSet<int> index_set(d, p);
      quad = rcp(new SmolyakSparseGridQuadrature<int,double>(basis, index_set));
    }
    else {
      quad = rcp(new TensorProductQuadrature<int,double>(basis));
    }
    std::cout << "quadrature size = " << quad->size() << std::endl;

    // Triple product tensor
    RCP<Sparse3Tensor<int,double> > Cijk = basis->computeTripleProductTensor();

    // Expansion method
    RCP<QuadOrthogPolyExpansion<int,double> > expn =
      rcp(new QuadOrthogPolyExpansion<int,double>(basis, Cijk, quad));

    // Polynomial expansion of u (note: these are coefficients in the
    // normalized basis)
    pce_type u(expn);
    u.term(0,0) = 1.0;   // zeroth order term
    u.term(0,1) = 0.1;   // first order term for dimension 0
    u.term(1,1) = 0.05;  // first order term for dimension 1
    u.term(2,1) = 0.01;  // first order term for dimension 2

    //
    // Compute PCE expansion of function using NISP with ensemble propagation
    //

    // Extract quadrature data
    const int num_quad_points = quad->size();
    const Array<double>& quad_weights = quad->getQuadWeights();
    const Array< Array<double> >& quad_points = quad->getQuadPoints();
    const Array< Array<double> >& quad_values = quad->getBasisAtQuadPoints();

    // Loop over quadrature points in blocks of size EnsembleSize
    pce_type v(expn);
    ensemble_type u_ensemble;
    for (int qp_block=0; qp_block<num_quad_points; qp_block+=EnsembleSize) {
      const int qp_sz = qp_block+EnsembleSize <= num_quad_points ?
        EnsembleSize : num_quad_points-qp_block;

      // Evaluate u at each quadrature point
      for (int qp=0; qp<qp_sz; ++qp)
        u_ensemble.fastAccessCoeff(qp) =
          u.evaluate(quad_points[qp_block+qp], quad_values[qp_block+qp]);
      for (int qp=qp_sz; qp<EnsembleSize; ++qp)
        u_ensemble.fastAccessCoeff(qp) = u_ensemble.fastAccessCoeff(qp_sz-1);

      // Evaluate function at each quadrature point
      ensemble_type v_ensemble = simple_function(u_ensemble);

      // Sum results into PCE integral
      for (int pc=0; pc<pce_size; ++pc)
        for (int qp=0; qp<qp_sz; ++qp)
          v.fastAccessCoeff(pc) +=
            v_ensemble.fastAccessCoeff(qp)*quad_weights[qp_block+qp]*quad_values[qp_block+qp][pc];
    }

    /*
    for (int qp=0; qp<num_quad_points; ++qp) {
      double u_qp = u.evaluate(quad_points[qp]);
      double v_qp = simple_function(u_qp);
      double w = quad_weights[qp];
      for (int pc=0; pc<pce_size; ++pc)
        v.fastAccessCoeff(pc) += v_qp*w*quad_values[qp][pc];
    }
    */

    // Print u and v
    std::cout << "\tu = ";
    u.print(std::cout);
    std::cout << "\tv = ";
    v.print(std::cout);

    // Compute moments
    double mean = v.mean();
    double std_dev = v.standard_deviation();

    // Evaluate PCE and function at a point = 0.25 in each dimension
    Teuchos::Array<double> pt(d);
    for (int i=0; i<d; i++)
      pt[i] = 0.25;
    double up = u.evaluate(pt);
    double vp = simple_function(up);
    double vp2 = v.evaluate(pt);

    // Print results
    std::cout << "\tv mean = " << mean << std::endl;
    std::cout << "\tv std. dev. = " << std_dev << std::endl;
    std::cout << "\tv(0.25) (true) = " << vp << std::endl;
    std::cout << "\tv(0.25) (pce) = " << vp2 << std::endl;

    // Check the answer
    if (std::abs(vp - vp2) < 1e-2)
      std::cout << "\nExample Passed!" << std::endl;
  }
  catch (std::exception& e) {
    std::cout << e.what() << std::endl;
  }
}
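// A note on the blocked loop above: quadrature points are processed in fixed
// blocks of EnsembleSize, and the ragged final block is padded with the last
// valid point so the whole ensemble stays well defined, while only the real
// entries contribute to the projection sums. The following is a minimal,
// self-contained sketch of that blocking pattern (hypothetical data; plain
// doubles and std::exp stand in for the ensemble type and simple_function).
#include <cmath>
#include <iostream>
#include <vector>

int main() {
  const int block_size = 8;  // plays the role of EnsembleSize
  // Made-up quadrature points and equal weights, purely for illustration.
  std::vector<double> points = {-1.5, -1.0, -0.5, 0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0};
  std::vector<double> weights(points.size(), 1.0/points.size());

  double integral = 0.0;
  const int num_points = static_cast<int>(points.size());
  for (int block = 0; block < num_points; block += block_size) {
    // Size of this block; the last one may be ragged.
    const int block_sz =
      (block + block_size <= num_points) ? block_size : num_points - block;

    // Accumulate only the real entries of the block into the weighted sum;
    // padded entries (if any) are never visited here.
    for (int q = 0; q < block_sz; ++q)
      integral += std::exp(points[block + q]) * weights[block + q];
  }
  std::cout << "blocked quadrature sum = " << integral << std::endl;
  return 0;
}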
int main(int argc, char **argv) {
  try {

    // Initialize MPI
#ifdef HAVE_MPI
    MPI_Init(&argc,&argv);
#endif

    // Setup command line options
    Teuchos::CommandLineProcessor CLP;
    CLP.setDocString(
      "This example generates the sparsity pattern for the block stochastic Galerkin matrix.\n");
    int d = 3;
    CLP.setOption("dimension", &d, "Stochastic dimension");
    int p = 5;
    CLP.setOption("order", &p, "Polynomial order");
    double drop = 1.0e-12;
    CLP.setOption("drop", &drop, "Drop tolerance");
    std::string file = "A.mm";
    CLP.setOption("filename", &file, "Matrix Market filename");
    BasisType basis_type = LEGENDRE;
    CLP.setOption("basis", &basis_type, num_basis_types, basis_type_values, basis_type_names, "Basis type");
    Stokhos::GrowthPolicy growth_type = Stokhos::SLOW_GROWTH;
    CLP.setOption("growth", &growth_type, num_growth_types, growth_type_values, growth_type_names, "Growth type");
    ProductBasisType prod_basis_type = COMPLETE;
    CLP.setOption("product_basis", &prod_basis_type, num_prod_basis_types, prod_basis_type_values, prod_basis_type_names, "Product basis type");
    double alpha = 1.0;
    CLP.setOption("alpha", &alpha, "Jacobi alpha index");
    double beta = 1.0;
    CLP.setOption("beta", &beta, "Jacobi beta index");
    bool full = true;
    CLP.setOption("full", "linear", &full, "Use full or linear expansion");
    bool use_old = false;
    CLP.setOption("old", "new", &use_old, "Use old or new Cijk algorithm");
    int tile_size = 100;
    CLP.setOption("tile_size", &tile_size, "Tile size");

    // Parse arguments
    CLP.parse( argc, argv );

    // Basis
    Array< RCP<const Stokhos::OneDOrthogPolyBasis<int,double> > > bases(d);
    for (int i=0; i<d; i++) {
      if (basis_type == HERMITE)
        bases[i] = Teuchos::rcp(new Stokhos::HermiteBasis<int,double>(p, true, growth_type));
      else if (basis_type == LEGENDRE)
        bases[i] = Teuchos::rcp(new Stokhos::LegendreBasis<int,double>(p, true, growth_type));
      else if (basis_type == CC_LEGENDRE)
        bases[i] = Teuchos::rcp(new Stokhos::ClenshawCurtisLegendreBasis<int,double>(p, true));
      else if (basis_type == GP_LEGENDRE)
        bases[i] = Teuchos::rcp(new Stokhos::GaussPattersonLegendreBasis<int,double>(p, true));
      else if (basis_type == RYS)
        bases[i] = Teuchos::rcp(new Stokhos::RysBasis<int,double>(p, 1.0, true, growth_type));
      else if (basis_type == JACOBI)
        bases[i] = Teuchos::rcp(new Stokhos::JacobiBasis<int,double>(p, alpha, beta, true, growth_type));
    }
    RCP<const Stokhos::ProductBasis<int,double> > basis;
    if (prod_basis_type == COMPLETE)
      basis = Teuchos::rcp(new Stokhos::CompletePolynomialBasis<int,double>(bases, drop, use_old));
    else if (prod_basis_type == TENSOR)
      basis = Teuchos::rcp(new Stokhos::TensorProductBasis<int,double>(bases, drop));
    else if (prod_basis_type == TOTAL)
      basis = Teuchos::rcp(new Stokhos::TotalOrderBasis<int,double>(bases, drop));
    else if (prod_basis_type == SMOLYAK) {
      Stokhos::TotalOrderIndexSet<int> index_set(d, p);
      basis = Teuchos::rcp(new Stokhos::SmolyakBasis<int,double>(bases, index_set, drop));
    }

    // Triple product tensor
    typedef Stokhos::Sparse3Tensor<int,double> Cijk_type;
    RCP<Cijk_type> Cijk;
    if (full)
      Cijk = basis->computeTripleProductTensor();
    else
      Cijk = basis->computeLinearTripleProductTensor();

    int sz = basis->size();
    std::cout << "basis size = " << sz
              << " num nonzero Cijk entries = " << Cijk->num_entries()
              << std::endl;

    // Setup tiles
    if (tile_size > sz) tile_size = sz;
    int j_sz = sz;
    int k_sz = sz;
    if (!full) k_sz = basis->dimension()+1;
    int nj_tiles = j_sz / tile_size;
    int nk_tiles = k_sz / tile_size;
    if (j_sz - nj_tiles*tile_size > 0) ++nj_tiles;
    if (k_sz - nk_tiles*tile_size > 0) ++nk_tiles;

    Array<CijkNonzeros> nz(sz);
    for (int i=0; i<sz; ++i) {
      nz[i].i = i;
      nz[i].nz_tiles.resize(nj_tiles);
      for (int j=0; j<nj_tiles; ++j)
        nz[i].nz_tiles[j].resize(nk_tiles);
    }

    // Get number of nonzeros in Cijk for each i
    Cijk_type::k_iterator k_begin = Cijk->k_begin();
    Cijk_type::k_iterator k_end = Cijk->k_end();
    for (Cijk_type::k_iterator k_it=k_begin; k_it!=k_end; ++k_it) {
      int k = index(k_it);
      int k_tile = k / tile_size;
      Cijk_type::kj_iterator j_begin = Cijk->j_begin(k_it);
      Cijk_type::kj_iterator j_end = Cijk->j_end(k_it);
      for (Cijk_type::kj_iterator j_it = j_begin; j_it != j_end; ++j_it) {
        int j = index(j_it);
        int j_tile = j / tile_size;
        Cijk_type::kji_iterator i_begin = Cijk->i_begin(j_it);
        Cijk_type::kji_iterator i_end = Cijk->i_end(j_it);
        for (Cijk_type::kji_iterator i_it = i_begin; i_it != i_end; ++i_it) {
          int i = index(i_it);
          ++nz[i].total_nz;
          ++nz[i].nz_tiles[j_tile][k_tile];
        }
      }
    }

    // Sort based on total number of nonzeros
    std::sort(nz.begin(), nz.end(), NZCompare());

    // Print nonzeros
    int w_index = 3;
    int w_nz = 5;
    int w_tile = 4;
    for (int i=0; i<nz.size(); ++i) {
      int idx = nz[i].i;
      std::cout << std::setw(w_index) << idx << " " << basis->term(idx) << ": "
                << std::setw(w_nz) << nz[i].total_nz << ", ";
      for (int j=0; j<nj_tiles; ++j)
        for (int k=0; k<nk_tiles; ++k)
          std::cout << std::setw(w_tile) << nz[i].nz_tiles[j][k] << " ";
      std::cout << std::endl;
    }

    // Add up the nonzeros for each (j,k) tile
    Array< Array<int> > total_nz_tiles(nj_tiles);
    int total_nz = 0;
    for (int j=0; j<nj_tiles; ++j)
      total_nz_tiles[j].resize(nk_tiles);
    for (int i=0; i<nz.size(); ++i) {
      total_nz += nz[i].total_nz;
      for (int j=0; j<nj_tiles; ++j)
        for (int k=0; k<nk_tiles; ++k)
          total_nz_tiles[j][k] += nz[i].nz_tiles[j][k];
    }
    int w_total = (w_index+1) + (2*basis->dimension()+5) + w_nz;
    std::cout << std::endl << std::setw(w_total) << total_nz << ", ";
    for (int j=0; j<nj_tiles; ++j)
      for (int k=0; k<nk_tiles; ++k)
        std::cout << std::setw(w_tile) << total_nz_tiles[j][k] << " ";
    std::cout << std::endl;

    // Now partition Cijk for each tile
    Array< Array< RCP<Cijk_type> > > Cijk_tile(nj_tiles);
    for (int j=0; j<nj_tiles; ++j) {
      Cijk_tile[j].resize(nk_tiles);
      for (int k=0; k<nk_tiles; ++k)
        Cijk_tile[j][k] = rcp(new Cijk_type);
    }
    for (Cijk_type::k_iterator k_it=k_begin; k_it!=k_end; ++k_it) {
      int k = index(k_it);
      int k_tile = k / tile_size;
      Cijk_type::kj_iterator j_begin = Cijk->j_begin(k_it);
      Cijk_type::kj_iterator j_end = Cijk->j_end(k_it);
      for (Cijk_type::kj_iterator j_it = j_begin; j_it != j_end; ++j_it) {
        int j = index(j_it);
        int j_tile = j / tile_size;
        Cijk_type::kji_iterator i_begin = Cijk->i_begin(j_it);
        Cijk_type::kji_iterator i_end = Cijk->i_end(j_it);
        for (Cijk_type::kji_iterator i_it = i_begin; i_it != i_end; ++i_it) {
          int i = index(i_it);
          double c = value(i_it);
          Cijk_tile[j_tile][k_tile]->add_term(i,j,k,c);
        }
      }
    }
    for (int j=0; j<nj_tiles; ++j)
      for (int k=0; k<nk_tiles; ++k)
        Cijk_tile[j][k]->fillComplete();

    Array< Array< std::map<int,int> > > nz_tile(nj_tiles);
    Array< Array< Array< std::pair<int,int> > > > sorted_nz_tile(nj_tiles);
    for (int j_tile=0; j_tile<nj_tiles; ++j_tile) {
      nz_tile[j_tile].resize(nk_tiles);
      sorted_nz_tile[j_tile].resize(nk_tiles);
      for (int k_tile=0; k_tile<nk_tiles; ++k_tile) {

        // Count nonzeros for each i, for each tile
        Cijk_type::k_iterator k_begin = Cijk_tile[j_tile][k_tile]->k_begin();
        Cijk_type::k_iterator k_end = Cijk_tile[j_tile][k_tile]->k_end();
        for (Cijk_type::k_iterator k_it=k_begin; k_it!=k_end; ++k_it) {
          //int k = index(k_it);
          Cijk_type::kj_iterator j_begin = Cijk_tile[j_tile][k_tile]->j_begin(k_it);
          Cijk_type::kj_iterator j_end = Cijk_tile[j_tile][k_tile]->j_end(k_it);
          for (Cijk_type::kj_iterator j_it = j_begin; j_it != j_end; ++j_it) {
            //int j = index(j_it);
            Cijk_type::kji_iterator i_begin = Cijk_tile[j_tile][k_tile]->i_begin(j_it);
            Cijk_type::kji_iterator i_end = Cijk_tile[j_tile][k_tile]->i_end(j_it);
            for (Cijk_type::kji_iterator i_it = i_begin; i_it != i_end; ++i_it) {
              int i = index(i_it);
              if (nz_tile[j_tile][k_tile].count(i) == 0)
                nz_tile[j_tile][k_tile][i] = 1;
              else
                ++(nz_tile[j_tile][k_tile][i]);
            }
          }
        }

        // Sort based on non-zeros for each i, for each tile
        sorted_nz_tile[j_tile][k_tile].resize(nz_tile[j_tile][k_tile].size());
        int idx=0;
        for (std::map<int,int>::iterator it = nz_tile[j_tile][k_tile].begin();
             it != nz_tile[j_tile][k_tile].end(); ++it) {
          sorted_nz_tile[j_tile][k_tile][idx] = std::make_pair(it->first, it->second);
          ++idx;
        }
        std::sort( sorted_nz_tile[j_tile][k_tile].begin(),
                   sorted_nz_tile[j_tile][k_tile].end(),
                   NZPairCompare() );

        // Print number of non-zeros for each i, for each tile
        std::cout << std::endl
                  << "Tile (" << j_tile << ", " << k_tile << "):" << std::endl;
        for (int i=0; i<sorted_nz_tile[j_tile][k_tile].size(); ++i) {
          int idx = sorted_nz_tile[j_tile][k_tile][i].first;
          std::cout << std::setw(w_index) << idx << " " << basis->term(idx) << ": "
                    << std::setw(w_nz) << sorted_nz_tile[j_tile][k_tile][i].second
                    << std::endl;
          if (i % 32 == 31) std::cout << std::endl;
        }
      }
    }

  }
  catch (std::exception& e) {
    std::cout << e.what() << std::endl;
  }

#ifdef HAVE_MPI
  // Pair the MPI_Init above with a finalize before exiting
  MPI_Finalize();
#endif

  return 0;
}
int main(int argc, char **argv) {
  int num_k = 0;
  try {

    // Initialize MPI
#ifdef HAVE_MPI
    MPI_Init(&argc,&argv);
#endif

    // Setup command line options
    Teuchos::CommandLineProcessor CLP;
    CLP.setDocString(
      "This example generates the sparsity pattern for the block stochastic Galerkin matrix.\n");
    int d = 3;
    CLP.setOption("dimension", &d, "Stochastic dimension");
    int p = 5;
    CLP.setOption("order", &p, "Polynomial order");
    double drop = 1.0e-15;
    CLP.setOption("drop", &drop, "Drop tolerance");
    std::string file_base = "A";
    CLP.setOption("base filename", &file_base, "Base filename for matrix market files");
    BasisType basis_type = LEGENDRE;
    CLP.setOption("basis", &basis_type, num_basis_types, basis_type_values, basis_type_names, "Basis type");
    bool full = true;
    CLP.setOption("full", "linear", &full, "Use full or linear expansion");
    bool use_old = false;
    CLP.setOption("old", "new", &use_old, "Use old or new Cijk algorithm");

    // Parse arguments
    CLP.parse( argc, argv );

    // Basis
    Teuchos::Array< Teuchos::RCP<const Stokhos::OneDOrthogPolyBasis<int,double> > > bases(d);
    for (int i=0; i<d; i++) {
      if (basis_type == HERMITE)
        bases[i] = Teuchos::rcp(new Stokhos::HermiteBasis<int,double>(p));
      else if (basis_type == LEGENDRE)
        bases[i] = Teuchos::rcp(new Stokhos::LegendreBasis<int,double>(p));
      else if (basis_type == RYS)
        bases[i] = Teuchos::rcp(new Stokhos::RysBasis<int,double>(p, 1.0, false));
    }
    Teuchos::RCP<const Stokhos::CompletePolynomialBasis<int,double> > basis =
      Teuchos::rcp(new Stokhos::CompletePolynomialBasis<int,double>(bases, drop, use_old));

    // Triple product tensor
    Teuchos::RCP<Stokhos::Sparse3Tensor<int,double> > Cijk;
    if (full)
      num_k = basis->size();
    else
      num_k = basis->dimension()+1;
    Cijk = basis->computeTripleProductTensor(num_k);

    std::cout << "basis size = " << basis->size()
              << " num nonzero Cijk entries = " << Cijk->num_entries()
              << std::endl;

#ifdef HAVE_MPI
    Epetra_MpiComm comm(MPI_COMM_WORLD);
#else
    Epetra_SerialComm comm;
#endif

    // Number of stochastic rows
    int num_rows = basis->size();

    // Replicated local map
    Epetra_LocalMap map(num_rows, 0, comm);

    // Loop over Cijk entries including a non-zero in the graph at
    // indices (i,j) if Cijk is non-zero for each k
    typedef Stokhos::Sparse3Tensor<int,double> Cijk_type;
    double one = 1.0;
    for (Cijk_type::k_iterator k_it=Cijk->k_begin(); k_it!=Cijk->k_end(); ++k_it) {
      int k = index(k_it);
      Epetra_CrsMatrix mat(Copy, map, 1);
      for (Cijk_type::kj_iterator j_it = Cijk->j_begin(k_it); j_it != Cijk->j_end(k_it); ++j_it) {
        int j = index(j_it);
        for (Cijk_type::kji_iterator i_it = Cijk->i_begin(j_it); i_it != Cijk->i_end(j_it); ++i_it) {
          int i = index(i_it);
          mat.InsertGlobalValues(i, 1, &one, &j);
        }
      }
      mat.FillComplete();

      // Construct file name
      std::stringstream ss;
      ss << file_base << "_" << k << ".mm";
      std::string file = ss.str();

      // Save matrix to file
      EpetraExt::RowMatrixToMatrixMarketFile(file.c_str(), mat);
    }

  }
  catch (std::exception& e) {
    std::cout << e.what() << std::endl;
  }

#ifdef HAVE_MPI
  // Pair the MPI_Init above with a finalize before exiting
  MPI_Finalize();
#endif

  return 0;
}
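// The per-k loop above inserts a graph entry at (i, j) whenever C_ijk is
// nonzero for the current k, then writes one sparsity pattern per k. A
// minimal standalone sketch of the same idea, using STL containers instead
// of Epetra and a small made-up list of (i, j, k) triplets, is shown below.
#include <iostream>
#include <map>
#include <set>

int main() {
  // Hypothetical nonzero triplets (i, j, k) of a sparse 3-tensor.
  const int triplets[][3] = { {0,0,0}, {1,1,0}, {0,1,1}, {1,0,1}, {2,2,1} };

  // One adjacency structure per k: graph[k][i] = set of column indices j.
  std::map<int, std::map<int, std::set<int> > > graph;
  for (const auto& t : triplets)
    graph[t[2]][t[0]].insert(t[1]);

  // Print each per-k pattern, mirroring the one-matrix-per-k output above.
  for (const auto& per_k : graph) {
    std::cout << "k = " << per_k.first << ":" << std::endl;
    for (const auto& row : per_k.second) {
      std::cout << "  row " << row.first << ":";
      for (int j : row.second)
        std::cout << " " << j;
      std::cout << std::endl;
    }
  }
  return 0;
}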
int main(int narg, char *arg[]) {

  Teuchos::GlobalMPISession mpiSession(&narg, &arg, 0);
  Platform &platform = Tpetra::DefaultPlatform::getDefaultPlatform();
  RCP<const Teuchos::Comm<int> > CommT = platform.getComm();

  int me = CommT->getRank();
  //int numProcs = CommT->getSize();

  if (me == 0){
    cout << "====================================================================\n"
         << "| |\n"
         << "| Example: Partition APF Mesh |\n"
         << "| |\n"
         << "| Questions? Contact Karen Devine ([email protected]), |\n"
         << "| Erik Boman ([email protected]), |\n"
         << "| Siva Rajamanickam ([email protected]). |\n"
         << "| |\n"
         << "| Pamgen's website: http://trilinos.sandia.gov/packages/pamgen |\n"
         << "| Zoltan2's website: http://trilinos.sandia.gov/packages/zoltan2 |\n"
         << "| Trilinos website: http://trilinos.sandia.gov |\n"
         << "| |\n"
         << "====================================================================\n";
  }

#ifdef HAVE_MPI
  if (me == 0) {
    cout << "PARALLEL executable \n";
  }
#else
  if (me == 0) {
    cout << "SERIAL executable \n";
  }
#endif

  /***************************************************************************/
  /******************************* GET INPUTS ********************************/
  /***************************************************************************/

  // default values for command-line arguments
  std::string meshFileName("4/");
  std::string modelFileName("torus.dmg");
  std::string action("zoltan_hg");
  std::string parma_method("VtxElm");
  std::string output_loc("");
  int nParts = CommT->getSize();
  double imbalance = 1.1;

  // Read run-time options.
  Teuchos::CommandLineProcessor cmdp (false, false);
  cmdp.setOption("meshfile", &meshFileName,
                 "Mesh file with APF specifications (.smb file(s))");
  cmdp.setOption("modelfile", &modelFileName,
                 "Model file with APF specifications (.dmg file)");
  cmdp.setOption("action", &action,
                 "Method to use: mj, scotch, zoltan_rcb, zoltan_hg, parma or color");
  cmdp.setOption("parma_method", &parma_method,
                 "Method to use: Vertex, Edge, Element, VtxElm, VtxEdgeElm, ElmLtVtx, Ghost, or Shape");
  cmdp.setOption("nparts", &nParts,
                 "Number of parts to create");
  cmdp.setOption("imbalance", &imbalance,
                 "Target Imbalance for first partitioner");
  cmdp.setOption("output", &output_loc,
                 "Location of new partitioned apf mesh. Ex: 4/torus.smb");
  cmdp.parse(narg, arg);

  /***************************************************************************/
  /********************** GET CELL TOPOLOGY **********************************/
  /***************************************************************************/

  // Get dimensions
  //int dim = 3;

  /***************************************************************************/
  /***************************** GENERATE MESH *******************************/
  /***************************************************************************/

#ifdef HAVE_ZOLTAN2_PARMA

  if (me == 0) cout << "Generating mesh ... \n\n";

  // Setup for SCOREC
  PCU_Comm_Init();

  // Generate mesh with MDS
  gmi_register_mesh();
  apf::Mesh2* m = apf::loadMdsMesh(modelFileName.c_str(), meshFileName.c_str());

  runTest(CommT, m, action, parma_method, nParts, imbalance, "partition");
  runTest(CommT, m, "parma", parma_method, nParts, imbalance, "parma");

  if (output_loc != "") {
    m->writeNative(output_loc.c_str());
  }

  // delete mesh
  if (me == 0) cout << "Deleting the mesh ... \n\n";

  // Delete APF Mesh
  m->destroyNative();
  apf::destroyMesh(m);

  // End communications
  PCU_Comm_Free();

#endif

  if (me == 0)
    std::cout << "PASS" << std::endl;

  return 0;
}
int main_(Teuchos::CommandLineProcessor &clp, int argc, char *argv[]) {
#include <MueLu_UseShortNames.hpp>

  using Teuchos::RCP;
  using Teuchos::rcp;
  using Teuchos::ArrayRCP;
  using Teuchos::TimeMonitor;

  // =========================================================================
  // MPI initialization using Teuchos
  // =========================================================================
  Teuchos::GlobalMPISession mpiSession(&argc, &argv, NULL);
  RCP<const Teuchos::Comm<int> > comm = Teuchos::DefaultComm<int>::getComm();

  // =========================================================================
  // Convenient definitions
  // =========================================================================
  typedef Teuchos::ScalarTraits<SC> STS;
  SC one = STS::one(), zero = STS::zero();

  RCP<Teuchos::FancyOStream> fancy = Teuchos::fancyOStream(Teuchos::rcpFromRef(std::cout));
  Teuchos::FancyOStream& out = *fancy;
  out.setOutputToRootOnly(0);

  // =========================================================================
  // Parameters initialization
  // =========================================================================
  GO nx = 100, ny = 100, nz = 100;
  Galeri::Xpetra::Parameters<GO> galeriParameters(clp, nx, ny, nz, "Laplace2D"); // manage parameters of the test case
  Xpetra::Parameters             xpetraParameters(clp);                          // manage parameters of Xpetra

  std::string xmlFileName = "";   clp.setOption("xml",     &xmlFileName, "read parameters from a file");
  int         numRebuilds = 0;    clp.setOption("rebuild", &numRebuilds, "#times to rebuild hierarchy");
  bool        useFilter   = true; clp.setOption("filter",  "nofilter", &useFilter, "Print out only Setup times");
  bool        modify      = true; clp.setOption("modify",  "nomodify", &modify,    "Change values of the matrix used for reuse");

  clp.recogniseAllOptions(true);
  switch (clp.parse(argc, argv)) {
    case Teuchos::CommandLineProcessor::PARSE_HELP_PRINTED:        return EXIT_SUCCESS;
    case Teuchos::CommandLineProcessor::PARSE_ERROR:
    case Teuchos::CommandLineProcessor::PARSE_UNRECOGNIZED_OPTION: return EXIT_FAILURE;
    case Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL:          break;
  }

  Xpetra::UnderlyingLib lib = xpetraParameters.GetLib();

  ParameterList paramList;
  paramList.set("verbosity", "none");
  if (xmlFileName != "")
    Teuchos::updateParametersFromXmlFileAndBroadcast(xmlFileName, Teuchos::Ptr<ParameterList>(&paramList), *comm);

  // Retrieve matrix parameters (they may have been changed on the command line)
  // [for instance, if we changed matrix type from 2D to 3D we need to update nz]
  ParameterList galeriList = galeriParameters.GetParameterList();

  // =========================================================================
  // Problem construction
  // =========================================================================
  // For comments, see Driver.cpp
  out << "========================================================\n" << xpetraParameters << galeriParameters;
  std::string matrixType = galeriParameters.GetMatrixType();
  RCP<Matrix>      A, B;
  RCP<const Map>   map;
  RCP<MultiVector> coordinates, nullspace;
  ConstructData(matrixType, galeriList, lib, comm, A, map, coordinates, nullspace);

  if (modify) {
    galeriList.set("stretchx", 2.2);
    galeriList.set("stretchy", 1.2);
    galeriList.set("stretchz", 0.3);
  }
  ConstructData(matrixType, galeriList, lib, comm, B, map, coordinates, nullspace);

  out << "Processor subdomains in x direction: " << galeriList.get<GO>("mx") << std::endl
      << "Processor subdomains in y direction: " << galeriList.get<GO>("my") << std::endl
      << "Processor subdomains in z direction: " << galeriList.get<GO>("mz") << std::endl
      << "========================================================" << std::endl;

  // =========================================================================
  // Setups and solves
  // =========================================================================
  RCP<Vector> X = VectorFactory::Build(map);
  RCP<Vector> Y = VectorFactory::Build(map);
  Y->setSeed(846930886);
  Y->randomize();

  const int nIts = 9;

  std::string thickSeparator = "=============================================================";
  std::string thinSeparator  = "-------------------------------------------------------------";

  // =========================================================================
  // Setup #1 (no reuse)
  // =========================================================================
  out << thickSeparator << " no reuse " << thickSeparator << std::endl;
  {
    RCP<Hierarchy> H;

    // Run multiple builds for matrix A and time them
    RCP<Teuchos::Time> tm = TimeMonitor::getNewTimer("Setup #1: no reuse");
    for (int i = 0; i <= numRebuilds; i++) {
      out << thinSeparator << " no reuse (rebuild #" << i << ") " << thinSeparator << std::endl;

      // Start timing (skip first build to reduce jitter)
      if (!(numRebuilds && i == 0))
        tm->start();

      A->SetMaxEigenvalueEstimate(-one);
      H = CreateHierarchy(A, paramList, coordinates);

      // Stop timing
      if (!(numRebuilds && i == 0)) {
        tm->stop();
        tm->incrementNumCalls();
      }
    }

    X->putScalar(zero);
    H->Iterate(*Y, *X, nIts);

    out << "residual(A) = " << Utilities::ResidualNorm(*A, *X, *Y)[0] << " [no reuse]" << std::endl;

    // Run a build for matrix B to record its convergence
    B->SetMaxEigenvalueEstimate(-one);
    H = CreateHierarchy(B, paramList, coordinates);

    X->putScalar(zero);
    H->Iterate(*Y, *X, nIts);

    out << "residual(B) = " << Utilities::ResidualNorm(*B, *X, *Y)[0] << " [no reuse]" << std::endl;
  }

  // =========================================================================
  // Setup #2-inf (reuse)
  // =========================================================================
  std::vector<std::string> reuseTypes, reuseNames;
  reuseTypes.push_back("S");  reuseNames.push_back("smoothers");
  reuseTypes.push_back("tP"); reuseNames.push_back("tentative P");
  reuseTypes.push_back("RP"); reuseNames.push_back("smoothed P and R");

  for (size_t k = 0; k < reuseTypes.size(); k++) {
    out << thickSeparator << " " << reuseTypes[k] << " " << thickSeparator << std::endl;

    A->SetMaxEigenvalueEstimate(-one);

    paramList.set("reuse: type", reuseTypes[k]);

    out << thinSeparator << " " << reuseTypes[k] << " (initial) " << thinSeparator << std::endl;
    RCP<Hierarchy> H = CreateHierarchy(A, paramList, coordinates);

    X->putScalar(zero);
    H->Iterate(*Y, *X, nIts);

    out << "residual(A) = " << Utilities::ResidualNorm(*A, *X, *Y)[0] << " [reuse \"" << reuseNames[k] << "\"]" << std::endl;

    // Reuse setup
    RCP<Matrix> Bcopy = Xpetra::MatrixFactory2<Scalar, LocalOrdinal, GlobalOrdinal, Node>::BuildCopy(B);

    RCP<Teuchos::Time> tm = TimeMonitor::getNewTimer("Setup #" + MueLu::toString(k+2) + ": reuse " + reuseNames[k]);
    for (int i = 0; i <= numRebuilds; i++) {
      out << thinSeparator << " " << reuseTypes[k] << " (rebuild #" << i << ") " << thinSeparator << std::endl;

      // Start timing (skip first build to reduce jitter)
      if (!(numRebuilds && i == 0))
        tm->start();

      B->SetMaxEigenvalueEstimate(-one);
      ReuseHierarchy(B, *H);

      // Stop timing
      if (!(numRebuilds && i == 0)) {
        tm->stop();
        tm->incrementNumCalls();
      }

      X->putScalar(zero);
      H->Iterate(*Y, *X, nIts);

      out << "residual(B) = " << Utilities::ResidualNorm(*B, *X, *Y)[0] << " [reuse \"" << reuseNames[k] << "\"]" << std::endl;

      // Change the pointers so that reuse is not a no-op
      B.swap(Bcopy);
    }
  }
  out << thickSeparator << thickSeparator << std::endl;

  {
    const bool alwaysWriteLocal = true;
    const bool writeGlobalStats = true;
    const bool writeZeroTimers  = false;
    const bool ignoreZeroTimers = true;
    const std::string filter    = (useFilter ? "Setup #" : "");
    TimeMonitor::summarize(A->getRowMap()->getComm().ptr(), std::cout, alwaysWriteLocal,
                           writeGlobalStats, writeZeroTimers, Teuchos::Union, filter, ignoreZeroTimers);
  }

  return EXIT_SUCCESS;
}
int main(int argc, char* argv[]) {
  int ierr = 0;
  int p = 1;
  int w = p+7;
  int w_name = 13;

  try {

    // Set up command line options
    Teuchos::CommandLineProcessor clp;
    clp.setDocString("This program tests the speed of various forward mode AD implementations for a finite-element-like Jacobian fill");
    int work_count = 200000;
    int num_eqns_begin = 5;
    int num_eqns_end = 65;
    int num_eqns_delta = 10;
    int rt = 0;
    clp.setOption("wc", &work_count, "Work count = num_nodes*num_eqns");
    clp.setOption("p_begin", &num_eqns_begin, "Initial number of equations");
    clp.setOption("p_end", &num_eqns_end, "Final number of equations");
    clp.setOption("p_delta", &num_eqns_delta, "Step in number of equations");
    clp.setOption("rt", &rt, "Include ADOL-C retaping test");

    // Parse options
    Teuchos::CommandLineProcessor::EParseCommandLineReturn parseReturn = clp.parse(argc, argv);
    if (parseReturn != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL)
      return 1;

    // Print header
    std::cout.setf(std::ios::right);
    std::cout << std::setw(w_name) << "Name" << " ";
    for (int num_eqns = num_eqns_begin; num_eqns <= num_eqns_end; num_eqns += num_eqns_delta)
      std::cout << std::setw(w) << num_eqns << " ";
    std::cout << std::endl;
    for (int j=0; j<w_name; j++)
      std::cout << '=';
    std::cout << " ";
    for (int num_eqns = num_eqns_begin; num_eqns <= num_eqns_end; num_eqns += num_eqns_delta) {
      for (int j=0; j<w; j++)
        std::cout << '=';
      std::cout << " ";
    }
    std::cout << std::endl;

    // Analytic
    std::vector<double> times_analytic =
      do_times(work_count, num_eqns_begin, num_eqns_end, num_eqns_delta, analytic_jac_fill);
    print_times(times_analytic, times_analytic, "Analytic", p, w, w_name);

#ifdef HAVE_ADIC
    // Note there seems to be a bug in ADIC where doing more than one num_eqns
    // value results in incorrect timings after the first.  Doing one value
    // at a time seems to give correct values though.
    std::vector<double> times_adic =
      do_times(work_count, num_eqns_begin, num_eqns_end, num_eqns_delta, adic_jac_fill);
    print_times(times_adic, times_analytic, "ADIC", p, w, w_name);
#endif

    // Original Fad
    std::vector<double> times_sfad =
      do_times_sfad<Sacado::Fad::SFad>(work_count, num_eqns_begin, num_eqns_end, num_eqns_delta);
    print_times(times_sfad, times_analytic, "SFAD", p, w, w_name);

    std::vector<double> times_slfad =
      do_times_sfad<Sacado::Fad::SLFad>(work_count, num_eqns_begin, num_eqns_end, num_eqns_delta);
    print_times(times_slfad, times_analytic, "SLFAD", p, w, w_name);

    std::vector<double> times_dfad =
      do_times_fad<Sacado::Fad::DFad>(work_count, num_eqns_begin, num_eqns_end, num_eqns_delta);
    print_times(times_dfad, times_analytic, "DFAD", p, w, w_name);

    // ELR Fad
    std::vector<double> times_elr_sfad =
      do_times_sfad<Sacado::ELRFad::SFad>(work_count, num_eqns_begin, num_eqns_end, num_eqns_delta);
    print_times(times_elr_sfad, times_analytic, "ELRSFAD", p, w, w_name);

    std::vector<double> times_elr_slfad =
      do_times_sfad<Sacado::ELRFad::SLFad>(work_count, num_eqns_begin, num_eqns_end, num_eqns_delta);
    print_times(times_elr_slfad, times_analytic, "ELRSLFAD", p, w, w_name);

    std::vector<double> times_elr_dfad =
      do_times_fad<Sacado::ELRFad::DFad>(work_count, num_eqns_begin, num_eqns_end, num_eqns_delta);
    print_times(times_elr_dfad, times_analytic, "ELRDFAD", p, w, w_name);

    // Cache Fad
    std::vector<double> times_cache_sfad =
      do_times_sfad<Sacado::CacheFad::SFad>(work_count, num_eqns_begin, num_eqns_end, num_eqns_delta);
    print_times(times_cache_sfad, times_analytic, "CacheSFAD", p, w, w_name);

    std::vector<double> times_cache_slfad =
      do_times_sfad<Sacado::CacheFad::SLFad>(work_count, num_eqns_begin, num_eqns_end, num_eqns_delta);
    print_times(times_cache_slfad, times_analytic, "CacheSLFAD", p, w, w_name);

    std::vector<double> times_cache_dfad =
      do_times_fad<Sacado::CacheFad::DFad>(work_count, num_eqns_begin, num_eqns_end, num_eqns_delta);
    print_times(times_cache_dfad, times_analytic, "CacheDFAD", p, w, w_name);

    // ELR Cache Fad
    std::vector<double> times_cache_elr_sfad =
      do_times_sfad<Sacado::ELRCacheFad::SFad>(work_count, num_eqns_begin, num_eqns_end, num_eqns_delta);
    print_times(times_cache_elr_sfad, times_analytic, "ELRCacheSFAD", p, w, w_name);

    std::vector<double> times_cache_elr_slfad =
      do_times_sfad<Sacado::ELRCacheFad::SLFad>(work_count, num_eqns_begin, num_eqns_end, num_eqns_delta);
    print_times(times_cache_elr_slfad, times_analytic, "ELRCacheSLFAD", p, w, w_name);

    std::vector<double> times_cache_elr_dfad =
      do_times_fad<Sacado::ELRCacheFad::DFad>(work_count, num_eqns_begin, num_eqns_end, num_eqns_delta);
    print_times(times_cache_elr_dfad, times_analytic, "ELRCacheDFAD", p, w, w_name);
  }
  catch (std::exception& e) {
    std::cout << e.what() << std::endl;
    ierr = 1;
  }
  catch (const char *s) {
    std::cout << s << std::endl;
    ierr = 1;
  }
  catch (...) {
    std::cout << "Caught unknown exception!" << std::endl;
    ierr = 1;
  }

  return ierr;
}