int main(int argc, char *argv[]) { #ifdef MINIAERO_FPMATH_CHECK _MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() & ~_MM_MASK_INVALID); #endif int num_procs, my_id; #if WITH_MPI MPI_Init(&argc,&argv); MPI_Comm_size(MPI_COMM_WORLD, &num_procs); MPI_Comm_rank(MPI_COMM_WORLD, &my_id); double startTime = 0.0, endTime = 0.0; startTime = MPI_Wtime(); #else time_t startTime=0, endTime=0; time(&startTime); num_procs=1; my_id=0; #endif Options simulation_options; simulation_options.read_options_file(); Kokkos::initialize(argc,argv); run(simulation_options); #if WITH_MPI endTime = MPI_Wtime(); double elapsedTime = endTime-startTime; #else time(&endTime); double elapsedTime = difftime(endTime,startTime); #endif Kokkos::finalize(); #if WITH_MPI MPI_Finalize(); #endif if(my_id==0){ fprintf(stdout,"\n ... Total elapsed time: %8.2f seconds ...\n",elapsedTime); } return 0; }
/**
 * Toggle floating point exceptions -- courtesy of Cody Permann & MOOSE team
 *
 * Which mechanism is used is chosen at compile time from the LIBMESH_HAVE_*
 * configuration macros:
 *  - feenableexcept()/fedisableexcept() with FE_DIVBYZERO | FE_INVALID, or
 *  - the xmmintrin exception mask (only the _MM_MASK_INVALID bit is cleared),
 *    skipped when __SUNPRO_CC is defined.
 *
 * \param on  If true, enable the exceptions and (when sigaction is available)
 *            install libmesh_handleFPE as the SIGFPE handler. If false,
 *            disable/restore the exceptions and reset SIGFPE to SIG_DFL.
 */
void enableFPE(bool on)
{
#if !defined(LIBMESH_HAVE_FEENABLEEXCEPT) && defined(LIBMESH_HAVE_XMMINTRIN_H) && !defined(__SUNPRO_CC)
  // Exception mask captured when enabling, so that the disable path can put
  // it back. Static so that it survives between the two calls.
  // NOTE(review): this declaration is keyed on LIBMESH_HAVE_FEENABLEEXCEPT,
  // while the read in the "off" branch is keyed on LIBMESH_HAVE_FEDISABLEEXCEPT;
  // a configuration with only one of the two defined would not line up --
  // confirm that configure always defines them together.
  static int flags = 0;
#endif

  if (on)
    {
#ifdef LIBMESH_HAVE_FEENABLEEXCEPT
      feenableexcept(FE_DIVBYZERO | FE_INVALID);
#elif LIBMESH_HAVE_XMMINTRIN_H
#  ifndef __SUNPRO_CC
      flags = _MM_GET_EXCEPTION_MASK(); // store the flags
      // Clear only the "invalid operation" mask bit; all other bits are kept.
      _MM_SET_EXCEPTION_MASK(flags & ~_MM_MASK_INVALID);
#  endif
#endif

#if LIBMESH_HAVE_DECL_SIGACTION
      struct sigaction new_action, old_action;

      // Set up the structure to specify the new action.
      new_action.sa_sigaction = libmesh_handleFPE;
      sigemptyset (&new_action.sa_mask);
      new_action.sa_flags = SA_SIGINFO;

      // Query the current SIGFPE disposition first, and leave it alone if
      // SIGFPE is currently being ignored.
      sigaction (SIGFPE, nullptr, &old_action);
      if (old_action.sa_handler != SIG_IGN)
        sigaction (SIGFPE, &new_action, nullptr);
#endif
    }
  else
    {
#ifdef LIBMESH_HAVE_FEDISABLEEXCEPT
      fedisableexcept(FE_DIVBYZERO | FE_INVALID);
#elif LIBMESH_HAVE_XMMINTRIN_H
#  ifndef __SUNPRO_CC
      // Restore the mask that was saved by the last enableFPE(true) call.
      _MM_SET_EXCEPTION_MASK(flags);
#  endif
#endif
      // Back to the default SIGFPE disposition.
      signal(SIGFPE, SIG_DFL);
    }
}
int main(int argc, char *argv[]) { #ifdef ENABLE_INTEL_FLOATING_POINT_EXCEPTIONS cout << "NOTE: enabling floating point exceptions for divide by zero.\n"; _MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() & ~_MM_MASK_INVALID); #endif Teuchos::GlobalMPISession mpiSession(&argc, &argv); int rank = Teuchos::GlobalMPISession::getRank(); #ifdef HAVE_MPI Epetra_MpiComm Comm(MPI_COMM_WORLD); //cout << "rank: " << rank << " of " << numProcs << endl; #else Epetra_SerialComm Comm; #endif Comm.Barrier(); // set breakpoint here to allow debugger attachment to other MPI processes than the one you automatically attached to. Teuchos::CommandLineProcessor cmdp(false,true); // false: don't throw exceptions; true: do return errors for unrecognized options double minTol = 1e-8; bool use3D = false; int refCount = 10; int k = 4; // poly order for field variables int delta_k = use3D ? 3 : 2; // test space enrichment int k_coarse = 0; bool useMumps = true; bool useGMGSolver = true; bool enforceOneIrregularity = true; bool useStaticCondensation = false; bool conformingTraces = false; bool useDiagonalScaling = false; // of the global stiffness matrix in GMGSolver bool printRefinementDetails = false; bool useWeightedGraphNorm = true; // graph norm scaled according to units, more or less int numCells = 2; int AztecOutputLevel = 1; int gmgMaxIterations = 10000; int smootherOverlap = 0; double relativeTol = 1e-6; double D = 1.0; // characteristic length scale cmdp.setOption("polyOrder",&k,"polynomial order for field variable u"); cmdp.setOption("delta_k", &delta_k, "test space polynomial order enrichment"); cmdp.setOption("k_coarse", &k_coarse, "polynomial order for field variables on coarse mesh"); cmdp.setOption("numRefs",&refCount,"number of refinements"); cmdp.setOption("D", &D, "domain dimension"); cmdp.setOption("useConformingTraces", "useNonConformingTraces", &conformingTraces); cmdp.setOption("enforceOneIrregularity", "dontEnforceOneIrregularity", &enforceOneIrregularity); 
cmdp.setOption("smootherOverlap", &smootherOverlap, "overlap for smoother"); cmdp.setOption("printRefinementDetails", "dontPrintRefinementDetails", &printRefinementDetails); cmdp.setOption("azOutput", &AztecOutputLevel, "Aztec output level"); cmdp.setOption("numCells", &numCells, "number of cells in the initial mesh"); cmdp.setOption("useScaledGraphNorm", "dontUseScaledGraphNorm", &useWeightedGraphNorm); // cmdp.setOption("gmgTol", &gmgTolerance, "tolerance for GMG convergence"); cmdp.setOption("relativeTol", &relativeTol, "Energy error-relative tolerance for iterative solver."); cmdp.setOption("gmgMaxIterations", &gmgMaxIterations, "tolerance for GMG convergence"); bool enhanceUField = false; cmdp.setOption("enhanceUField", "dontEnhanceUField", &enhanceUField); cmdp.setOption("useStaticCondensation", "dontUseStaticCondensation", &useStaticCondensation); if (cmdp.parse(argc,argv) != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL) { #ifdef HAVE_MPI MPI_Finalize(); #endif return -1; } double width = D, height = D, depth = D; VarFactory varFactory; // fields: VarPtr u = varFactory.fieldVar("u", L2); VarPtr sigma = varFactory.fieldVar("\\sigma", VECTOR_L2); FunctionPtr n = Function::normal(); // traces: VarPtr u_hat; if (conformingTraces) { u_hat = varFactory.traceVar("\\widehat{u}", u); } else { cout << "Note: using non-conforming traces.\n"; u_hat = varFactory.traceVar("\\widehat{u}", u, L2); } VarPtr sigma_n_hat = varFactory.fluxVar("\\widehat{\\sigma}_{n}", sigma * n); // test functions: VarPtr tau = varFactory.testVar("\\tau", HDIV); VarPtr v = varFactory.testVar("v", HGRAD); BFPtr poissonBF = Teuchos::rcp( new BF(varFactory) ); FunctionPtr alpha = Function::constant(1); // viscosity // tau terms: poissonBF->addTerm(sigma / alpha, tau); poissonBF->addTerm(-u, tau->div()); // (sigma1, tau1) poissonBF->addTerm(u_hat, tau * n); // v terms: poissonBF->addTerm(- sigma, v->grad()); // (mu sigma1, grad v1) poissonBF->addTerm( sigma_n_hat, v); int horizontalCells = 
numCells, verticalCells = numCells, depthCells = numCells; vector<double> domainDimensions; domainDimensions.push_back(width); domainDimensions.push_back(height); vector<int> elementCounts; elementCounts.push_back(horizontalCells); elementCounts.push_back(verticalCells); if (use3D) { domainDimensions.push_back(depth); elementCounts.push_back(depthCells); } MeshPtr mesh, k0Mesh; int H1Order = k + 1; int H1Order_coarse = k_coarse + 1; if (!use3D) { Teuchos::ParameterList pl; map<int,int> trialOrderEnhancements; if (enhanceUField) { trialOrderEnhancements[u->ID()] = 1; } BFPtr poissonBilinearForm = poissonBF; pl.set("useMinRule", true); pl.set("bf",poissonBilinearForm); pl.set("H1Order", H1Order); pl.set("delta_k", delta_k); pl.set("horizontalElements", horizontalCells); pl.set("verticalElements", verticalCells); pl.set("divideIntoTriangles", false); pl.set("useConformingTraces", conformingTraces); pl.set("trialOrderEnhancements", &trialOrderEnhancements); pl.set("x0",(double)0); pl.set("y0",(double)0); pl.set("width", width); pl.set("height",height); mesh = MeshFactory::quadMesh(pl); pl.set("H1Order", H1Order_coarse); k0Mesh = MeshFactory::quadMesh(pl); } else { mesh = MeshFactory::rectilinearMesh(poissonBF, domainDimensions, elementCounts, H1Order, delta_k); k0Mesh = MeshFactory::rectilinearMesh(poissonBF, domainDimensions, elementCounts, H1Order_coarse, delta_k); } mesh->registerObserver(k0Mesh); // ensure that the k0 mesh refinements track those of the solution mesh RHSPtr rhs = RHS::rhs(); // zero FunctionPtr sin_pi_x = Teuchos::rcp( new Sin_ax(PI/D) ); FunctionPtr sin_pi_y = Teuchos::rcp( new Sin_ay(PI/D) ); FunctionPtr u_exact = sin_pi_x * sin_pi_y; FunctionPtr f = -(2.0 * PI * PI / (D * D)) * sin_pi_x * sin_pi_y; rhs->addTerm( f * v ); BCPtr bc = BC::bc(); SpatialFilterPtr boundary = SpatialFilter::allSpace(); bc->addDirichlet(u_hat, boundary, u_exact); IPPtr graphNorm; FunctionPtr h = Teuchos::rcp( new hFunction() ); if (useWeightedGraphNorm) { graphNorm = 
IP::ip(); graphNorm->addTerm( tau->div() ); // u graphNorm->addTerm( (h / alpha) * tau - h * v->grad() ); // sigma graphNorm->addTerm( v ); // boundary term (adjoint to u) graphNorm->addTerm( h * tau ); // // new effort, with the idea that the test norm should be considered in reference space, basically // graphNorm = IP::ip(); // graphNorm->addTerm( tau->div() ); // u // graphNorm->addTerm( tau / h - v->grad() ); // sigma // graphNorm->addTerm( v / h ); // boundary term (adjoint to u) // graphNorm->addTerm( tau / h ); } else { map<int, double> trialWeights; // on the squared terms in the trial space norm trialWeights[u->ID()] = 1.0 / (D * D); trialWeights[sigma->ID()] = 1.0; graphNorm = poissonBF->graphNorm(trialWeights, 1.0); // 1.0: weight on the L^2 terms } SolutionPtr solution = Solution::solution(mesh, bc, rhs, graphNorm); solution->setUseCondensedSolve(useStaticCondensation); mesh->registerSolution(solution); // sign up for projection of old solution onto refined cells. double energyThreshold = 0.2; RefinementStrategy refinementStrategy( solution, energyThreshold ); refinementStrategy.setReportPerCellErrors(true); refinementStrategy.setEnforceOneIrregularity(enforceOneIrregularity); Teuchos::RCP<Solver> coarseSolver, fineSolver; if (useMumps) { #ifdef HAVE_AMESOS_MUMPS coarseSolver = Teuchos::rcp( new MumpsSolver(512, true) ); #else cout << "useMumps=true, but MUMPS is not available!\n"; exit(0); #endif } else { coarseSolver = Teuchos::rcp( new KluSolver ); } GMGSolver* gmgSolver; if (useGMGSolver) { double tol = relativeTol; int maxIters = gmgMaxIterations; BCPtr zeroBCs = bc->copyImposingZero(); gmgSolver = new GMGSolver(zeroBCs, k0Mesh, graphNorm, mesh, solution->getDofInterpreter(), solution->getPartitionMap(), maxIters, tol, coarseSolver, useStaticCondensation); gmgSolver->setAztecOutput(AztecOutputLevel); gmgSolver->setUseConjugateGradient(true); gmgSolver->gmgOperator()->setSmootherType(GMGOperator::IFPACK_ADDITIVE_SCHWARZ); 
gmgSolver->gmgOperator()->setSmootherOverlap(smootherOverlap); fineSolver = Teuchos::rcp( gmgSolver ); } else { fineSolver = coarseSolver; } // if (rank==0) cout << "experimentally starting by solving with MUMPS on the fine mesh.\n"; // solution->solve( Teuchos::rcp( new MumpsSolver) ); solution->solve(fineSolver); #ifdef HAVE_EPETRAEXT_HDF5 ostringstream dir_name; dir_name << "poissonCavityFlow_k" << k; HDF5Exporter exporter(mesh,dir_name.str()); exporter.exportSolution(solution,varFactory,0); #endif #ifdef HAVE_AMESOS_MUMPS if (useMumps) coarseSolver = Teuchos::rcp( new MumpsSolver(512, true) ); #endif solution->reportTimings(); if (useGMGSolver) gmgSolver->gmgOperator()->reportTimings(); for (int refIndex=0; refIndex < refCount; refIndex++) { double energyError = solution->energyErrorTotal(); GlobalIndexType numFluxDofs = mesh->numFluxDofs(); if (rank==0) { cout << "Before refinement " << refIndex << ", energy error = " << energyError; cout << " (using " << numFluxDofs << " trace degrees of freedom)." 
<< endl; } bool printToConsole = printRefinementDetails && (rank==0); refinementStrategy.refine(printToConsole); if (useStaticCondensation) { CondensedDofInterpreter* condensedDofInterpreter = dynamic_cast<CondensedDofInterpreter*>(solution->getDofInterpreter().get()); if (condensedDofInterpreter != NULL) { condensedDofInterpreter->reinitialize(); } } GlobalIndexType fineDofs = mesh->globalDofCount(); GlobalIndexType coarseDofs = k0Mesh->globalDofCount(); if (rank==0) { cout << "After refinement, coarse mesh has " << k0Mesh->numActiveElements() << " elements and " << coarseDofs << " dofs.\n"; cout << " Fine mesh has " << mesh->numActiveElements() << " elements and " << fineDofs << " dofs.\n"; } if (!use3D) { ostringstream fineMeshLocation, coarseMeshLocation; fineMeshLocation << "poissonFineMesh_k" << k << "_ref" << refIndex; GnuPlotUtil::writeComputationalMeshSkeleton(fineMeshLocation.str(), mesh, true); // true: label cells coarseMeshLocation << "poissonCoarseMesh_k" << k << "_ref" << refIndex; GnuPlotUtil::writeComputationalMeshSkeleton(coarseMeshLocation.str(), k0Mesh, true); // true: label cells } if (useGMGSolver) // create fresh fineSolver now that the meshes have changed: { #ifdef HAVE_AMESOS_MUMPS if (useMumps) coarseSolver = Teuchos::rcp( new MumpsSolver(512, true) ); #endif double tol = max(relativeTol * energyError, minTol); int maxIters = gmgMaxIterations; BCPtr zeroBCs = bc->copyImposingZero(); gmgSolver = new GMGSolver(zeroBCs, k0Mesh, graphNorm, mesh, solution->getDofInterpreter(), solution->getPartitionMap(), maxIters, tol, coarseSolver, useStaticCondensation); gmgSolver->setAztecOutput(AztecOutputLevel); gmgSolver->setUseDiagonalScaling(useDiagonalScaling); fineSolver = Teuchos::rcp( gmgSolver ); } solution->solve(fineSolver); solution->reportTimings(); if (useGMGSolver) gmgSolver->gmgOperator()->reportTimings(); #ifdef HAVE_EPETRAEXT_HDF5 exporter.exportSolution(solution,varFactory,refIndex+1); #endif } double energyErrorTotal = 
solution->energyErrorTotal(); GlobalIndexType numFluxDofs = mesh->numFluxDofs(); GlobalIndexType numGlobalDofs = mesh->numGlobalDofs(); if (rank==0) { cout << "Final mesh has " << mesh->numActiveElements() << " elements and " << numFluxDofs << " trace dofs ("; cout << numGlobalDofs << " total dofs, including fields).\n"; cout << "Final energy error: " << energyErrorTotal << endl; } #ifdef HAVE_EPETRAEXT_HDF5 exporter.exportSolution(solution,varFactory,0); #endif if (!use3D) { GnuPlotUtil::writeComputationalMeshSkeleton("poissonRefinedMesh", mesh, true); } coarseSolver = Teuchos::rcp((Solver*) NULL); // without this when useMumps = true and running on one rank, we see a crash on exit, which may have to do with MPI being finalized before coarseSolver is deleted. return 0; }
int main(int argc, char* argv[]) { // Parse CLI arguments. TCLAP::CmdLine cmd( "OpenGeoSys-6 software.\n" "Copyright (c) 2012-2018, OpenGeoSys Community " "(http://www.opengeosys.org) " "Distributed under a Modified BSD License. " "See accompanying file LICENSE.txt or " "http://www.opengeosys.org/project/license\n" "version: " + BaseLib::BuildInfo::git_describe + "\n" + "CMake arguments: " + BaseLib::BuildInfo::cmake_args, ' ', BaseLib::BuildInfo::git_describe); TCLAP::UnlabeledValueArg<std::string> project_arg( "project-file", "Path to the ogs6 project file.", true, "", "PROJECT FILE"); cmd.add(project_arg); TCLAP::ValueArg<std::string> outdir_arg("o", "output-directory", "the output directory to write to", false, "", "output directory"); cmd.add(outdir_arg); TCLAP::ValueArg<std::string> log_level_arg("l", "log-level", "the verbosity of logging " "messages: none, error, warn, " "info, debug, all", false, #ifdef NDEBUG "info", #else "all", #endif "log level"); cmd.add(log_level_arg); TCLAP::SwitchArg nonfatal_arg("", "config-warnings-nonfatal", "warnings from parsing the configuration " "file will not trigger program abortion"); cmd.add(nonfatal_arg); TCLAP::SwitchArg unbuffered_cout_arg("", "unbuffered-std-out", "use unbuffered standard output"); cmd.add(unbuffered_cout_arg); #ifndef _WIN32 // TODO: On windows floating point exceptions are not handled // currently TCLAP::SwitchArg enable_fpe_arg("", "enable-fpe", "enables floating point exceptions"); cmd.add(enable_fpe_arg); #endif // _WIN32 cmd.parse(argc, argv); // deactivate buffer for standard output if specified if (unbuffered_cout_arg.isSet()) std::cout.setf(std::ios::unitbuf); ApplicationsLib::LogogSetup logog_setup; logog_setup.setLevel(log_level_arg.getValue()); INFO("This is OpenGeoSys-6 version %s.", BaseLib::BuildInfo::git_describe.c_str()); #ifndef _WIN32 // On windows this command line option is not present. 
// Enable floating point exceptions if (enable_fpe_arg.isSet()) #ifdef __APPLE__ _MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() & ~_MM_MASK_INVALID); #else feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW); #endif // __APPLE__ #endif // _WIN32 #ifdef OGS_USE_PYTHON pybind11::scoped_interpreter guard = ApplicationsLib::setupEmbeddedPython(); (void)guard; #endif BaseLib::RunTime run_time; { auto const start_time = std::chrono::system_clock::now(); auto const time_str = BaseLib::formatDate(start_time); INFO("OGS started on %s.", time_str.c_str()); } auto ogs_status = EXIT_SUCCESS; try { bool solver_succeeded = false; { ApplicationsLib::LinearSolverLibrarySetup linear_solver_library_setup(argc, argv); #if defined(USE_PETSC) vtkSmartPointer<vtkMPIController> controller = vtkSmartPointer<vtkMPIController>::New(); controller->Initialize(&argc, &argv, 1); vtkMPIController::SetGlobalController(controller); logog_setup.setFormatter( std::make_unique<BaseLib::TemplateLogogFormatterSuppressedGCC< TOPIC_LEVEL_FLAG | TOPIC_FILE_NAME_FLAG | TOPIC_LINE_NUMBER_FLAG>>()); #endif run_time.start(); auto project_config = BaseLib::makeConfigTree( project_arg.getValue(), !nonfatal_arg.getValue(), "OpenGeoSysProject"); ProjectData project(*project_config, BaseLib::extractPath(project_arg.getValue()), outdir_arg.getValue()); #ifdef USE_INSITU auto isInsituConfigured = false; //! \ogs_file_param{prj__insitu} if (auto t = project_config->getConfigSubtreeOptional("insitu")) { InSituLib::Initialize( //! \ogs_file_param{prj__insitu__scripts} t->getConfigSubtree("scripts"), BaseLib::extractPath(project_arg.getValue())); isInsituConfigured = true; } #else project_config->ignoreConfigParameter("insitu"); #endif INFO("Initialize processes."); for (auto& p : project.getProcesses()) { p.second->initialize(); } // Check intermediately that config parsing went fine. 
project_config.checkAndInvalidate(); BaseLib::ConfigTree::assertNoSwallowedErrors(); BaseLib::ConfigTree::assertNoSwallowedErrors(); BaseLib::ConfigTree::assertNoSwallowedErrors(); INFO("Solve processes."); auto& time_loop = project.getTimeLoop(); solver_succeeded = time_loop.loop(); #ifdef USE_INSITU if (isInsituConfigured) InSituLib::Finalize(); #endif INFO("[time] Execution took %g s.", run_time.elapsed()); #if defined(USE_PETSC) controller->Finalize(1); #endif } // This nested scope ensures that everything that could possibly // possess a ConfigTree is destructed before the final check below is // done. BaseLib::ConfigTree::assertNoSwallowedErrors(); ogs_status = solver_succeeded ? EXIT_SUCCESS : EXIT_FAILURE; } catch (std::exception& e) { ERR(e.what()); ogs_status = EXIT_FAILURE; } { auto const end_time = std::chrono::system_clock::now(); auto const time_str = BaseLib::formatDate(end_time); INFO("OGS terminated on %s.", time_str.c_str()); } return ogs_status; }
int main() { float *arr = get_arr(); // [4, 3, 2, 1] float *uarr = get_uarr(); // [5, 4, 3, 2] float *arr2 = get_arr2(); // [4, 3, 2, 1] float *uarr2 = get_uarr2(); // [5, 4, 3, 2] __m128 a = get_a(); // [8, 6, 4, 2] __m128 b = get_b(); // [1, 2, 3, 4] // Check that test data is like expected. Assert(((uintptr_t)arr & 0xF) == 0); // arr must be aligned by 16. Assert(((uintptr_t)uarr & 0xF) != 0); // uarr must be unaligned. Assert(((uintptr_t)arr2 & 0xF) == 0); // arr must be aligned by 16. Assert(((uintptr_t)uarr2 & 0xF) != 0); // uarr must be unaligned. // Test that aeq itself works and does not trivially return true on everything. Assert(aeq_("",_mm_load_ps(arr), 4.f, 3.f, 2.f, 0.f, false) == false); #ifdef TEST_M64 Assert(aeq64(u64castm64(0x22446688AACCEEFFULL), 0xABABABABABABABABULL, false) == false); #endif // SSE1 Load instructions: aeq(_mm_load_ps(arr), 4.f, 3.f, 2.f, 1.f); // 4-wide load from aligned address. aeq(_mm_load_ps1(uarr), 2.f, 2.f, 2.f, 2.f); // Load scalar from unaligned address and populate 4-wide. aeq(_mm_load_ss(uarr), 0.f, 0.f, 0.f, 2.f); // Load scalar from unaligned address to lowest, and zero all highest. aeq(_mm_load1_ps(uarr), 2.f, 2.f, 2.f, 2.f); // _mm_load1_ps == _mm_load_ps1 aeq(_mm_loadh_pi(a, (__m64*)uarr), 3.f, 2.f, 4.f, 2.f); // Load two highest addresses, preserve two lowest. aeq(_mm_loadl_pi(a, (__m64*)uarr), 8.f, 6.f, 3.f, 2.f); // Load two lowest addresses, preserve two highest. aeq(_mm_loadr_ps(arr), 1.f, 2.f, 3.f, 4.f); // 4-wide load from an aligned address, but reverse order. aeq(_mm_loadu_ps(uarr), 5.f, 4.f, 3.f, 2.f); // 4-wide load from an unaligned address. // SSE1 Set instructions: aeq(_mm_set_ps(uarr[3], 2.f, 3.f, 4.f), 5.f, 2.f, 3.f, 4.f); // 4-wide set by specifying four immediate or memory operands. aeq(_mm_set_ps1(uarr[3]), 5.f, 5.f, 5.f, 5.f); // 4-wide set by specifying one scalar that is expanded. aeq(_mm_set_ss(uarr[3]), 0.f, 0.f, 0.f, 5.f); // Set scalar at lowest index, zero all higher. 
aeq(_mm_set1_ps(uarr[3]), 5.f, 5.f, 5.f, 5.f); // _mm_set1_ps == _mm_set_ps1 aeq(_mm_setr_ps(uarr[3], 2.f, 3.f, 4.f), 4.f, 3.f, 2.f, 5.f); // 4-wide set by specifying four immediate or memory operands, but reverse order. aeq(_mm_setzero_ps(), 0.f, 0.f, 0.f, 0.f); // Returns a new zero register. // SSE1 Move instructions: aeq(_mm_move_ss(a, b), 8.f, 6.f, 4.f, 4.f); // Copy three highest elements from a, and lowest from b. aeq(_mm_movehl_ps(a, b), 8.f, 6.f, 1.f, 2.f); // Copy two highest elements from a, and take two highest from b and place them to the two lowest in output. aeq(_mm_movelh_ps(a, b), 3.f, 4.f, 4.f, 2.f); // Copy two lowest elements from a, and take two lowest from b and place them to the two highest in output. // SSE1 Store instructions: #ifdef TEST_M64 /*M64*/*(uint64_t*)uarr = 0xCDCDCDCDCDCDCDCDULL; _mm_maskmove_si64(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr); Assert(*(uint64_t*)uarr == 0xCDEEDDCDCDAA99CDULL); // _mm_maskmove_si64: Conditionally store bytes of a 64-bit value. /*M64*/*(uint64_t*)uarr = 0xABABABABABABABABULL; _m_maskmovq(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr); Assert(*(uint64_t*)uarr == 0xABEEDDABABAA99ABULL); // _m_maskmovq is an alias to _mm_maskmove_si64. #endif _mm_store_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_store_ps: 4-wide store to aligned memory address. _mm_store_ps1(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store_ps1: Store lowest scalar to aligned address, duplicating the element 4 times. _mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_store_ss(uarr2, b); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 100.f, 4.f); // _mm_store_ss: Store lowest scalar to unaligned address. Don't adjust higher addresses in memory. 
_mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_store1_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store1_ps == _mm_store_ps1 _mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storeh_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 8.f, 6.f); // _mm_storeh_pi: Store two highest elements to memory. _mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storel_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 4.f, 2.f); // _mm_storel_pi: Store two lowest elements to memory. _mm_storer_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 4.f, 6.f, 8.f); // _mm_storer_ps: 4-wide store to aligned memory address, but reverse the elements on output. _mm_storeu_ps(uarr2, a); aeq(_mm_loadu_ps(uarr2), 8.f, 6.f, 4.f, 2.f); // _mm_storeu_ps: 4-wide store to unaligned memory address. #ifdef TEST_M64 /*M64*/_mm_stream_pi((__m64*)uarr, u64castm64(0x0080FF7F01FEFF40ULL)); Assert(*(uint64_t*)uarr == 0x0080FF7F01FEFF40ULL); // _mm_stream_pi: 2-wide store, but with a non-temporal memory cache hint. #endif _mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_stream_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_stream_ps: 4-wide store, but with a non-temporal memory cache hint. // SSE1 Arithmetic instructions: aeq(_mm_add_ps(a, b), 9.f, 8.f, 7.f, 6.f); // 4-wide add. aeq(_mm_add_ss(a, b), 8.f, 6.f, 4.f, 6.f); // Add lowest element, preserve three highest unchanged from a. aeq(_mm_div_ps(a, _mm_set_ps(2.f, 3.f, 8.f, 2.f)), 4.f, 2.f, 0.5f, 1.f); // 4-wide div. aeq(_mm_div_ss(a, _mm_set_ps(2.f, 3.f, 8.f, 8.f)), 8.f, 6.f, 4.f, 0.25f); // Div lowest element, preserve three highest unchanged from a. aeq(_mm_mul_ps(a, b), 8.f, 12.f, 12.f, 8.f); // 4-wide mul. aeq(_mm_mul_ss(a, b), 8.f, 6.f, 4.f, 8.f); // Mul lowest element, preserve three highest unchanged from a. #ifdef TEST_M64 __m64 m1 = get_m1(); /*M64*/aeq64(_mm_mulhi_pu16(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // Multiply u16 channels, and store high parts. 
/*M64*/aeq64( _m_pmulhuw(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // _m_pmulhuw is an alias to _mm_mulhi_pu16. __m64 m2 = get_m2(); /*M64*/aeq64(_mm_sad_pu8(m1, m2), 0x368ULL); // Compute abs. differences of u8 channels, and sum those up to a single 16-bit scalar. /*M64*/aeq64( _m_psadbw(m1, m2), 0x368ULL); // _m_psadbw is an alias to _mm_sad_pu8. #endif aeq(_mm_sub_ps(a, b), 7.f, 4.f, 1.f, -2.f); // 4-wide sub. aeq(_mm_sub_ss(a, b), 8.f, 6.f, 4.f, -2.f); // Sub lowest element, preserve three highest unchanged from a. // SSE1 Elementary Math functions: #ifndef __EMSCRIPTEN__ // TODO: Enable support for this to pass. aeq(_mm_rcp_ps(a), 0.124969f, 0.166626f, 0.249939f, 0.499878f); // Compute 4-wide 1/x. aeq(_mm_rcp_ss(a), 8.f, 6.f, 4.f, 0.499878f); // Compute 1/x of lowest element, pass higher elements unchanged. aeq(_mm_rsqrt_ps(a), 0.353455f, 0.408203f, 0.499878f, 0.706909f); // Compute 4-wide 1/sqrt(x). aeq(_mm_rsqrt_ss(a), 8.f, 6.f, 4.f, 0.706909f); // Compute 1/sqrt(x) of lowest element, pass higher elements unchanged. #endif aeq(_mm_sqrt_ps(a), 2.82843f, 2.44949f, 2.f, 1.41421f); // Compute 4-wide sqrt(x). aeq(_mm_sqrt_ss(a), 8.f, 6.f, 4.f, 1.41421f); // Compute sqrt(x) of lowest element, pass higher elements unchanged. __m128 i1 = get_i1(); __m128 i2 = get_i2(); // SSE1 Logical instructions: #ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these. 
aeqi(_mm_and_ps(i1, i2), 0x83200100, 0x0fecc988, 0x80244021, 0x13458a88); // 4-wide binary AND aeqi(_mm_andnot_ps(i1, i2), 0x388a9888, 0xf0021444, 0x7000289c, 0x00121046); // 4-wide binary (!i1) & i2 aeqi(_mm_or_ps(i1, i2), 0xbfefdba9, 0xffefdfed, 0xf7656bbd, 0xffffdbef); // 4-wide binary OR aeqi(_mm_xor_ps(i1, i2), 0x3ccfdaa9, 0xf0031665, 0x77412b9c, 0xecba5167); // 4-wide binary XOR #endif // SSE1 Compare instructions: // a = [8, 6, 4, 2], b = [1, 2, 3, 4] aeqi(_mm_cmpeq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp == aeqi(_mm_cmpeq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp ==, pass three highest unchanged. aeqi(_mm_cmpge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp >= aeqi(_mm_cmpge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp >=, pass three highest unchanged. aeqi(_mm_cmpgt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp > aeqi(_mm_cmpgt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp >, pass three highest unchanged. aeqi(_mm_cmple_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp <= aeqi(_mm_cmple_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <=, pass three highest unchanged. aeqi(_mm_cmplt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp < aeqi(_mm_cmplt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <, pass three highest unchanged. aeqi(_mm_cmpneq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp != aeqi(_mm_cmpneq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp !=, pass three highest unchanged. 
aeqi(_mm_cmpnge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not >= aeqi(_mm_cmpnge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp not >=, pass three highest unchanged. aeqi(_mm_cmpngt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not > aeqi(_mm_cmpngt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not >, pass three highest unchanged. aeqi(_mm_cmpnle_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp not <= aeqi(_mm_cmpnle_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <=, pass three highest unchanged. aeqi(_mm_cmpnlt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp not < aeqi(_mm_cmpnlt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <, pass three highest unchanged. __m128 nan1 = get_nan1(); // [NAN, 0, 0, NAN] __m128 nan2 = get_nan2(); // [NAN, NAN, 0, 0] aeqi(_mm_cmpord_ps(nan1, nan2), 0, 0, 0xFFFFFFFF, 0); // 4-wide test if both operands are not nan. aeqi(_mm_cmpord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0); // scalar test if both operands are not nan, pass three highest unchanged. // Intel Intrinsics Guide documentation is wrong on _mm_cmpunord_ps and _mm_cmpunord_ss. MSDN is right: http://msdn.microsoft.com/en-us/library/khy6fk1t(v=vs.90).aspx aeqi(_mm_cmpunord_ps(nan1, nan2), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide test if one of the operands is nan. #ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these. aeqi(_mm_cmpunord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0xFFFFFFFF); // scalar test if one of the operands is nan, pass three highest unchanged. 
#endif Assert(_mm_comieq_ss(a, b) == 0); Assert(_mm_comieq_ss(a, a) == 1); // Scalar cmp == of lowest element, return int. Assert(_mm_comige_ss(a, b) == 0); Assert(_mm_comige_ss(a, a) == 1); // Scalar cmp >= of lowest element, return int. Assert(_mm_comigt_ss(b, a) == 1); Assert(_mm_comigt_ss(a, a) == 0); // Scalar cmp > of lowest element, return int. Assert(_mm_comile_ss(b, a) == 0); Assert(_mm_comile_ss(a, a) == 1); // Scalar cmp <= of lowest element, return int. Assert(_mm_comilt_ss(a, b) == 1); Assert(_mm_comilt_ss(a, a) == 0); // Scalar cmp < of lowest element, return int. Assert(_mm_comineq_ss(a, b) == 1); Assert(_mm_comineq_ss(a, a) == 0); // Scalar cmp != of lowest element, return int. // The ucomi versions are identical to comi, except that ucomi signal a FP exception only if one of the input operands is a SNaN, whereas the comi versions signal a FP // exception when one of the input operands is either a QNaN or a SNaN. #ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly. Assert(_mm_ucomieq_ss(a, b) == 0); Assert(_mm_ucomieq_ss(a, a) == 1); Assert(_mm_ucomieq_ss(a, nan1) == 1); #endif Assert(_mm_ucomige_ss(a, b) == 0); Assert(_mm_ucomige_ss(a, a) == 1); Assert(_mm_ucomige_ss(a, nan1) == 0); Assert(_mm_ucomigt_ss(b, a) == 1); Assert(_mm_ucomigt_ss(a, a) == 0); Assert(_mm_ucomigt_ss(a, nan1) == 0); Assert(_mm_ucomile_ss(b, a) == 0); Assert(_mm_ucomile_ss(a, a) == 1); Assert(_mm_ucomile_ss(a, nan1) == 1); Assert(_mm_ucomilt_ss(a, b) == 1); Assert(_mm_ucomilt_ss(a, a) == 0); Assert(_mm_ucomilt_ss(a, nan1) == 1); #ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly. 
Assert(_mm_ucomineq_ss(a, b) == 1); Assert(_mm_ucomineq_ss(a, a) == 0); Assert(_mm_ucomineq_ss(a, nan1) == 0); #endif // SSE1 Convert instructions: __m128 c = get_c(); // [1.5, 2.5, 3.5, 4.5] __m128 e = get_e(); // [INF, -INF, 2.5, 3.5] __m128 f = get_f(); // [-1.5, 1.5, -2.5, -9223372036854775808] #ifdef TEST_M64 /*M64*/aeq(_mm_cvt_pi2ps(a, m2), 8.f, 6.f, -19088744.f, 1985229312.f); // 2-way int32 to float conversion to two lowest channels of m128. /*M64*/aeq64(_mm_cvt_ps2pi(c), 0x400000004ULL); // 2-way two lowest floats from m128 to integer, return as m64. #endif aeq(_mm_cvtsi32_ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // Convert int to float, store in lowest channel of m128. aeq( _mm_cvt_si2ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // _mm_cvt_si2ss is an alias to _mm_cvtsi32_ss. #ifndef __EMSCRIPTEN__ // TODO: Fix banker's rounding in cvt functions. Assert(_mm_cvtss_si32(c) == 4); Assert(_mm_cvtss_si32(e) == 4); // Convert lowest channel of m128 from float to int. Assert( _mm_cvt_ss2si(c) == 4); Assert( _mm_cvt_ss2si(e) == 4); // _mm_cvt_ss2si is an alias to _mm_cvtss_si32. #endif #ifdef TEST_M64 /*M64*/aeq(_mm_cvtpi16_ps(m1), 255.f , -32767.f, 4336.f, 14207.f); // 4-way convert int16s to floats, return in a m128. /*M64*/aeq(_mm_cvtpi32_ps(a, m1), 8.f, 6.f, 16744449.f, 284178304.f); // 2-way convert int32s to floats, return in two lowest channels of m128, pass two highest unchanged. /*M64*/aeq(_mm_cvtpi32x2_ps(m1, m2), -19088744.f, 1985229312.f, 16744449.f, 284178304.f); // 4-way convert int32s from two different m64s to float. /*M64*/aeq(_mm_cvtpi8_ps(m1), 16.f, -16.f, 55.f, 127.f); // 4-way convert int8s from lowest end of m64 to float in a m128. /*M64*/aeq64(_mm_cvtps_pi16(c), 0x0002000200040004ULL); // 4-way convert floats to int16s in a m64. /*M64*/aeq64(_mm_cvtps_pi32(c), 0x0000000400000004ULL); // 2-way convert two lowest floats to int32s in a m64. 
/*M64*/aeq64(_mm_cvtps_pi8(c), 0x0000000002020404ULL); // 4-way convert floats to int8s in a m64, zero higher half of the returned m64. /*M64*/aeq(_mm_cvtpu16_ps(m1), 255.f , 32769.f, 4336.f, 14207.f); // 4-way convert uint16s to floats, return in a m128. /*M64*/aeq(_mm_cvtpu8_ps(m1), 16.f, 240.f, 55.f, 127.f); // 4-way convert uint8s from lowest end of m64 to float in a m128. #endif aeq(_mm_cvtsi64_ss(c, -9223372036854775808ULL), 1.5f, 2.5f, 3.5f, -9223372036854775808.f); // Convert single int64 to float, store in lowest channel of m128, and pass three higher channel unchanged. Assert(_mm_cvtss_f32(c) == 4.5f); // Extract lowest channel of m128 to a plain old float. Assert(_mm_cvtss_si64(f) == -9223372036854775808ULL); // Convert lowest channel of m128 from float to int64. #ifdef TEST_M64 /*M64*/aeq64(_mm_cvtt_ps2pi(e), 0x0000000200000003ULL); aeq64(_mm_cvtt_ps2pi(f), 0xfffffffe80000000ULL); // Truncating conversion from two lowest floats of m128 to int32s, return in a m64. #endif Assert(_mm_cvttss_si32(e) == 3); // Truncating conversion from the lowest float of a m128 to int32. Assert( _mm_cvtt_ss2si(e) == 3); // _mm_cvtt_ss2si is an alias to _mm_cvttss_si32. #ifdef TEST_M64 /*M64*/aeq64(_mm_cvttps_pi32(c), 0x0000000300000004ULL); // Truncating conversion from two lowest floats of m128 to m64. #endif Assert(_mm_cvttss_si64(f) == -9223372036854775808ULL); // Truncating conversion from lowest channel of m128 from float to int64. #ifndef __EMSCRIPTEN__ // TODO: Not implemented. 
// SSE1 General support: unsigned int mask = _MM_GET_EXCEPTION_MASK(); _MM_SET_EXCEPTION_MASK(mask); unsigned int flushZeroMode = _MM_GET_FLUSH_ZERO_MODE(); _MM_SET_FLUSH_ZERO_MODE(flushZeroMode); unsigned int roundingMode = _MM_GET_ROUNDING_MODE(); _MM_SET_ROUNDING_MODE(roundingMode); unsigned int csr = _mm_getcsr(); _mm_setcsr(csr); unsigned char dummyData[4096]; _mm_prefetch(dummyData, _MM_HINT_T0); _mm_prefetch(dummyData, _MM_HINT_T1); _mm_prefetch(dummyData, _MM_HINT_T2); _mm_prefetch(dummyData, _MM_HINT_NTA); _mm_sfence(); #endif // SSE1 Misc instructions: #ifdef TEST_M64 /*M64*/Assert(_mm_movemask_pi8(m1) == 100); // Return int with eight lowest bits set depending on the highest bits of the 8 uint8 input channels of the m64. /*M64*/Assert( _m_pmovmskb(m1) == 100); // _m_pmovmskb is an alias to _mm_movemask_pi8. #endif Assert(_mm_movemask_ps(_mm_set_ps(-1.f, 0.f, 1.f, NAN)) == 8); Assert(_mm_movemask_ps(_mm_set_ps(-INFINITY, -0.f, INFINITY, -INFINITY)) == 13); // Return int with four lowest bits set depending on the highest bits of the 4 m128 input channels. // SSE1 Probability/Statistics instructions: #ifdef TEST_M64 /*M64*/aeq64(_mm_avg_pu16(m1, m2), 0x7FEE9D4D43A234C8ULL); // 4-way average uint16s. /*M64*/aeq64( _m_pavgw(m1, m2), 0x7FEE9D4D43A234C8ULL); // _m_pavgw is an alias to _mm_avg_pu16. /*M64*/aeq64(_mm_avg_pu8(m1, m2), 0x7FEE9D4D43A23548ULL); // 8-way average uint8s. /*M64*/aeq64( _m_pavgb(m1, m2), 0x7FEE9D4D43A23548ULL); // _m_pavgb is an alias to _mm_avg_pu8. // SSE1 Special Math instructions: /*M64*/aeq64(_mm_max_pi16(m1, m2), 0xFFBA987654377FULL); // 4-way average uint16s. /*M64*/aeq64( _m_pmaxsw(m1, m2), 0xFFBA987654377FULL); // _m_pmaxsw is an alias to _mm_max_pi16. /*M64*/aeq64(_mm_max_pu8(m1, m2), 0xFEFFBA9876F0377FULL); // 4-way average uint16s. /*M64*/aeq64( _m_pmaxub(m1, m2), 0xFEFFBA9876F0377FULL); // _m_pmaxub is an alias to _mm_max_pu8. /*M64*/aeq64(_mm_min_pi16(m1, m2), 0xFEDC800110F03210ULL); // 4-way average uint16s. 
/*M64*/aeq64( _m_pminsw(m1, m2), 0xFEDC800110F03210ULL); // is an alias to _mm_min_pi16. /*M64*/aeq64(_mm_min_pu8(m1, m2), 0xDC800110543210ULL); // 4-way average uint16s. /*M64*/aeq64( _m_pminub(m1, m2), 0xDC800110543210ULL); // is an alias to _mm_min_pu8. #endif // a = [8, 6, 4, 2], b = [1, 2, 3, 4] aeq(_mm_max_ps(a, b), 8.f, 6.f, 4.f, 4.f); // 4-wide max. aeq(_mm_max_ss(a, _mm_set1_ps(100.f)), 8.f, 6.f, 4.f, 100.f); // Scalar max, pass three highest unchanged. aeq(_mm_min_ps(a, b), 1.f, 2.f, 3.f, 2.f); // 4-wide min. aeq(_mm_min_ss(a, _mm_set1_ps(-100.f)), 8.f, 6.f, 4.f, -100.f); // Scalar min, pass three highest unchanged. // SSE1 Swizzle instructions: #ifdef TEST_M64 /*M64*/Assert(_mm_extract_pi16(m1, 1) == 4336); // Extract the given int16 channel from a m64. /*M64*/Assert( _m_pextrw(m1, 1) == 4336); // _m_pextrw is an alias to _mm_extract_pi16. /*M64*/aeq64(_mm_insert_pi16(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // Insert a int16 to a specific channel of a m64. /*M64*/aeq64( _m_pinsrw(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // _m_pinsrw is an alias to _mm_insert_pi16. /*M64*/aeq64(_mm_shuffle_pi16(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // Shuffle int16s around in the 4 channels of the m64. /*M64*/aeq64( _m_pshufw(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // _m_pshufw is an alias to _mm_shuffle_pi16. #endif aeq(_mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 3, 2)), 3.f, 4.f, 8.f, 6.f); aeq(_mm_unpackhi_ps(a, b), 1.f , 8.f, 2.f, 6.f); aeq(_mm_unpacklo_ps(a, b), 3.f , 4.f, 4.f, 2.f); // Transposing a matrix via the xmmintrin.h-provided intrinsic. __m128 c0 = a; // [8, 6, 4, 2] __m128 c1 = b; // [1, 2, 3, 4] __m128 c2 = get_c(); // [1.5, 2.5, 3.5, 4.5] __m128 c3 = get_d(); // [8.5, 6.5, 4.5, 2.5] _MM_TRANSPOSE4_PS(c0, c1, c2, c3); aeq(c0, 2.5f, 4.5f, 4.f, 2.f); aeq(c1, 4.5f, 3.5f, 3.f, 4.f); aeq(c2, 6.5f, 2.5f, 2.f, 6.f); aeq(c3, 8.5f, 1.5f, 1.f, 8.f); // All done! 
if (numFailures == 0) printf("Success!\n"); else printf("%d tests failed!\n", numFailures); }
int main(int argc, char *argv[]) { int c, dret, lineno = 0, n_rows = 0, m_rows = 0, n_cols = 0, max_hap = 0; int64_t n_missing = 0, n_tot = 0; gzFile fp; kstream_t *ks; kstring_t str = {0,0,0}; int8_t **C = 0; double **M, *X, min_maf = 0.0; char **names = 0; // _MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() & ~(_MM_MASK_INVALID | _MM_MASK_OVERFLOW | _MM_MASK_UNDERFLOW | _MM_MASK_DIV_ZERO)); _MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() & ~(_MM_MASK_INVALID | _MM_MASK_DIV_ZERO)); while ((c = getopt(argc, argv, "m:")) >= 0) { if (c == 'm') min_maf = atof(optarg); } if (argc - optind == 0) { fprintf(stderr, "Usage: naivepca [-m min_maf] <in.txt>\n"); return 1; } fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); if (fp == 0) { fprintf(stderr, "[E::%s] failed to open file '%s'. Abort.\n", __func__, argv[optind]); return 2; } ks = ks_init(fp); // read the matrix into C while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) { int8_t *q; char *p, *name = str.s; int i; ++lineno; for (p = str.s; *p && *p != '\t' && *p != ' '; ++p); if (*p) { *p++ = 0; for (; *p && (*p == '\t' || *p == ' '); ++p); } if (*p == 0) { fprintf(stderr, "[W::%s] line %d has one field; skipped.\n", __func__, lineno); continue; } if (n_cols != 0) { if (n_cols != str.s + str.l - p) { fprintf(stderr, "[W::%s] line %d has a different number of columns; skipped.\n", __func__, lineno); continue; } } else n_cols = str.s + str.l - p; if (n_rows == m_rows) { m_rows = m_rows? m_rows<<1 : 16; C = (int8_t**)realloc(C, m_rows * sizeof(int8_t*)); names = (char**)realloc(names, m_rows * sizeof(char*)); } names[n_rows] = strdup(name); q = C[n_rows++] = (int8_t*)calloc(n_cols, sizeof(double)); for (i = 0; i < n_cols; ++i) { if (p[i] >= '0' && p[i] <= '9') q[i] = p[i] - '0'; else q[i] = -1, ++n_missing; max_hap = max_hap > q[i]? 
max_hap : q[i]; } n_tot += n_cols; } free(str.s); fprintf(stderr, "[M::%s] read %d samples and %d sites; ploidy is %d\n", __func__, n_rows, n_cols, max_hap); fprintf(stderr, "[M::%s] %.3f%% of genotypes are missing\n", __func__, (double)n_missing / n_tot); { // normalize the matrix into M int i, j, *sum, *cnt, n_dropped = 0; double *mu, *pp; sum = (int*)calloc(n_cols, sizeof(int)); cnt = (int*)calloc(n_cols, sizeof(int)); mu = (double*)calloc(n_cols, sizeof(double)); pp = (double*)calloc(n_cols, sizeof(double)); for (i = 0; i < n_rows; ++i) { int8_t *q = C[i]; for (j = 0; j < n_cols; ++j) if (q[j] >= 0) sum[j] += q[j], ++cnt[j]; } for (j = 0; j < n_cols; ++j) { if (cnt[j] > 0) { mu[j] = (double)sum[j] / cnt[j]; pp[j] = mu[j] / max_hap; if (pp[j] < min_maf || 1. - pp[j] < min_maf) ++n_dropped; } else ++n_dropped; } fprintf(stderr, "[M::%s] %d rare sites are dropped\n", __func__, n_dropped); M = (double**)calloc(n_rows, sizeof(double*)); for (i = 0; i < n_rows; ++i) { int8_t *q = C[i]; double *r; r = M[i] = (double*)calloc(n_cols, sizeof(double)); for (j = 0; j < n_cols; ++j) r[j] = q[j] < 0 || pp[j] < min_maf || 1. - pp[j] < min_maf || pp[j] == 0. || 1 - pp[j] == 0. ? 0. : (q[j] - mu[j]) / sqrt(pp[j] * (1. 
- pp[j])); } free(sum); free(cnt); free(mu); free(pp); for (i = 0; i < n_rows; ++i) free(C[i]); free(C); } { // multiplication int i, j, k; X = (double*)calloc(n_rows * n_rows, sizeof(double)); for (i = 0; i < n_rows; ++i) { double *zi = M[i]; for (j = 0; j <= i; ++j) { double t = 0., *zj = M[j]; for (k = 0; k < n_cols; ++k) t += zi[k] * zj[k]; X[i*n_rows + j] = X[j*n_rows + i] = t / n_cols; } } for (i = 0; i < n_rows; ++i) free(M[i]); free(M); } { // print eigan vectors double *ev; int i, j; evsrt_t *evsrt; ev = (double*)calloc(n_rows, sizeof(double)); evsrt = (evsrt_t*)calloc(n_rows, sizeof(evsrt_t)); n_eigen_symm(X, n_rows, ev); for (i = 0; i < n_rows; ++i) evsrt[i].ev = ev[i], evsrt[i].i = i; ks_introsort(ev, n_rows, evsrt); for (i = 0; i < n_rows; ++i) { printf("%s", names[i]); for (j = 0; j < n_rows; ++j) printf("\t%.6f", X[i*n_rows + evsrt[j].i] * evsrt[j].ev); putchar('\n'); free(names[i]); } free(ev); free(evsrt); free(X); free(names); } ks_destroy(ks); gzclose(fp); return 0; }
void ProcessImage(std::vector<cv::Mat>& images) { #ifdef CHECK_NANS _MM_SET_EXCEPTION_MASK( _MM_GET_EXCEPTION_MASK() & ~(_MM_MASK_INVALID | _MM_MASK_OVERFLOW | _MM_MASK_DIV_ZERO)); #endif frame_count++; // if (poses.size() > 100) { // exit(EXIT_SUCCESS); // } Sophus::SE3d guess; // If this is a keyframe, set it as one on the tracker. prev_delta_t_ba = tracker.t_ba() * prev_t_ba.inverse(); if (is_prev_keyframe) { prev_t_ba = Sophus::SE3d(); } else { prev_t_ba = tracker.t_ba(); } // Add a pose to the poses array if (is_prev_keyframe) { std::shared_ptr<sdtrack::TrackerPose> new_pose(new sdtrack::TrackerPose); if (poses.size() > 0) { new_pose->t_wp = poses.back()->t_wp * last_t_ba.inverse(); } poses.push_back(new_pose); axes.push_back( std::unique_ptr<SceneGraph::GLAxis>(new SceneGraph::GLAxis(0.05))); gui_vars.scene_graph.AddChild(axes.back().get()); } guess = prev_delta_t_ba * prev_t_ba; if (guess.translation() == Eigen::Vector3d(0, 0, 0) && poses.size() > 1) { guess.translation() = Eigen::Vector3d(0, 0, 0.01); } tracker.AddImage(images, guess); tracker.EvaluateTrackResiduals(0, tracker.GetImagePyramid(), tracker.GetCurrentTracks()); if (!is_manual_mode) { tracker.OptimizeTracks(-1, optimize_landmarks, optimize_pose); tracker.PruneTracks(); } // Update the pose t_ab based on the result from the tracker. UpdateCurrentPose(); if (do_keyframing) { const double track_ratio = (double)tracker.num_successful_tracks() / (double)keyframe_tracks; const double total_trans = tracker.t_ba().translation().norm(); const double total_rot = tracker.t_ba().so3().log().norm(); bool keyframe_condition = track_ratio < 0.8 || total_trans > 0.2 || total_rot > 0.1; std::cerr << "\tRatio: " << track_ratio << " trans: " << total_trans << " rot: " << total_rot << std::endl; if (keyframe_tracks != 0) { if (keyframe_condition) { is_keyframe = true; } else { is_keyframe = false; } } // If this is a keyframe, set it as one on the tracker. 
prev_delta_t_ba = tracker.t_ba() * prev_t_ba.inverse(); if (is_keyframe) { tracker.AddKeyframe(); } is_prev_keyframe = is_keyframe; } else { tracker.AddKeyframe(); } std::cerr << "Num successful : " << tracker.num_successful_tracks() << " keyframe tracks: " << keyframe_tracks << std::endl; if (!is_manual_mode) { BaAndStartNewLandmarks(); } if (is_keyframe) { std::cerr << "KEYFRAME." << std::endl; keyframe_tracks = tracker.GetCurrentTracks().size(); std::cerr << "New keyframe tracks: " << keyframe_tracks << std::endl; } else { std::cerr << "NOT KEYFRAME." << std::endl; } current_tracks = &tracker.GetCurrentTracks(); #ifdef CHECK_NANS _MM_SET_EXCEPTION_MASK( _MM_GET_EXCEPTION_MASK() | (_MM_MASK_INVALID | _MM_MASK_OVERFLOW | _MM_MASK_DIV_ZERO)); #endif std::cerr << "FRAME : " << frame_count << " KEYFRAME: " << poses.size() << std::endl; }
/*
 * Mask the SSE invalid-operation, divide-by-zero, overflow and underflow
 * exceptions in the current exception mask, leaving every other mask bit as
 * it was.
 */
void disable_fpexcept(void)
{
    const unsigned int current_mask = _MM_GET_EXCEPTION_MASK();
    const unsigned int to_mask = _MM_MASK_UNDERFLOW | _MM_MASK_OVERFLOW |
                                 _MM_MASK_DIV_ZERO | _MM_MASK_INVALID;

    _MM_SET_EXCEPTION_MASK(current_mask | to_mask);
}
int main(int argc, char *argv[]) { #ifdef ENABLE_INTEL_FLOATING_POINT_EXCEPTIONS cout << "NOTE: enabling floating point exceptions for divide by zero.\n"; _MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() & ~_MM_MASK_INVALID); #endif Teuchos::GlobalMPISession mpiSession(&argc, &argv); int rank = Teuchos::GlobalMPISession::getRank(); Teuchos::CommandLineProcessor cmdp(false,true); // false: don't throw exceptions; true: do return errors for unrecognized options bool useCondensedSolve = false; // condensed solve not yet compatible with minimum rule meshes int numGridPoints = 32; // in x,y -- idea is to keep the overall order of approximation constant int k = 4; // poly order for u double theta = 0.5; int numTimeSteps = 2000; int numCells = -1; // in x, y (-1 so we can set a default if unset from the command line.) int numFrames = 50; int delta_k = 2; // test space enrichment: should be 2 for 2D bool useMumpsIfAvailable = true; bool convertSolutionsToVTK = false; // when true assumes we've already run with precisely the same options, except without VTK support (so we have a bunch of .soln files) bool usePeriodicBCs = false; bool useConstantConvection = false; cmdp.setOption("polyOrder",&k,"polynomial order for field variable u"); cmdp.setOption("delta_k", &delta_k, "test space polynomial order enrichment"); cmdp.setOption("numCells",&numCells,"number of cells in x and y directions"); cmdp.setOption("theta",&theta,"theta weight for time-stepping"); cmdp.setOption("numTimeSteps",&numTimeSteps,"number of time steps"); cmdp.setOption("numFrames",&numFrames,"number of frames for export"); cmdp.setOption("usePeriodicBCs", "useDirichletBCs", &usePeriodicBCs); cmdp.setOption("useConstantConvection", "useVariableConvection", &useConstantConvection); cmdp.setOption("useCondensedSolve", "useUncondensedSolve", &useCondensedSolve, "use static condensation to reduce the size of the global solve"); cmdp.setOption("useMumps", "useKLU", &useMumpsIfAvailable, "use MUMPS (if available)"); 
cmdp.setOption("convertPreComputedSolutionsToVTK", "computeSolutions", &convertSolutionsToVTK); if (cmdp.parse(argc,argv) != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL) { #ifdef HAVE_MPI MPI_Finalize(); #endif return -1; } bool saveSolutionFiles = true; if (numCells==-1) numCells = numGridPoints / k; if (rank==0) { cout << "solving on " << numCells << " x " << numCells << " mesh " << "of order " << k << ".\n"; } set<int> timeStepsToExport; timeStepsToExport.insert(numTimeSteps); int timeStepsPerFrame = numTimeSteps / (numFrames - 1); if (timeStepsPerFrame==0) timeStepsPerFrame = 1; for (int n=0; n<numTimeSteps; n += timeStepsPerFrame) { timeStepsToExport.insert(n); } int H1Order = k + 1; const static double PI = 3.141592653589793238462; double dt = 2 * PI / numTimeSteps; VarFactory varFactory; // traces: VarPtr qHat = varFactory.fluxVar("\\widehat{q}"); // fields: VarPtr u = varFactory.fieldVar("u", L2); // test functions: VarPtr v = varFactory.testVar("v", HGRAD); FunctionPtr x = Function::xn(1); FunctionPtr y = Function::yn(1); FunctionPtr c; if (useConstantConvection) { c = Function::vectorize(Function::constant(0.5), Function::constant(0.5)); } else { c = Function::vectorize(y-0.5, 0.5-x); } // FunctionPtr c = Function::vectorize(y, x); FunctionPtr n = Function::normal(); BFPtr bf = Teuchos::rcp( new BF(varFactory) ); bf->addTerm(u / dt, v); bf->addTerm(- theta * u, c * v->grad()); // bf->addTerm(theta * u_hat, (c * n) * v); bf->addTerm(qHat, v); double width = 2.0, height = 2.0; int horizontalCells = numCells, verticalCells = numCells; double x0 = -0.5; double y0 = -0.5; if (usePeriodicBCs) { x0 = 0.0; y0 = 0.0; width = 1.0; height = 1.0; } BCPtr bc = BC::bc(); SpatialFilterPtr inflowFilter = Teuchos::rcp( new InflowFilterForClockwisePlanarRotation (x0,x0+width,y0,y0+height,0.5,0.5)); vector< PeriodicBCPtr > periodicBCs; if (! 
usePeriodicBCs) { // bc->addDirichlet(u_hat, SpatialFilter::allSpace(), Function::zero()); bc->addDirichlet(qHat, inflowFilter, Function::zero()); // zero BCs enforced at the inflow boundary. } else { periodicBCs.push_back(PeriodicBC::xIdentification(x0, x0+width)); periodicBCs.push_back(PeriodicBC::yIdentification(y0, y0+height)); } MeshPtr mesh = MeshFactory::quadMeshMinRule(bf, H1Order, delta_k, width, height, horizontalCells, verticalCells, false, x0, y0, periodicBCs); FunctionPtr u0 = Teuchos::rcp( new Cone_U0(0.0, 0.25, 0.1, 1.0, usePeriodicBCs) ); RHSPtr initialRHS = RHS::rhs(); initialRHS->addTerm(u0 / dt * v); initialRHS->addTerm((1-theta) * u0 * c * v->grad()); IPPtr ip; // ip = Teuchos::rcp( new IP ); // ip->addTerm(v); // ip->addTerm(c * v->grad()); ip = bf->graphNorm(); // create two Solution objects; we'll switch between these for time steps SolutionPtr soln0 = Solution::solution(mesh, bc, initialRHS, ip); soln0->setCubatureEnrichmentDegree(5); FunctionPtr u_soln0 = Function::solution(u, soln0); FunctionPtr qHat_soln0 = Function::solution(qHat, soln0); RHSPtr rhs1 = RHS::rhs(); rhs1->addTerm(u_soln0 / dt * v); rhs1->addTerm((1-theta) * u_soln0 * c * v->grad()); SolutionPtr soln1 = Solution::solution(mesh, bc, rhs1, ip); soln1->setCubatureEnrichmentDegree(5); FunctionPtr u_soln1 = Function::solution(u, soln1); FunctionPtr qHat_soln1 = Function::solution(qHat, soln1); RHSPtr rhs2 = RHS::rhs(); // after the first solve on soln0, we'll swap out initialRHS for rhs2 rhs2->addTerm(u_soln1 / dt * v); rhs2->addTerm((1-theta) * u_soln1 * c * v->grad()); Teuchos::RCP<Solver> solver = Teuchos::rcp( new KluSolver ); #ifdef HAVE_AMESOS_MUMPS if (useMumpsIfAvailable) solver = Teuchos::rcp( new MumpsSolver ); #endif // double energyErrorSum = 0; ostringstream filePrefix; filePrefix << "convectingCone_k" << k << "_t"; int frameNumber = 0; #ifdef USE_HDF5 ostringstream dir_name; dir_name << "convectingCone_k" << k; HDF5Exporter exporter(mesh,dir_name.str()); #endif 
#ifdef USE_VTK VTKExporter soln0Exporter(soln0,mesh,varFactory); VTKExporter soln1Exporter(soln1,mesh,varFactory); #endif if (convertSolutionsToVTK) { #ifdef USE_VTK if (rank==0) { cout << "Converting .soln files to VTK.\n"; for (int frameNumber=0; frameNumber<=numFrames; frameNumber++) { ostringstream filename; filename << filePrefix.str() << frameNumber << ".soln"; soln0->readFromFile(filename.str()); filename.str(""); filename << filePrefix.str() << frameNumber; soln0Exporter.exportFields(filename.str()); } } #else if (rank==0) cout << "Driver was built without USE_VTK defined. This must be defined to convert solution files to VTK files.\n"; #endif exit(0); } if (timeStepsToExport.find(0) != timeStepsToExport.end()) { map<int,FunctionPtr> solnMap; solnMap[u->ID()] = u0; // project field variables if (rank==0) cout << "About to project initial solution onto mesh.\n"; soln0->projectOntoMesh(solnMap); if (rank==0) cout << "...projected initial solution onto mesh.\n"; ostringstream filename; filename << filePrefix.str() << frameNumber++; if (rank==0) cout << "About to export initial solution.\n"; #ifdef USE_VTK if (rank==0) soln0Exporter.exportFields(filename.str()); #endif #ifdef USE_HDF5 exporter.exportSolution(soln0, varFactory,0); #endif if (saveSolutionFiles) { if (rank==0) { filename << ".soln"; soln0->writeToFile(filename.str()); cout << endl << "wrote " << filename.str() << endl; } } if (rank==0) cout << "...exported initial solution.\n"; } if (rank==0) cout << "About to solve initial time step.\n"; // first time step: soln0->setReportTimingResults(true); // added to gain insight into why MPI blocks in some cases on the server... 
if (useCondensedSolve) soln0->condensedSolve(solver); else soln0->solve(solver); soln0->setReportTimingResults(false); // energyErrorSum += soln0->energyErrorTotal(); soln0->setRHS(rhs2); if (rank==0) cout << "Solved initial time step.\n"; if (timeStepsToExport.find(1) != timeStepsToExport.end()) { ostringstream filename; filename << filePrefix.str() << frameNumber++; #ifdef USE_VTK if (rank==0) soln0Exporter.exportFields(filename.str()); #endif #ifdef USE_HDF5 exporter.exportSolution(soln0, varFactory); #endif if (saveSolutionFiles) { if (rank==0) { filename << ".soln"; soln0->writeToFile(filename.str()); cout << endl << "wrote " << filename.str() << endl; } } } bool reportTimings = false; for (int n=1; n<numTimeSteps; n++) { bool odd = (n%2)==1; SolutionPtr soln_n = odd ? soln1 : soln0; if (useCondensedSolve) soln_n->solve(solver); else soln_n->solve(solver); if (reportTimings) { if (rank==0) cout << "time step " << n << ", timing report:\n"; soln_n->reportTimings(); } if (rank==0) { cout << "\x1B[2K"; // Erase the entire current line. cout << "\x1B[0E"; // Move to the beginning of the current line. cout << "Solved time step: " << n; flush(cout); } if (timeStepsToExport.find(n+1)!=timeStepsToExport.end()) { ostringstream filename; filename << filePrefix.str() << frameNumber++; #ifdef USE_VTK if (rank==0) { if (odd) { soln1Exporter.exportFields(filename.str()); } else { soln0Exporter.exportFields(filename.str()); } } #endif #ifdef USE_HDF5 double t = n * dt; if (odd) { exporter.exportSolution(soln1, varFactory, t); } else { exporter.exportSolution(soln0, varFactory, t); } #endif if (saveSolutionFiles) { if (rank==0) { filename << ".soln"; if (odd) { soln1->writeToFile(filename.str()); } else { soln0->writeToFile(filename.str()); } cout << endl << "wrote " << filename.str() << endl; } } } // energyErrorSum += soln_n->energyErrorTotal(); } // if (rank==0) cout << "energy error, sum over all time steps: " << energyErrorSum << endl; return 0; }
int main(int argc, char *argv[]) { #ifdef ENABLE_INTEL_FLOATING_POINT_EXCEPTIONS cout << "NOTE: enabling floating point exceptions for divide by zero.\n"; _MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() & ~_MM_MASK_INVALID); #endif Teuchos::GlobalMPISession mpiSession(&argc, &argv); int rank = Teuchos::GlobalMPISession::getRank(); Teuchos::CommandLineProcessor cmdp(false,true); // false: don't throw exceptions; true: do return errors for unrecognized options const static double PI = 3.141592653589793238462; bool useCondensedSolve = true; // condensed solve not yet compatible with minimum rule meshes int k = 2; // poly order for u in every direction, including temporal int numCells = 32; // in x, y int numTimeCells = 1; int numTimeSlabs = -1; int numFrames = 201; int delta_k = 3; // test space enrichment: should be 3 for 3D int maxRefinements = 0; // maximum # of refinements on each time slab bool useMumpsIfAvailable = true; bool useConstantConvection = false; double refinementTolerance = 0.1; int checkPointFrequency = 50; // output solution and mesh every 50 time slabs int previousSolutionTimeSlabNumber = -1; string previousSolutionFile = ""; string previousMeshFile = ""; cmdp.setOption("polyOrder",&k,"polynomial order for field variable u"); cmdp.setOption("delta_k", &delta_k, "test space polynomial order enrichment"); cmdp.setOption("numCells",&numCells,"number of cells in x and y directions"); cmdp.setOption("numTimeCells",&numTimeCells,"number of time axis cells"); cmdp.setOption("numTimeSlabs",&numTimeSlabs,"number of time slabs"); cmdp.setOption("numFrames",&numFrames,"number of frames for export"); cmdp.setOption("useConstantConvection", "useVariableConvection", &useConstantConvection); cmdp.setOption("useCondensedSolve", "useUncondensedSolve", &useCondensedSolve, "use static condensation to reduce the size of the global solve"); cmdp.setOption("useMumps", "useKLU", &useMumpsIfAvailable, "use MUMPS (if available)"); cmdp.setOption("refinementTolerance", 
&refinementTolerance, "relative error beyond which to stop refining"); cmdp.setOption("maxRefinements", &maxRefinements, "maximum # of refinements on each time slab"); cmdp.setOption("previousSlabNumber", &previousSolutionTimeSlabNumber, "time slab number of previous solution"); cmdp.setOption("previousSolution", &previousSolutionFile, "file with previous solution"); cmdp.setOption("previousMesh", &previousMeshFile, "file with previous mesh"); if (cmdp.parse(argc,argv) != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL) { #ifdef HAVE_MPI MPI_Finalize(); #endif return -1; } int H1Order = k + 1; VarFactory varFactory; // traces: VarPtr qHat = varFactory.fluxVar("\\widehat{q}"); // fields: VarPtr u = varFactory.fieldVar("u", L2); // test functions: VarPtr v = varFactory.testVar("v", HGRAD); FunctionPtr x = Function::xn(1); FunctionPtr y = Function::yn(1); FunctionPtr c; if (useConstantConvection) { c = Function::vectorize(Function::constant(0.5), Function::constant(0.5), Function::constant(1.0)); } else { c = Function::vectorize(y-0.5, 0.5-x, Function::constant(1.0)); } FunctionPtr n = Function::normal(); BFPtr bf = Teuchos::rcp( new BF(varFactory) ); bf->addTerm( u, c * v->grad()); bf->addTerm(qHat, v); double width = 2.0, height = 2.0; int horizontalCells = numCells, verticalCells = numCells; int depthCells = numTimeCells; double x0 = -0.5; double y0 = -0.5; double t0 = 0; double totalTime = 2.0 * PI; vector<double> frameTimes; for (int i=0; i<numFrames; i++) { frameTimes.push_back((totalTime*i) / (numFrames-1)); } if (numTimeSlabs==-1) { // want the number of grid points in temporal direction to be about 2000. 
The temporal length is 2 * PI numTimeSlabs = (int) 2000 / k; } double timeLengthPerSlab = totalTime / numTimeSlabs; if (rank==0) { cout << "solving on " << numCells << " x " << numCells << " x " << numTimeCells << " mesh " << "of order " << k << ".\n"; cout << "numTimeSlabs: " << numTimeSlabs << endl; } SpatialFilterPtr inflowFilter = Teuchos::rcp( new InflowFilterForClockwisePlanarRotation (x0,x0+width,y0,y0+height,0.5,0.5)); vector<double> dimensions; dimensions.push_back(width); dimensions.push_back(height); dimensions.push_back(timeLengthPerSlab); vector<int> elementCounts(3); elementCounts[0] = horizontalCells; elementCounts[1] = verticalCells; elementCounts[2] = depthCells; vector<double> origin(3); origin[0] = x0; origin[1] = y0; origin[2] = t0; Teuchos::RCP<Solver> solver = Teuchos::rcp( new KluSolver ); #ifdef HAVE_AMESOS_MUMPS if (useMumpsIfAvailable) solver = Teuchos::rcp( new MumpsSolver ); #endif // double errorPercentage = 0.5; // for mesh refinements: ask to refine elements that account for 80% of the error in each step // Teuchos::RCP<RefinementStrategy> refinementStrategy; // refinementStrategy = Teuchos::rcp( new ErrorPercentageRefinementStrategy( soln, errorPercentage )); if (maxRefinements != 0) { cout << "Warning: maxRefinements is not 0, but the slice exporter implicitly assumes there won't be any refinements.\n"; } MeshPtr mesh; MeshPtr prevMesh; SolutionPtr prevSoln; mesh = MeshFactory::rectilinearMesh(bf, dimensions, elementCounts, H1Order, delta_k, origin); if (rank==0) cout << "Initial mesh has " << mesh->getTopology()->activeCellCount() << " active (leaf) cells " << "and " << mesh->globalDofCount() << " degrees of freedom.\n"; FunctionPtr sideParity = Function::sideParity(); int lastFrameOutputted = -1; SolutionPtr soln; IPPtr ip; ip = bf->graphNorm(); FunctionPtr u0 = Teuchos::rcp( new Cone_U0(0.0, 0.25, 0.1, 1.0, false) ); BCPtr bc = BC::bc(); bc->addDirichlet(qHat, inflowFilter, Function::zero()); // zero BCs enforced at the inflow 
boundary. bc->addDirichlet(qHat, SpatialFilter::matchingZ(t0), u0); MeshPtr initialMesh = mesh; int startingSlabNumber; if (previousSolutionTimeSlabNumber != -1) { startingSlabNumber = previousSolutionTimeSlabNumber + 1; if (rank==0) cout << "Loading mesh from " << previousMeshFile << endl; prevMesh = MeshFactory::loadFromHDF5(bf, previousMeshFile); prevSoln = Solution::solution(mesh, bc, RHS::rhs(), ip); // include BC and IP objects for sake of condensed dof interpreter setup... prevSoln->setUseCondensedSolve(useCondensedSolve); if (rank==0) cout << "Loading solution from " << previousSolutionFile << endl; prevSoln->loadFromHDF5(previousSolutionFile); double tn = (previousSolutionTimeSlabNumber+1) * timeLengthPerSlab; origin[2] = tn; mesh = MeshFactory::rectilinearMesh(bf, dimensions, elementCounts, H1Order, delta_k, origin); FunctionPtr q_prev = Function::solution(qHat, prevSoln); FunctionPtr q_transfer = Teuchos::rcp( new MeshTransferFunction(-q_prev, prevMesh, mesh, tn) ); // negate because the normals go in opposite directions bc = BC::bc(); bc->addDirichlet(qHat, inflowFilter, Function::zero()); // zero BCs enforced at the inflow boundary. 
bc->addDirichlet(qHat, SpatialFilter::matchingZ(tn), q_transfer); double t_slab_final = (previousSolutionTimeSlabNumber+1) * timeLengthPerSlab; int frameOrdinal = 0; while (frameTimes[frameOrdinal] < t_slab_final) { lastFrameOutputted = frameOrdinal++; } } else { startingSlabNumber = 0; } #ifdef HAVE_EPETRAEXT_HDF5 ostringstream dir_name; dir_name << "spacetime_slice_convectingCone_k" << k << "_startSlab" << startingSlabNumber; map<GlobalIndexType,GlobalIndexType> cellMap; MeshPtr meshSlice = MeshTools::timeSliceMesh(initialMesh, 0, cellMap, H1Order); HDF5Exporter sliceExporter(meshSlice,dir_name.str()); #endif soln = Solution::solution(mesh, bc, RHS::rhs(), ip); soln->setUseCondensedSolve(useCondensedSolve); for(int timeSlab = startingSlabNumber; timeSlab<numTimeSlabs; timeSlab++) { double energyThreshold = 0.2; // for mesh refinements: ask to refine elements that account for 80% of the error in each step Teuchos::RCP<RefinementStrategy> refinementStrategy; refinementStrategy = Teuchos::rcp( new RefinementStrategy( soln, energyThreshold )); FunctionPtr u_spacetime = Function::solution(u, soln); double relativeEnergyError; int refNumber = 0; // { // // DEBUGGING: just to try running the time slicing: // double t_slab_final = (timeStep+1) * timeLengthPerSlab; // int frameOrdinal = lastFrameOutputted + 1; // while (frameTimes[frameOrdinal] < t_slab_final) { // FunctionPtr u_spacetime = Function::solution(u, soln); // ostringstream dir_name; // dir_name << "spacetime_slice_convectingCone_k" << k; // MeshTools::timeSliceExport(dir_name.str(), mesh, u_spacetime, frameTimes[frameOrdinal], "u_slice"); // // cout << "Exported frame " << frameOrdinal << ", t=" << frameTimes[frameOrdinal] << endl; // frameOrdinal++; // } // } do { soln->solve(solver); soln->reportTimings(); #ifdef HAVE_EPETRAEXT_HDF5 ostringstream dir_name; dir_name << "spacetime_convectingCone_k" << k << "_t" << timeSlab; HDF5Exporter exporter(soln->mesh(),dir_name.str()); exporter.exportSolution(soln, 
varFactory); if (rank==0) cout << "Exported HDF solution for time slab to directory " << dir_name.str() << endl; // string u_name = "u_spacetime"; // exporter.exportFunction(u_spacetime, u_name); ostringstream file_name; file_name << dir_name.str(); bool saveSolutionAndMeshForThisSlab = ((timeSlab + 1) % checkPointFrequency == 0); // +1 so that first output is nth, not first if (saveSolutionAndMeshForThisSlab) { dir_name << ".soln"; soln->saveToHDF5(dir_name.str()); if (rank==0) cout << endl << "wrote " << dir_name.str() << endl; file_name << ".mesh"; soln->mesh()->saveToHDF5(file_name.str()); } #endif FunctionPtr u_soln = Function::solution(u, soln); double solnNorm = u_soln->l2norm(mesh); double energyError = soln->energyErrorTotal(); relativeEnergyError = energyError / solnNorm; if (rank==0) { cout << "Relative energy error for refinement " << refNumber++ << ": " << relativeEnergyError << endl; } if ((relativeEnergyError > refinementTolerance) && (refNumber < maxRefinements)) { refinementStrategy->refine(); if (rank==0) { cout << "After refinement, mesh has " << mesh->getTopology()->activeCellCount() << " active (leaf) cells " << "and " << mesh->globalDofCount() << " degrees of freedom.\n"; } } } while ((relativeEnergyError > refinementTolerance) && (refNumber < maxRefinements)); double t_slab_final = (timeSlab+1) * timeLengthPerSlab; int frameOrdinal = lastFrameOutputted + 1; vector<double> timesForSlab; while (frameTimes[frameOrdinal] < t_slab_final) { double t = frameTimes[frameOrdinal]; if (rank==0) cout << "exporting t=" << t << " on slab " << timeSlab << endl; FunctionPtr sliceFunction = MeshTools::timeSliceFunction(mesh, cellMap, u_spacetime, t); sliceExporter.exportFunction(sliceFunction, "u_slice", t); lastFrameOutputted = frameOrdinal++; } // set up next mesh/solution: FunctionPtr q_prev = Function::solution(qHat, soln); // cout << "Error in setup of q_prev: simple solution doesn't know about the map from the previous time slab to the current one. 
(TODO: fix this.)\n"; double tn = (timeSlab+1) * timeLengthPerSlab; origin[2] = tn; mesh = MeshFactory::rectilinearMesh(bf, dimensions, elementCounts, H1Order, delta_k, origin); FunctionPtr q_transfer = Teuchos::rcp( new MeshTransferFunction(-q_prev, soln->mesh(), mesh, tn) ); // negate because the normals go in opposite directions bc = BC::bc(); bc->addDirichlet(qHat, inflowFilter, Function::zero()); // zero BCs enforced at the inflow boundary. bc->addDirichlet(qHat, SpatialFilter::matchingZ(tn), q_transfer); // IMPORTANT: now that we are ready to step to next soln, nullify BC. If we do not do this, then we have an RCP chain // that extends back to the first time slab, effectively a memory leak. soln->setBC(BC::bc()); soln = Solution::solution(mesh, bc, RHS::rhs(), ip); soln->setUseCondensedSolve(useCondensedSolve); } return 0; }