Example #1
0
/**
 * Initialize the application: configure floating-point trapping, start the
 * SDL video subsystem, open the main window and create the rendering device.
 *
 * @return ARTI3D_UNKOWN if SDL initialization or window creation fails;
 *         otherwise whatever CreateAndInitializeDevice() reports.
 */
Arti3DResult Arti3DApp::Init()
{
	// The window and the device are created with the same dimensions, so both
	// are driven from a single pair of constants.
	const int kWindowWidth = 800;
	const int kWindowHeight = 600;

	// Turn on all floating point exceptions except inexact exceptions.
	// _MM_SET_EXCEPTION_MASK replaces the whole mask field, so one call is
	// enough; a preliminary call with 0 has no lasting effect.
	_MM_SET_EXCEPTION_MASK(_MM_MASK_INEXACT);

	if (SDL_Init(SDL_INIT_VIDEO) != 0)
	{
		fprintf_s(stderr, "SDL_Init Failed!\n");
		return ARTI3D_UNKOWN;
	}

	Arti3DResult a3dr = CreateArti3DWindow(&m_pWindow,
		"Arti3DApp",
		SDL_WINDOWPOS_UNDEFINED,
		SDL_WINDOWPOS_UNDEFINED,
		kWindowWidth, kWindowHeight,
		SDL_WINDOW_SHOWN);

	// Any window creation failure is reported to the caller as ARTI3D_UNKOWN.
	if (a3dr != ARTI3D_OK)
		return ARTI3D_UNKOWN;

	Arti3DDeviceParameter a3dDeviceParameter;
	a3dDeviceParameter.bMultiThread = true;
	a3dDeviceParameter.iWidth = kWindowWidth;
	a3dDeviceParameter.iHeight = kWindowHeight;

	a3dr = CreateAndInitializeDevice(&m_pDevice, &a3dDeviceParameter);

	return a3dr;
}
Example #2
0
File: fp.c Project: Kun-Qu/petsc
EXTERN_C_END

#undef __FUNCT__
#define __FUNCT__ "PetscSetFPTrap"
/*
   PetscSetFPTrap - Switches trapping of floating point exceptions on or off
   and records the requested mode in _trapmode.

   Input Parameter:
.  on - PETSC_FP_TRAP_ON to activate trapping; any other value restores the
        default floating point environment and the default SIGFPE action.

   Every failing library call is reported through SETERRQ with PETSC_ERR_LIB.
*/
PetscErrorCode  PetscSetFPTrap(PetscFPTrap on)
{
  PetscFunctionBegin;
  if (on != PETSC_FP_TRAP_ON) {
    /* Trapping off: back to the default environment and default signal action. */
    if (fesetenv(FE_DFL_ENV) != 0) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Cannot disable floating point exceptions");
    }
    if (signal(SIGFPE,SIG_DFL) == SIG_ERR) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Can't clear floating point handler\n");
    }
  } else {
    /* Drop any flags that are already raised; otherwise switching trapping on
       could invoke the signal handler right away. */
    if (feclearexcept(FE_ALL_EXCEPT) != 0) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Cannot clear floating point exception flags\n");
    }
#if defined FE_NOMASK_ENV
    /* fesetenv(FE_NOMASK_ENV) would also work, but it produces spurious
       exceptions (like gettimeofday() -> PetscLogDouble), so the wanted
       exceptions are enabled explicitly instead. */
    if (feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW | FE_UNDERFLOW) == -1) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Cannot activate floating point exceptions\n");
    }
#elif defined PETSC_HAVE_XMMINTRIN_H
    /* Leave only the inexact exception masked. */
    _MM_SET_EXCEPTION_MASK(_MM_MASK_INEXACT);
#else
    /* C99 offers no portable way to modify the environment, so trapping
       cannot be activated here. */
#endif
    if (signal(SIGFPE,PetscDefaultFPTrap) == SIG_ERR) {
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Can't set floating point handler\n");
    }
  }
  _trapmode = on;
  PetscFunctionReturn(0);
}
Example #3
0
/**
 * Toggle floating point exceptions -- courtesy of Cody Permann & MOOSE team
 *
 * With \p on == true, trapping is enabled (divide-by-zero and invalid via
 * feenableexcept() when available, otherwise the SSE "invalid" exception is
 * unmasked) and, when sigaction() is available, libmesh_handleFPE is
 * installed for SIGFPE unless SIGFPE is currently ignored.
 *
 * With \p on == false, trapping is disabled again and the default SIGFPE
 * action is restored.  In the SSE path the mask captured by the matching
 * enable call is put back; if no enable call preceded it, the mask is left
 * untouched.
 */
void enableFPE(bool on)
{
#if !defined(LIBMESH_HAVE_FEENABLEEXCEPT) && defined(LIBMESH_HAVE_XMMINTRIN_H) && !defined(__SUNPRO_CC)
  // Exception mask that was active before trapping was switched on, and
  // whether such a mask has actually been captured.  Without the second flag
  // a disable-before-enable would write a mask of 0 (everything unmasked),
  // and a repeated enable would overwrite the saved mask with the modified one.
  static int flags = 0;
  static bool flags_saved = false;
#endif

  if (on)
    {
#ifdef LIBMESH_HAVE_FEENABLEEXCEPT
      feenableexcept(FE_DIVBYZERO | FE_INVALID);
#elif  LIBMESH_HAVE_XMMINTRIN_H
#  ifndef __SUNPRO_CC
      const int current_mask = _MM_GET_EXCEPTION_MASK();
      if (!flags_saved)
        {
          flags = current_mask;           // store the flags
          flags_saved = true;
        }
      _MM_SET_EXCEPTION_MASK(current_mask & ~_MM_MASK_INVALID);
#  endif
#endif

#if LIBMESH_HAVE_DECL_SIGACTION
      struct sigaction new_action, old_action;

      // Set up the structure to specify the new action.
      new_action.sa_sigaction = libmesh_handleFPE;
      sigemptyset (&new_action.sa_mask);
      new_action.sa_flags = SA_SIGINFO;

      // Only install our handler if SIGFPE is not currently being ignored.
      sigaction (SIGFPE, nullptr, &old_action);
      if (old_action.sa_handler != SIG_IGN)
        sigaction (SIGFPE, &new_action, nullptr);
#endif
    }
  else
    {
#ifdef LIBMESH_HAVE_FEDISABLEEXCEPT
      fedisableexcept(FE_DIVBYZERO | FE_INVALID);
#elif  LIBMESH_HAVE_XMMINTRIN_H
#  ifndef __SUNPRO_CC
      // Restore only a mask that was really captured by an earlier enable.
      if (flags_saved)
        {
          _MM_SET_EXCEPTION_MASK(flags);
          flags_saved = false;
        }
#  endif
#endif
      signal(SIGFPE, SIG_DFL);
    }
}
Example #4
0
/**
 * Turn on trapping for the floating-point exceptions of interest.
 *
 * On Apple builds the SSE exception mask is written directly; everywhere else
 * the pending exception flags are cleared and divide-by-zero, invalid and
 * overflow are enabled through feenableexcept().
 */
void EnableFPE()
{
#ifndef __APPLE__
  // Discard exception flags that are already raised.
  feclearexcept(FE_ALL_EXCEPT);

  // Trap only the selected exceptions; underflow is deliberately left out.
  const int trapped = FE_OVERFLOW | FE_INVALID | FE_DIVBYZERO;
  feenableexcept(trapped);
#else
  // Keep inexact and underflow masked so that all the interesting
  // exceptions are caught.
  _MM_SET_EXCEPTION_MASK(_MM_MASK_UNDERFLOW | _MM_MASK_INEXACT);
#endif
}
Example #5
0
/**
 * Program entry point: optionally unmasks the SSE invalid-operation
 * exception, initializes MPI (when built with it) and Kokkos, runs the
 * simulation described by the options file, and lets rank 0 print the
 * elapsed wall-clock time.
 */
int main(int argc, char *argv[])
{
#ifdef MINIAERO_FPMATH_CHECK
  // Unmask only the "invalid" exception; all other mask bits stay as they are.
  _MM_SET_EXCEPTION_MASK(~_MM_MASK_INVALID & _MM_GET_EXCEPTION_MASK());
#endif

  int num_procs, my_id;
#if WITH_MPI
  MPI_Init(&argc,&argv);
  MPI_Comm_size(MPI_COMM_WORLD, &num_procs);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_id);
  const double wall_begin = MPI_Wtime();
#else
  // Serial build: a single process with id 0, timed with time().
  time_t wall_begin = 0;
  time(&wall_begin);
  num_procs = 1;
  my_id = 0;
#endif

  Options simulation_options;
  simulation_options.read_options_file();

  Kokkos::initialize(argc,argv);
  run(simulation_options);

  // The clock is stopped before Kokkos (and MPI) are shut down.
#if WITH_MPI
  const double wall_end = MPI_Wtime();
  const double elapsedTime = wall_end - wall_begin;
#else
  time_t wall_end = 0;
  time(&wall_end);
  const double elapsedTime = difftime(wall_end, wall_begin);
#endif
  Kokkos::finalize();
#if WITH_MPI
  MPI_Finalize();
#endif

  // Only the first process reports the timing.
  if (my_id == 0)
  {
    fprintf(stdout,"\n ... Total elapsed time: %8.2f seconds ...\n",elapsedTime);
  }
  return 0;
}
Example #6
0
File: wlc.c Project: UIKit0/wlc
/* Register fpehandler for SIGFPE through the caller-supplied sigaction record
 * and unmask the floating point exceptions selected for the platform.
 * The body is empty unless building for Linux, Windows or with OSX_SSE_FPE. */
static void
fpesetup(struct sigaction *action)
{
#if defined(__linux__) || defined(_WIN32) || defined(OSX_SSE_FPE)
   /* hook up the handler first */
   action->sa_handler = fpehandler;
   sigaction(SIGFPE, action, NULL);

#  if defined(__GNUC__) && defined(__linux__)
   /* trap invalid operations, division by zero and overflow */
   feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
#  endif /* __GNUC__ && __linux__ */

#  if defined(OSX_SSE_FPE)
   /* The SSE setup below causes issues, so it is skipped on purpose. */
   return;
   /* OSX does its floating point math with SSE by default, hence the
    * exceptions would be unmasked with SSE instructions here. */
   _MM_SET_EXCEPTION_MASK(~(_MM_MASK_DIV_ZERO | _MM_MASK_INVALID | _MM_MASK_OVERFLOW) & _MM_MASK_MASK);
#  endif /* OSX_SSE_FPE */

#  if defined(_MSC_VER) && defined(_WIN32)
   /* step 1: enable every fp exception */
   _controlfp_s(NULL, 0, _MCW_EM);
   /* step 2: mask again the ones we don't care about */
   _controlfp_s(NULL, _EM_INEXACT | _EM_UNDERFLOW | _EM_DENORMAL, _MCW_EM);
#  endif /* _MSC_VER && _WIN32 */
#endif
}
Example #7
0
/* Floating point exception setup (code originates from the blender project).
 * Installs _glhckFpeHandler for SIGFPE and unmasks the exceptions selected
 * for the platform; does nothing unless building for Linux, Windows or with
 * OSX_SSE_FPE. */
static void _glhckSetupFPE(void)
{
#if defined(__linux__) || defined(_WIN32) || defined(OSX_SSE_FPE)
   /* Zealous, but it makes float issues far easier to track down:
    * put a breakpoint on the handler registered here. */
   signal(SIGFPE, _glhckFpeHandler);

#  if defined(__GNUC__) && defined(__linux__)
   /* trap invalid operations, division by zero and overflow */
   feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
#  endif /* __GNUC__ && __linux__ */

#  if defined(OSX_SSE_FPE)
   /* The SSE setup below causes issues, so it is skipped on purpose. */
   return;
   /* OSX does its floating point math with SSE by default, hence the
    * exceptions would be unmasked with SSE instructions here. */
   _MM_SET_EXCEPTION_MASK(~(_MM_MASK_DIV_ZERO | _MM_MASK_INVALID | _MM_MASK_OVERFLOW) & _MM_MASK_MASK);
#  endif /* OSX_SSE_FPE */

#  if defined(_MSC_VER) && defined(_WIN32)
   /* step 1: enable every fp exception */
   _controlfp_s(NULL, 0, _MCW_EM);
   /* step 2: mask again the ones we don't care about */
   _controlfp_s(NULL, _EM_INEXACT | _EM_UNDERFLOW | _EM_DENORMAL, _MCW_EM);
#  endif /* _MSC_VER && _WIN32 */
#endif
}
int main(int argc, char *argv[])
{
#ifdef ENABLE_INTEL_FLOATING_POINT_EXCEPTIONS
  cout << "NOTE: enabling floating point exceptions for divide by zero.\n";
  _MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() & ~_MM_MASK_INVALID);
#endif

  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
  int rank = Teuchos::GlobalMPISession::getRank();

#ifdef HAVE_MPI
  Epetra_MpiComm Comm(MPI_COMM_WORLD);
  //cout << "rank: " << rank << " of " << numProcs << endl;
#else
  Epetra_SerialComm Comm;
#endif

  Comm.Barrier(); // set breakpoint here to allow debugger attachment to other MPI processes than the one you automatically attached to.

  Teuchos::CommandLineProcessor cmdp(false,true); // false: don't throw exceptions; true: do return errors for unrecognized options

  double minTol = 1e-8;

  bool use3D = false;
  int refCount = 10;

  int k = 4; // poly order for field variables
  int delta_k = use3D ? 3 : 2;   // test space enrichment
  int k_coarse = 0;

  bool useMumps = true;
  bool useGMGSolver = true;

  bool enforceOneIrregularity = true;
  bool useStaticCondensation = false;
  bool conformingTraces = false;
  bool useDiagonalScaling = false; // of the global stiffness matrix in GMGSolver

  bool printRefinementDetails = false;

  bool useWeightedGraphNorm = true; // graph norm scaled according to units, more or less

  int numCells = 2;

  int AztecOutputLevel = 1;
  int gmgMaxIterations = 10000;
  int smootherOverlap = 0;
  double relativeTol = 1e-6;
  double D = 1.0; // characteristic length scale

  cmdp.setOption("polyOrder",&k,"polynomial order for field variable u");
  cmdp.setOption("delta_k", &delta_k, "test space polynomial order enrichment");
  cmdp.setOption("k_coarse", &k_coarse, "polynomial order for field variables on coarse mesh");
  cmdp.setOption("numRefs",&refCount,"number of refinements");
  cmdp.setOption("D", &D, "domain dimension");
  cmdp.setOption("useConformingTraces", "useNonConformingTraces", &conformingTraces);
  cmdp.setOption("enforceOneIrregularity", "dontEnforceOneIrregularity", &enforceOneIrregularity);

  cmdp.setOption("smootherOverlap", &smootherOverlap, "overlap for smoother");

  cmdp.setOption("printRefinementDetails", "dontPrintRefinementDetails", &printRefinementDetails);
  cmdp.setOption("azOutput", &AztecOutputLevel, "Aztec output level");
  cmdp.setOption("numCells", &numCells, "number of cells in the initial mesh");
  cmdp.setOption("useScaledGraphNorm", "dontUseScaledGraphNorm", &useWeightedGraphNorm);
//  cmdp.setOption("gmgTol", &gmgTolerance, "tolerance for GMG convergence");
  cmdp.setOption("relativeTol", &relativeTol, "Energy error-relative tolerance for iterative solver.");
  cmdp.setOption("gmgMaxIterations", &gmgMaxIterations, "tolerance for GMG convergence");

  bool enhanceUField = false;
  cmdp.setOption("enhanceUField", "dontEnhanceUField", &enhanceUField);
  cmdp.setOption("useStaticCondensation", "dontUseStaticCondensation", &useStaticCondensation);

  if (cmdp.parse(argc,argv) != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL)
  {
#ifdef HAVE_MPI
    MPI_Finalize();
#endif
    return -1;
  }

  double width = D, height = D, depth = D;

  VarFactory varFactory;
  // fields:
  VarPtr u = varFactory.fieldVar("u", L2);
  VarPtr sigma = varFactory.fieldVar("\\sigma", VECTOR_L2);

  FunctionPtr n = Function::normal();
  // traces:
  VarPtr u_hat;

  if (conformingTraces)
  {
    u_hat = varFactory.traceVar("\\widehat{u}", u);
  }
  else
  {
    cout << "Note: using non-conforming traces.\n";
    u_hat = varFactory.traceVar("\\widehat{u}", u, L2);
  }
  VarPtr sigma_n_hat = varFactory.fluxVar("\\widehat{\\sigma}_{n}", sigma * n);

  // test functions:
  VarPtr tau = varFactory.testVar("\\tau", HDIV);
  VarPtr v = varFactory.testVar("v", HGRAD);

  BFPtr poissonBF = Teuchos::rcp( new BF(varFactory) );
  FunctionPtr alpha = Function::constant(1); // viscosity

  // tau terms:
  poissonBF->addTerm(sigma / alpha, tau);
  poissonBF->addTerm(-u, tau->div()); // (sigma1, tau1)
  poissonBF->addTerm(u_hat, tau * n);

  // v terms:
  poissonBF->addTerm(- sigma, v->grad()); // (mu sigma1, grad v1)
  poissonBF->addTerm( sigma_n_hat, v);

  int horizontalCells = numCells, verticalCells = numCells, depthCells = numCells;

  vector<double> domainDimensions;
  domainDimensions.push_back(width);
  domainDimensions.push_back(height);

  vector<int> elementCounts;
  elementCounts.push_back(horizontalCells);
  elementCounts.push_back(verticalCells);

  if (use3D)
  {
    domainDimensions.push_back(depth);
    elementCounts.push_back(depthCells);
  }

  MeshPtr mesh, k0Mesh;

  int H1Order = k + 1;
  int H1Order_coarse = k_coarse + 1;
  if (!use3D)
  {
    Teuchos::ParameterList pl;

    map<int,int> trialOrderEnhancements;

    if (enhanceUField)
    {
      trialOrderEnhancements[u->ID()] = 1;
    }

    BFPtr poissonBilinearForm = poissonBF;

    pl.set("useMinRule", true);
    pl.set("bf",poissonBilinearForm);
    pl.set("H1Order", H1Order);
    pl.set("delta_k", delta_k);
    pl.set("horizontalElements", horizontalCells);
    pl.set("verticalElements", verticalCells);
    pl.set("divideIntoTriangles", false);
    pl.set("useConformingTraces", conformingTraces);
    pl.set("trialOrderEnhancements", &trialOrderEnhancements);
    pl.set("x0",(double)0);
    pl.set("y0",(double)0);
    pl.set("width", width);
    pl.set("height",height);

    mesh = MeshFactory::quadMesh(pl);

    pl.set("H1Order", H1Order_coarse);
    k0Mesh = MeshFactory::quadMesh(pl);

  }
  else
  {
    mesh = MeshFactory::rectilinearMesh(poissonBF, domainDimensions, elementCounts, H1Order, delta_k);
    k0Mesh = MeshFactory::rectilinearMesh(poissonBF, domainDimensions, elementCounts, H1Order_coarse, delta_k);
  }

  mesh->registerObserver(k0Mesh); // ensure that the k0 mesh refinements track those of the solution mesh

  RHSPtr rhs = RHS::rhs(); // zero
  FunctionPtr sin_pi_x = Teuchos::rcp( new Sin_ax(PI/D) );
  FunctionPtr sin_pi_y = Teuchos::rcp( new Sin_ay(PI/D) );
  FunctionPtr u_exact = sin_pi_x * sin_pi_y;
  FunctionPtr f = -(2.0 * PI * PI / (D * D)) * sin_pi_x * sin_pi_y;
  rhs->addTerm( f * v );

  BCPtr bc = BC::bc();
  SpatialFilterPtr boundary = SpatialFilter::allSpace();

  bc->addDirichlet(u_hat, boundary, u_exact);

  IPPtr graphNorm;

  FunctionPtr h = Teuchos::rcp( new hFunction() );

  if (useWeightedGraphNorm)
  {
    graphNorm = IP::ip();
    graphNorm->addTerm( tau->div() ); // u
    graphNorm->addTerm( (h / alpha) * tau - h * v->grad() ); // sigma
    graphNorm->addTerm( v ); // boundary term (adjoint to u)
    graphNorm->addTerm( h * tau );

//    // new effort, with the idea that the test norm should be considered in reference space, basically
//    graphNorm = IP::ip();
//    graphNorm->addTerm( tau->div() ); // u
//    graphNorm->addTerm( tau / h - v->grad() ); // sigma
//    graphNorm->addTerm( v / h ); // boundary term (adjoint to u)
//    graphNorm->addTerm( tau / h );
  }
  else
  {
    map<int, double> trialWeights; // on the squared terms in the trial space norm
    trialWeights[u->ID()] = 1.0 / (D * D);
    trialWeights[sigma->ID()] = 1.0;
    graphNorm = poissonBF->graphNorm(trialWeights, 1.0); // 1.0: weight on the L^2 terms
  }

  SolutionPtr solution = Solution::solution(mesh, bc, rhs, graphNorm);
  solution->setUseCondensedSolve(useStaticCondensation);

  mesh->registerSolution(solution); // sign up for projection of old solution onto refined cells.

  double energyThreshold = 0.2;
  RefinementStrategy refinementStrategy( solution, energyThreshold );

  refinementStrategy.setReportPerCellErrors(true);
  refinementStrategy.setEnforceOneIrregularity(enforceOneIrregularity);

  Teuchos::RCP<Solver> coarseSolver, fineSolver;
  if (useMumps)
  {
#ifdef HAVE_AMESOS_MUMPS
    coarseSolver = Teuchos::rcp( new MumpsSolver(512, true) );
#else
    cout << "useMumps=true, but MUMPS is not available!\n";
    exit(0);
#endif
  }
  else
  {
    coarseSolver = Teuchos::rcp( new KluSolver );
  }
  GMGSolver* gmgSolver;

  if (useGMGSolver)
  {
    double tol = relativeTol;
    int maxIters = gmgMaxIterations;
    BCPtr zeroBCs = bc->copyImposingZero();
    gmgSolver = new GMGSolver(zeroBCs, k0Mesh, graphNorm, mesh, solution->getDofInterpreter(),
                              solution->getPartitionMap(), maxIters, tol, coarseSolver,
                              useStaticCondensation);

    gmgSolver->setAztecOutput(AztecOutputLevel);
    gmgSolver->setUseConjugateGradient(true);
    gmgSolver->gmgOperator()->setSmootherType(GMGOperator::IFPACK_ADDITIVE_SCHWARZ);
    gmgSolver->gmgOperator()->setSmootherOverlap(smootherOverlap);

    fineSolver = Teuchos::rcp( gmgSolver );
  }
  else
  {
    fineSolver = coarseSolver;
  }

//  if (rank==0) cout << "experimentally starting by solving with MUMPS on the fine mesh.\n";
//  solution->solve( Teuchos::rcp( new MumpsSolver) );

  solution->solve(fineSolver);

#ifdef HAVE_EPETRAEXT_HDF5
  ostringstream dir_name;
  dir_name << "poissonCavityFlow_k" << k;
  HDF5Exporter exporter(mesh,dir_name.str());
  exporter.exportSolution(solution,varFactory,0);
#endif

#ifdef HAVE_AMESOS_MUMPS
  if (useMumps) coarseSolver = Teuchos::rcp( new MumpsSolver(512, true) );
#endif

  solution->reportTimings();
  if (useGMGSolver) gmgSolver->gmgOperator()->reportTimings();
  for (int refIndex=0; refIndex < refCount; refIndex++)
  {
    double energyError = solution->energyErrorTotal();
    GlobalIndexType numFluxDofs = mesh->numFluxDofs();
    if (rank==0)
    {
      cout << "Before refinement " << refIndex << ", energy error = " << energyError;
      cout << " (using " << numFluxDofs << " trace degrees of freedom)." << endl;
    }
    bool printToConsole = printRefinementDetails && (rank==0);
    refinementStrategy.refine(printToConsole);

    if (useStaticCondensation)
    {
      CondensedDofInterpreter* condensedDofInterpreter = dynamic_cast<CondensedDofInterpreter*>(solution->getDofInterpreter().get());
      if (condensedDofInterpreter != NULL)
      {
        condensedDofInterpreter->reinitialize();
      }
    }

    GlobalIndexType fineDofs = mesh->globalDofCount();
    GlobalIndexType coarseDofs = k0Mesh->globalDofCount();
    if (rank==0)
    {
      cout << "After refinement, coarse mesh has " << k0Mesh->numActiveElements() << " elements and " << coarseDofs << " dofs.\n";
      cout << "  Fine mesh has " << mesh->numActiveElements() << " elements and " << fineDofs << " dofs.\n";
    }

    if (!use3D)
    {
      ostringstream fineMeshLocation, coarseMeshLocation;
      fineMeshLocation << "poissonFineMesh_k" << k << "_ref" << refIndex;
      GnuPlotUtil::writeComputationalMeshSkeleton(fineMeshLocation.str(), mesh, true); // true: label cells
      coarseMeshLocation << "poissonCoarseMesh_k" << k << "_ref" << refIndex;
      GnuPlotUtil::writeComputationalMeshSkeleton(coarseMeshLocation.str(), k0Mesh, true); // true: label cells
    }

    if (useGMGSolver)   // create fresh fineSolver now that the meshes have changed:
    {
#ifdef HAVE_AMESOS_MUMPS
      if (useMumps) coarseSolver = Teuchos::rcp( new MumpsSolver(512, true) );
#endif
      double tol = max(relativeTol * energyError, minTol);
      int maxIters = gmgMaxIterations;
      BCPtr zeroBCs = bc->copyImposingZero();
      gmgSolver = new GMGSolver(zeroBCs, k0Mesh, graphNorm, mesh, solution->getDofInterpreter(),
                                solution->getPartitionMap(), maxIters, tol, coarseSolver, useStaticCondensation);
      gmgSolver->setAztecOutput(AztecOutputLevel);
      gmgSolver->setUseDiagonalScaling(useDiagonalScaling);
      fineSolver = Teuchos::rcp( gmgSolver );
    }

    solution->solve(fineSolver);
    solution->reportTimings();
    if (useGMGSolver) gmgSolver->gmgOperator()->reportTimings();

#ifdef HAVE_EPETRAEXT_HDF5
    exporter.exportSolution(solution,varFactory,refIndex+1);
#endif
  }
  double energyErrorTotal = solution->energyErrorTotal();

  GlobalIndexType numFluxDofs = mesh->numFluxDofs();
  GlobalIndexType numGlobalDofs = mesh->numGlobalDofs();
  if (rank==0)
  {
    cout << "Final mesh has " << mesh->numActiveElements() << " elements and " << numFluxDofs << " trace dofs (";
    cout << numGlobalDofs << " total dofs, including fields).\n";
    cout << "Final energy error: " << energyErrorTotal << endl;
  }

#ifdef HAVE_EPETRAEXT_HDF5
  exporter.exportSolution(solution,varFactory,0);
#endif

  if (!use3D)
  {
    GnuPlotUtil::writeComputationalMeshSkeleton("poissonRefinedMesh", mesh, true);
  }

  coarseSolver = Teuchos::rcp((Solver*) NULL); // without this when useMumps = true and running on one rank, we see a crash on exit, which may have to do with MPI being finalized before coarseSolver is deleted.

  return 0;
}
Example #9
0
  /*! Constructs a device from the configuration string \p cfg: verifies the
   *  CPU, parses the configuration (string first, then the optional config
   *  files in the executable and home folders), sets up huge pages, the
   *  tessellation cache and floating point exceptions, registers the BVH
   *  factories and starts the tasking system. */
  Device::Device (const char* cfg)
  {
    /* the CPU has to support the ISA this code was built for */
    if (!hasISA(ISA)) {
      throw_RTCError(RTC_ERROR_UNSUPPORTED_CPU,"CPU does not support " ISA_STR);
    }

    /* global state: the configuration string comes first, then the config
     * files; ignore_config_files is checked again before each file */
    State::parseString(cfg);
    if (!ignore_config_files && FileName::executableFolder() != FileName("")) {
      State::parseFile(FileName::executableFolder()+FileName(".embree" TOSTRING(RTC_VERSION_MAJOR)));
    }
    if (!ignore_config_files && FileName::homeFolder() != FileName("")) {
      State::parseFile(FileName::homeFolder()+FileName(".embree" TOSTRING(RTC_VERSION_MAJOR)));
    }
    State::verify();

    /*! internal self test (active only when asserts are compiled in) */
    assert(isa::Cylinder::verify());

    /*! huge page support, if requested */
#if defined(__WIN32__)
    if (State::enable_selockmemoryprivilege) {
      State::hugepages_success &= win_enable_selockmemoryprivilege(State::verbosity(3));
    }
#endif
    State::hugepages_success &= os_init(State::hugepages,State::verbosity(3));

    /*! size of the tessellation cache */
    setCacheSize( State::tessellation_cache_size );

    /*! trap selected floating point exceptions to catch bugs */
    if (State::float_exceptions)
    {
      /* only denormal and divide-by-zero get unmasked; invalid, overflow,
       * underflow and inexact remain masked */
      const int exceptions = _MM_MASK_MASK & ~(_MM_MASK_DENORM | _MM_MASK_DIV_ZERO);
      _MM_SET_EXCEPTION_MASK(exceptions);
    }

    /* info header, depending on verbosity */
    if (State::verbosity(1)) {
      print();
    }
    if (State::verbosity(2)) {
      State::print();
    }

    /* register all algorithms */
    bvh4_factory = make_unique(new BVH4Factory(enabled_builder_cpu_features, enabled_cpu_features));

#if defined(EMBREE_TARGET_SIMD8)
    bvh8_factory = make_unique(new BVH8Factory(enabled_builder_cpu_features, enabled_cpu_features));
#endif

    /* start the tasking system */
    initTaskingSystem(numThreads);

    /* ray stream SOA to AOS conversion */
#if defined(EMBREE_RAY_PACKETS)
    RayStreamFilterFuncsType rayStreamFilterFuncs;
    SELECT_SYMBOL_DEFAULT_SSE42_AVX_AVX2_AVX512KNL_AVX512SKX(enabled_cpu_features,rayStreamFilterFuncs);
    rayStreamFilters = rayStreamFilterFuncs();
#endif
  }
Example #10
0
/**
 * OpenGeoSys-6 command line entry point.
 *
 * Parses the command line, configures logging and (optionally) floating point
 * exception trapping, reads the project file, initializes all processes and
 * runs the time loop.
 *
 * @return EXIT_SUCCESS if the time loop reports success, EXIT_FAILURE if it
 *         does not or if a std::exception is caught.
 */
int main(int argc, char* argv[])
{
    // Parse CLI arguments.
    TCLAP::CmdLine cmd(
        "OpenGeoSys-6 software.\n"
        "Copyright (c) 2012-2018, OpenGeoSys Community "
        "(http://www.opengeosys.org) "
        "Distributed under a Modified BSD License. "
        "See accompanying file LICENSE.txt or "
        "http://www.opengeosys.org/project/license\n"
        "version: " +
            BaseLib::BuildInfo::git_describe + "\n" +
        "CMake arguments: " +
            BaseLib::BuildInfo::cmake_args,
        ' ',
        BaseLib::BuildInfo::git_describe);

    TCLAP::UnlabeledValueArg<std::string> project_arg(
        "project-file",
        "Path to the ogs6 project file.",
        true,
        "",
        "PROJECT FILE");
    cmd.add(project_arg);

    TCLAP::ValueArg<std::string> outdir_arg("o", "output-directory",
                                            "the output directory to write to",
                                            false, "", "output directory");
    cmd.add(outdir_arg);

    // The default log level depends on the build type: "info" when NDEBUG is
    // defined, "all" otherwise.
    TCLAP::ValueArg<std::string> log_level_arg("l", "log-level",
                                               "the verbosity of logging "
                                               "messages: none, error, warn, "
                                               "info, debug, all",
                                               false,
#ifdef NDEBUG
                                               "info",
#else
                                               "all",
#endif
                                               "log level");
    cmd.add(log_level_arg);

    TCLAP::SwitchArg nonfatal_arg("",
                                  "config-warnings-nonfatal",
                                  "warnings from parsing the configuration "
                                  "file will not trigger program abortion");
    cmd.add(nonfatal_arg);

    TCLAP::SwitchArg unbuffered_cout_arg("", "unbuffered-std-out",
                                         "use unbuffered standard output");
    cmd.add(unbuffered_cout_arg);

#ifndef _WIN32  // TODO: On windows floating point exceptions are not handled
                // currently
    TCLAP::SwitchArg enable_fpe_arg("", "enable-fpe",
                                    "enables floating point exceptions");
    cmd.add(enable_fpe_arg);
#endif  // _WIN32

    cmd.parse(argc, argv);

    // deactivate buffer for standard output if specified
    if (unbuffered_cout_arg.isSet())
        std::cout.setf(std::ios::unitbuf);

    ApplicationsLib::LogogSetup logog_setup;
    logog_setup.setLevel(log_level_arg.getValue());

    INFO("This is OpenGeoSys-6 version %s.",
         BaseLib::BuildInfo::git_describe.c_str());

#ifndef _WIN32  // On windows this command line option is not present.
    // Enable floating point exceptions. The body is braced explicitly so the
    // condition cannot silently bind to a different statement if the
    // preprocessor branches change.
    if (enable_fpe_arg.isSet())
    {
#ifdef __APPLE__
        _MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() & ~_MM_MASK_INVALID);
#else
        feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
#endif  // __APPLE__
    }
#endif  // _WIN32

#ifdef OGS_USE_PYTHON
    pybind11::scoped_interpreter guard = ApplicationsLib::setupEmbeddedPython();
    (void)guard;
#endif

    BaseLib::RunTime run_time;

    {
        auto const start_time = std::chrono::system_clock::now();
        auto const time_str = BaseLib::formatDate(start_time);
        INFO("OGS started on %s.", time_str.c_str());
    }

    auto ogs_status = EXIT_SUCCESS;

    try
    {
        bool solver_succeeded = false;
        {
            ApplicationsLib::LinearSolverLibrarySetup
                linear_solver_library_setup(argc, argv);
#if defined(USE_PETSC)
            vtkSmartPointer<vtkMPIController> controller =
                vtkSmartPointer<vtkMPIController>::New();
            controller->Initialize(&argc, &argv, 1);
            vtkMPIController::SetGlobalController(controller);

            logog_setup.setFormatter(
                std::make_unique<BaseLib::TemplateLogogFormatterSuppressedGCC<
                    TOPIC_LEVEL_FLAG | TOPIC_FILE_NAME_FLAG |
                    TOPIC_LINE_NUMBER_FLAG>>());
#endif
            run_time.start();

            auto project_config = BaseLib::makeConfigTree(
                project_arg.getValue(), !nonfatal_arg.getValue(),
                "OpenGeoSysProject");

            ProjectData project(*project_config,
                                BaseLib::extractPath(project_arg.getValue()),
                                outdir_arg.getValue());

#ifdef USE_INSITU
            auto isInsituConfigured = false;
            //! \ogs_file_param{prj__insitu}
            if (auto t = project_config->getConfigSubtreeOptional("insitu"))
            {
                InSituLib::Initialize(
                    //! \ogs_file_param{prj__insitu__scripts}
                    t->getConfigSubtree("scripts"),
                    BaseLib::extractPath(project_arg.getValue()));
                isInsituConfigured = true;
            }
#else
            project_config->ignoreConfigParameter("insitu");
#endif

            INFO("Initialize processes.");
            for (auto& p : project.getProcesses())
            {
                p.second->initialize();
            }

            // Check intermediately that config parsing went fine.
            // A single check is sufficient here; nothing happens between
            // this point and the start of the solve.
            project_config.checkAndInvalidate();
            BaseLib::ConfigTree::assertNoSwallowedErrors();

            INFO("Solve processes.");

            auto& time_loop = project.getTimeLoop();
            solver_succeeded = time_loop.loop();

#ifdef USE_INSITU
            if (isInsituConfigured)
                InSituLib::Finalize();
#endif
            INFO("[time] Execution took %g s.", run_time.elapsed());

#if defined(USE_PETSC)
            controller->Finalize(1);
#endif
        }  // This nested scope ensures that everything that could possibly
           // possess a ConfigTree is destructed before the final check below is
           // done.

        BaseLib::ConfigTree::assertNoSwallowedErrors();

        ogs_status = solver_succeeded ? EXIT_SUCCESS : EXIT_FAILURE;
    }
    catch (const std::exception& e)
    {
        // The message is passed as an argument, never as the format string:
        // it may contain '%' characters.
        ERR("%s", e.what());
        ogs_status = EXIT_FAILURE;
    }

    {
        auto const end_time = std::chrono::system_clock::now();
        auto const time_str = BaseLib::formatDate(end_time);
        INFO("OGS terminated on %s.", time_str.c_str());
    }

    return ogs_status;
}
Example #11
0
int main()
{
	float *arr = get_arr(); // [4, 3, 2, 1]
	float *uarr = get_uarr(); // [5, 4, 3, 2]
	float *arr2 = get_arr2(); // [4, 3, 2, 1]
	float *uarr2 = get_uarr2(); // [5, 4, 3, 2]
	__m128 a = get_a(); // [8, 6, 4, 2]
	__m128 b = get_b(); // [1, 2, 3, 4]

	// Check that test data is like expected.
	Assert(((uintptr_t)arr & 0xF) == 0); // arr must be aligned by 16.
	Assert(((uintptr_t)uarr & 0xF) != 0); // uarr must be unaligned.
	Assert(((uintptr_t)arr2 & 0xF) == 0); // arr must be aligned by 16.
	Assert(((uintptr_t)uarr2 & 0xF) != 0); // uarr must be unaligned.

	// Test that aeq itself works and does not trivially return true on everything.
	Assert(aeq_("",_mm_load_ps(arr), 4.f, 3.f, 2.f, 0.f, false) == false);
#ifdef TEST_M64
	Assert(aeq64(u64castm64(0x22446688AACCEEFFULL), 0xABABABABABABABABULL, false) == false);
#endif
	// SSE1 Load instructions:	
	aeq(_mm_load_ps(arr), 4.f, 3.f, 2.f, 1.f); // 4-wide load from aligned address.
	aeq(_mm_load_ps1(uarr), 2.f, 2.f, 2.f, 2.f); // Load scalar from unaligned address and populate 4-wide.
	aeq(_mm_load_ss(uarr), 0.f, 0.f, 0.f, 2.f); // Load scalar from unaligned address to lowest, and zero all highest.
	aeq(_mm_load1_ps(uarr), 2.f, 2.f, 2.f, 2.f); // _mm_load1_ps == _mm_load_ps1
	aeq(_mm_loadh_pi(a, (__m64*)uarr), 3.f, 2.f, 4.f, 2.f); // Load two highest addresses, preserve two lowest.
	aeq(_mm_loadl_pi(a, (__m64*)uarr), 8.f, 6.f, 3.f, 2.f); // Load two lowest addresses, preserve two highest.
	aeq(_mm_loadr_ps(arr), 1.f, 2.f, 3.f, 4.f); // 4-wide load from an aligned address, but reverse order.
	aeq(_mm_loadu_ps(uarr), 5.f, 4.f, 3.f, 2.f); // 4-wide load from an unaligned address.

	// SSE1 Set instructions:
	aeq(_mm_set_ps(uarr[3], 2.f, 3.f, 4.f), 5.f, 2.f, 3.f, 4.f); // 4-wide set by specifying four immediate or memory operands.
	aeq(_mm_set_ps1(uarr[3]), 5.f, 5.f, 5.f, 5.f); // 4-wide set by specifying one scalar that is expanded.
	aeq(_mm_set_ss(uarr[3]), 0.f, 0.f, 0.f, 5.f); // Set scalar at lowest index, zero all higher.
	aeq(_mm_set1_ps(uarr[3]), 5.f, 5.f, 5.f, 5.f); // _mm_set1_ps == _mm_set_ps1
	aeq(_mm_setr_ps(uarr[3], 2.f, 3.f, 4.f), 4.f, 3.f, 2.f, 5.f); // 4-wide set by specifying four immediate or memory operands, but reverse order.
	aeq(_mm_setzero_ps(), 0.f, 0.f, 0.f, 0.f); // Returns a new zero register.

	// SSE1 Move instructions:
	aeq(_mm_move_ss(a, b), 8.f, 6.f, 4.f, 4.f); // Copy three highest elements from a, and lowest from b.
	aeq(_mm_movehl_ps(a, b), 8.f, 6.f, 1.f, 2.f); // Copy two highest elements from a, and take two highest from b and place them to the two lowest in output.
	aeq(_mm_movelh_ps(a, b), 3.f, 4.f, 4.f, 2.f); // Copy two lowest elements from a, and take two lowest from b and place them to the two highest in output.

	// SSE1 Store instructions:
#ifdef TEST_M64
	/*M64*/*(uint64_t*)uarr = 0xCDCDCDCDCDCDCDCDULL; _mm_maskmove_si64(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr); Assert(*(uint64_t*)uarr == 0xCDEEDDCDCDAA99CDULL); // _mm_maskmove_si64: Conditionally store bytes of a 64-bit value.
	/*M64*/*(uint64_t*)uarr = 0xABABABABABABABABULL;       _m_maskmovq(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr); Assert(*(uint64_t*)uarr == 0xABEEDDABABAA99ABULL); // _m_maskmovq is an alias to _mm_maskmove_si64.
#endif
	_mm_store_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_store_ps: 4-wide store to aligned memory address.
	_mm_store_ps1(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store_ps1: Store lowest scalar to aligned address, duplicating the element 4 times. 
	_mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_store_ss(uarr2, b); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 100.f, 4.f); // _mm_store_ss: Store lowest scalar to unaligned address. Don't adjust higher addresses in memory.
	_mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_store1_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store1_ps == _mm_store_ps1
	_mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storeh_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 8.f, 6.f); // _mm_storeh_pi: Store two highest elements to memory.
	_mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storel_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 4.f, 2.f); // _mm_storel_pi: Store two lowest elements to memory.
	_mm_storer_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 4.f, 6.f, 8.f); // _mm_storer_ps: 4-wide store to aligned memory address, but reverse the elements on output.
	_mm_storeu_ps(uarr2, a); aeq(_mm_loadu_ps(uarr2), 8.f, 6.f, 4.f, 2.f); // _mm_storeu_ps: 4-wide store to unaligned memory address.
#ifdef TEST_M64
	/*M64*/_mm_stream_pi((__m64*)uarr, u64castm64(0x0080FF7F01FEFF40ULL)); Assert(*(uint64_t*)uarr == 0x0080FF7F01FEFF40ULL); // _mm_stream_pi: 2-wide store, but with a non-temporal memory cache hint.
#endif
	_mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_stream_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_stream_ps: 4-wide store, but with a non-temporal memory cache hint.

	// SSE1 Arithmetic instructions:
	aeq(_mm_add_ps(a, b), 9.f, 8.f, 7.f, 6.f); // 4-wide add.
	aeq(_mm_add_ss(a, b), 8.f, 6.f, 4.f, 6.f); // Add lowest element, preserve three highest unchanged from a.
	aeq(_mm_div_ps(a, _mm_set_ps(2.f, 3.f, 8.f, 2.f)), 4.f, 2.f, 0.5f, 1.f); // 4-wide div.
	aeq(_mm_div_ss(a, _mm_set_ps(2.f, 3.f, 8.f, 8.f)), 8.f, 6.f, 4.f, 0.25f); // Div lowest element, preserve three highest unchanged from a.
	aeq(_mm_mul_ps(a, b), 8.f, 12.f, 12.f, 8.f); // 4-wide mul.
	aeq(_mm_mul_ss(a, b), 8.f, 6.f, 4.f, 8.f); // Mul lowest element, preserve three highest unchanged from a.
#ifdef TEST_M64
	__m64 m1 = get_m1();
	/*M64*/aeq64(_mm_mulhi_pu16(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // Multiply u16 channels, and store high parts.
	/*M64*/aeq64(    _m_pmulhuw(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // _m_pmulhuw is an alias to _mm_mulhi_pu16.
	__m64 m2 = get_m2();
	/*M64*/aeq64(_mm_sad_pu8(m1, m2), 0x368ULL); // Compute abs. differences of u8 channels, and sum those up to a single 16-bit scalar.
	/*M64*/aeq64(  _m_psadbw(m1, m2), 0x368ULL); // _m_psadbw is an alias to _mm_sad_pu8.
#endif
	aeq(_mm_sub_ps(a, b), 7.f, 4.f, 1.f, -2.f); // 4-wide sub.
	aeq(_mm_sub_ss(a, b), 8.f, 6.f, 4.f, -2.f); // Sub lowest element, preserve three highest unchanged from a.

	// SSE1 Elementary Math functions:
#ifndef __EMSCRIPTEN__ // TODO: Enable support for this to pass.
	aeq(_mm_rcp_ps(a), 0.124969f, 0.166626f, 0.249939f, 0.499878f); // Compute 4-wide 1/x.
	aeq(_mm_rcp_ss(a), 8.f, 6.f, 4.f, 0.499878f); // Compute 1/x of lowest element, pass higher elements unchanged.
	aeq(_mm_rsqrt_ps(a), 0.353455f, 0.408203f, 0.499878f, 0.706909f); // Compute 4-wide 1/sqrt(x).
	aeq(_mm_rsqrt_ss(a), 8.f, 6.f, 4.f, 0.706909f); // Compute 1/sqrt(x) of lowest element, pass higher elements unchanged.
#endif
	aeq(_mm_sqrt_ps(a), 2.82843f, 2.44949f, 2.f, 1.41421f); // Compute 4-wide sqrt(x).
	aeq(_mm_sqrt_ss(a), 8.f, 6.f, 4.f, 1.41421f); // Compute sqrt(x) of lowest element, pass higher elements unchanged.

	__m128 i1 = get_i1();
	__m128 i2 = get_i2();

	// SSE1 Logical instructions:
#ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these.
	aeqi(_mm_and_ps(i1, i2), 0x83200100, 0x0fecc988, 0x80244021, 0x13458a88); // 4-wide binary AND
	aeqi(_mm_andnot_ps(i1, i2), 0x388a9888, 0xf0021444, 0x7000289c, 0x00121046); // 4-wide binary (!i1) & i2
	aeqi(_mm_or_ps(i1, i2), 0xbfefdba9, 0xffefdfed, 0xf7656bbd, 0xffffdbef); // 4-wide binary OR
	aeqi(_mm_xor_ps(i1, i2), 0x3ccfdaa9, 0xf0031665, 0x77412b9c, 0xecba5167); // 4-wide binary XOR
#endif

	// SSE1 Compare instructions:
	// a = [8, 6, 4, 2], b = [1, 2, 3, 4]
	aeqi(_mm_cmpeq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp ==
	aeqi(_mm_cmpeq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp ==, pass three highest unchanged.
	aeqi(_mm_cmpge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp >=
	aeqi(_mm_cmpge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp >=, pass three highest unchanged.
	aeqi(_mm_cmpgt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp >
	aeqi(_mm_cmpgt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp >, pass three highest unchanged.
	aeqi(_mm_cmple_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp <=
	aeqi(_mm_cmple_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <=, pass three highest unchanged.
	aeqi(_mm_cmplt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp <
	aeqi(_mm_cmplt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <, pass three highest unchanged.
	aeqi(_mm_cmpneq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp !=
	aeqi(_mm_cmpneq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp !=, pass three highest unchanged.
	aeqi(_mm_cmpnge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not >=
	aeqi(_mm_cmpnge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp not >=, pass three highest unchanged.
	aeqi(_mm_cmpngt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not >
	aeqi(_mm_cmpngt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not >, pass three highest unchanged.
	aeqi(_mm_cmpnle_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp not <=
	aeqi(_mm_cmpnle_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <=, pass three highest unchanged.
	aeqi(_mm_cmpnlt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp not <
	aeqi(_mm_cmpnlt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <, pass three highest unchanged.

	__m128 nan1 = get_nan1(); // [NAN, 0, 0, NAN]
	__m128 nan2 = get_nan2(); // [NAN, NAN, 0, 0]
	aeqi(_mm_cmpord_ps(nan1, nan2), 0, 0, 0xFFFFFFFF, 0); // 4-wide test if both operands are not nan.
	aeqi(_mm_cmpord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0); // scalar test if both operands are not nan, pass three highest unchanged.
	// Intel Intrinsics Guide documentation is wrong on _mm_cmpunord_ps and _mm_cmpunord_ss. MSDN is right: http://msdn.microsoft.com/en-us/library/khy6fk1t(v=vs.90).aspx
	aeqi(_mm_cmpunord_ps(nan1, nan2), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide test if one of the operands is nan.
#ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these.
	aeqi(_mm_cmpunord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0xFFFFFFFF); // scalar test if one of the operands is nan, pass three highest unchanged.
#endif

	Assert(_mm_comieq_ss(a, b) == 0); Assert(_mm_comieq_ss(a, a) == 1); // Scalar cmp == of lowest element, return int.
	Assert(_mm_comige_ss(a, b) == 0); Assert(_mm_comige_ss(a, a) == 1); // Scalar cmp >= of lowest element, return int.
	Assert(_mm_comigt_ss(b, a) == 1); Assert(_mm_comigt_ss(a, a) == 0); // Scalar cmp > of lowest element, return int.
	Assert(_mm_comile_ss(b, a) == 0); Assert(_mm_comile_ss(a, a) == 1); // Scalar cmp <= of lowest element, return int.
	Assert(_mm_comilt_ss(a, b) == 1); Assert(_mm_comilt_ss(a, a) == 0); // Scalar cmp < of lowest element, return int.
	Assert(_mm_comineq_ss(a, b) == 1); Assert(_mm_comineq_ss(a, a) == 0); // Scalar cmp != of lowest element, return int.

	// The ucomi versions are identical to comi, except that ucomi signal a FP exception only if one of the input operands is a SNaN, whereas the comi versions signal a FP
	// exception when one of the input operands is either a QNaN or a SNaN.
#ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly.
	Assert(_mm_ucomieq_ss(a, b) == 0); Assert(_mm_ucomieq_ss(a, a) == 1); Assert(_mm_ucomieq_ss(a, nan1) == 1);
#endif
	Assert(_mm_ucomige_ss(a, b) == 0); Assert(_mm_ucomige_ss(a, a) == 1); Assert(_mm_ucomige_ss(a, nan1) == 0);
	Assert(_mm_ucomigt_ss(b, a) == 1); Assert(_mm_ucomigt_ss(a, a) == 0); Assert(_mm_ucomigt_ss(a, nan1) == 0);
	Assert(_mm_ucomile_ss(b, a) == 0); Assert(_mm_ucomile_ss(a, a) == 1); Assert(_mm_ucomile_ss(a, nan1) == 1);
	Assert(_mm_ucomilt_ss(a, b) == 1); Assert(_mm_ucomilt_ss(a, a) == 0); Assert(_mm_ucomilt_ss(a, nan1) == 1);
#ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly.
	Assert(_mm_ucomineq_ss(a, b) == 1); Assert(_mm_ucomineq_ss(a, a) == 0); Assert(_mm_ucomineq_ss(a, nan1) == 0);
#endif

	// SSE1 Convert instructions:
	__m128 c = get_c(); // [1.5, 2.5, 3.5, 4.5]
	__m128 e = get_e(); // [INF, -INF, 2.5, 3.5]
	__m128 f = get_f(); // [-1.5, 1.5, -2.5, -9223372036854775808]
#ifdef TEST_M64
	/*M64*/aeq(_mm_cvt_pi2ps(a, m2), 8.f, 6.f, -19088744.f, 1985229312.f); // 2-way int32 to float conversion to two lowest channels of m128.
	/*M64*/aeq64(_mm_cvt_ps2pi(c), 0x400000004ULL); // 2-way two lowest floats from m128 to integer, return as m64.
#endif
	aeq(_mm_cvtsi32_ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // Convert int to float, store in lowest channel of m128.
	aeq( _mm_cvt_si2ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // _mm_cvt_si2ss is an alias to _mm_cvtsi32_ss.
#ifndef __EMSCRIPTEN__ // TODO: Fix banker's rounding in cvt functions.
	Assert(_mm_cvtss_si32(c) == 4); Assert(_mm_cvtss_si32(e) == 4); // Convert lowest channel of m128 from float to int.
	Assert( _mm_cvt_ss2si(c) == 4); Assert( _mm_cvt_ss2si(e) == 4); // _mm_cvt_ss2si is an alias to _mm_cvtss_si32.
#endif
#ifdef TEST_M64
	/*M64*/aeq(_mm_cvtpi16_ps(m1), 255.f , -32767.f, 4336.f, 14207.f); // 4-way convert int16s to floats, return in a m128.
	/*M64*/aeq(_mm_cvtpi32_ps(a, m1), 8.f, 6.f, 16744449.f, 284178304.f); // 2-way convert int32s to floats, return in two lowest channels of m128, pass two highest unchanged.
	/*M64*/aeq(_mm_cvtpi32x2_ps(m1, m2), -19088744.f, 1985229312.f, 16744449.f, 284178304.f); // 4-way convert int32s from two different m64s to float.
	/*M64*/aeq(_mm_cvtpi8_ps(m1), 16.f, -16.f, 55.f, 127.f); // 4-way convert int8s from lowest end of m64 to float in a m128.
	/*M64*/aeq64(_mm_cvtps_pi16(c), 0x0002000200040004ULL); // 4-way convert floats to int16s in a m64.
	/*M64*/aeq64(_mm_cvtps_pi32(c), 0x0000000400000004ULL); // 2-way convert two lowest floats to int32s in a m64.
	/*M64*/aeq64(_mm_cvtps_pi8(c),  0x0000000002020404ULL); // 4-way convert floats to int8s in a m64, zero higher half of the returned m64.
	/*M64*/aeq(_mm_cvtpu16_ps(m1), 255.f , 32769.f, 4336.f, 14207.f); // 4-way convert uint16s to floats, return in a m128.
	/*M64*/aeq(_mm_cvtpu8_ps(m1), 16.f, 240.f, 55.f, 127.f); // 4-way convert uint8s from lowest end of m64 to float in a m128.
#endif
	aeq(_mm_cvtsi64_ss(c, -9223372036854775808ULL), 1.5f, 2.5f, 3.5f, -9223372036854775808.f); // Convert single int64 to float, store in lowest channel of m128, and pass three higher channel unchanged.
	Assert(_mm_cvtss_f32(c) == 4.5f); // Extract lowest channel of m128 to a plain old float.
	Assert(_mm_cvtss_si64(f) == -9223372036854775808ULL); // Convert lowest channel of m128 from float to int64.
#ifdef TEST_M64
	/*M64*/aeq64(_mm_cvtt_ps2pi(e), 0x0000000200000003ULL); aeq64(_mm_cvtt_ps2pi(f), 0xfffffffe80000000ULL); // Truncating conversion from two lowest floats of m128 to int32s, return in a m64.
#endif
	Assert(_mm_cvttss_si32(e) == 3); // Truncating conversion from the lowest float of a m128 to int32.
	Assert( _mm_cvtt_ss2si(e) == 3); // _mm_cvtt_ss2si is an alias to _mm_cvttss_si32.
#ifdef TEST_M64
	/*M64*/aeq64(_mm_cvttps_pi32(c), 0x0000000300000004ULL); // Truncating conversion from two lowest floats of m128 to m64.
#endif
	Assert(_mm_cvttss_si64(f) == -9223372036854775808ULL); // Truncating conversion from lowest channel of m128 from float to int64.

#ifndef __EMSCRIPTEN__ // TODO: Not implemented.
	// SSE1 General support:
	unsigned int mask = _MM_GET_EXCEPTION_MASK();
	_MM_SET_EXCEPTION_MASK(mask);
	unsigned int flushZeroMode = _MM_GET_FLUSH_ZERO_MODE();
	_MM_SET_FLUSH_ZERO_MODE(flushZeroMode);
	unsigned int roundingMode = _MM_GET_ROUNDING_MODE();
	_MM_SET_ROUNDING_MODE(roundingMode);
	unsigned int csr = _mm_getcsr();
	_mm_setcsr(csr);
	unsigned char dummyData[4096];
	_mm_prefetch(dummyData, _MM_HINT_T0);
	_mm_prefetch(dummyData, _MM_HINT_T1);
	_mm_prefetch(dummyData, _MM_HINT_T2);
	_mm_prefetch(dummyData, _MM_HINT_NTA);
	_mm_sfence();
#endif

	// SSE1 Misc instructions:
#ifdef TEST_M64
	/*M64*/Assert(_mm_movemask_pi8(m1) == 100); // Return int with eight lowest bits set depending on the highest bits of the 8 uint8 input channels of the m64.
	/*M64*/Assert(     _m_pmovmskb(m1) == 100); // _m_pmovmskb is an alias to _mm_movemask_pi8.
#endif
	Assert(_mm_movemask_ps(_mm_set_ps(-1.f, 0.f, 1.f, NAN)) == 8); Assert(_mm_movemask_ps(_mm_set_ps(-INFINITY, -0.f, INFINITY, -INFINITY)) == 13); // Return int with four lowest bits set depending on the highest bits of the 4 m128 input channels.

	// SSE1 Probability/Statistics instructions:
#ifdef TEST_M64
	/*M64*/aeq64(_mm_avg_pu16(m1, m2), 0x7FEE9D4D43A234C8ULL); // 4-way average uint16s.
	/*M64*/aeq64(    _m_pavgw(m1, m2), 0x7FEE9D4D43A234C8ULL); // _m_pavgw is an alias to _mm_avg_pu16.
	/*M64*/aeq64(_mm_avg_pu8(m1, m2),  0x7FEE9D4D43A23548ULL); // 8-way average uint8s.
	/*M64*/aeq64(   _m_pavgb(m1, m2),  0x7FEE9D4D43A23548ULL); // _m_pavgb is an alias to _mm_avg_pu8.

	// SSE1 Special Math instructions:
	/*M64*/aeq64(_mm_max_pi16(m1, m2), 0xFFBA987654377FULL); // 4-way average uint16s.
	/*M64*/aeq64(   _m_pmaxsw(m1, m2), 0xFFBA987654377FULL); // _m_pmaxsw is an alias to _mm_max_pi16.
	/*M64*/aeq64(_mm_max_pu8(m1, m2), 0xFEFFBA9876F0377FULL); // 4-way average uint16s.
	/*M64*/aeq64(  _m_pmaxub(m1, m2), 0xFEFFBA9876F0377FULL); // _m_pmaxub is an alias to _mm_max_pu8.
	/*M64*/aeq64(_mm_min_pi16(m1, m2), 0xFEDC800110F03210ULL); // 4-way average uint16s.
	/*M64*/aeq64(   _m_pminsw(m1, m2), 0xFEDC800110F03210ULL); // is an alias to _mm_min_pi16.
	/*M64*/aeq64(_mm_min_pu8(m1, m2), 0xDC800110543210ULL); // 4-way average uint16s.
	/*M64*/aeq64(  _m_pminub(m1, m2), 0xDC800110543210ULL); // is an alias to _mm_min_pu8.
#endif
	// a = [8, 6, 4, 2], b = [1, 2, 3, 4]
	aeq(_mm_max_ps(a, b), 8.f, 6.f, 4.f, 4.f); // 4-wide max.
	aeq(_mm_max_ss(a, _mm_set1_ps(100.f)), 8.f, 6.f, 4.f, 100.f); // Scalar max, pass three highest unchanged.
	aeq(_mm_min_ps(a, b), 1.f, 2.f, 3.f, 2.f); // 4-wide min.
	aeq(_mm_min_ss(a, _mm_set1_ps(-100.f)), 8.f, 6.f, 4.f, -100.f); // Scalar min, pass three highest unchanged.

	// SSE1 Swizzle instructions:
#ifdef TEST_M64
	/*M64*/Assert(_mm_extract_pi16(m1, 1) == 4336); // Extract the given int16 channel from a m64.
	/*M64*/Assert(       _m_pextrw(m1, 1) == 4336); // _m_pextrw is an alias to _mm_extract_pi16.
	/*M64*/aeq64(_mm_insert_pi16(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // Insert a int16 to a specific channel of a m64.
	/*M64*/aeq64(      _m_pinsrw(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // _m_pinsrw is an alias to _mm_insert_pi16.
	/*M64*/aeq64(_mm_shuffle_pi16(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // Shuffle int16s around in the 4 channels of the m64.
	/*M64*/aeq64(       _m_pshufw(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // _m_pshufw is an alias to _mm_shuffle_pi16.
#endif
	aeq(_mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 3, 2)), 3.f, 4.f, 8.f, 6.f);
	aeq(_mm_unpackhi_ps(a, b), 1.f , 8.f, 2.f, 6.f);
	aeq(_mm_unpacklo_ps(a, b), 3.f , 4.f, 4.f, 2.f);

	// Transposing a matrix via the xmmintrin.h-provided intrinsic.
	__m128 c0 = a; // [8, 6, 4, 2]
	__m128 c1 = b; // [1, 2, 3, 4]
	__m128 c2 = get_c(); // [1.5, 2.5, 3.5, 4.5]
	__m128 c3 = get_d(); // [8.5, 6.5, 4.5, 2.5]
	_MM_TRANSPOSE4_PS(c0, c1, c2, c3);
	aeq(c0, 2.5f, 4.5f, 4.f, 2.f);
	aeq(c1, 4.5f, 3.5f, 3.f, 4.f);
	aeq(c2, 6.5f, 2.5f, 2.f, 6.f);
	aeq(c3, 8.5f, 1.5f, 1.f, 8.f);

	// All done!
	if (numFailures == 0)
		printf("Success!\n");
	else
		printf("%d tests failed!\n", numFailures);
}
Example #12
0
// Entry point of naivepca: "Usage: naivepca [-m min_maf] <in.txt>".
//
// Input format as parsed below: one sample per line, consisting of a name,
// a run of blanks/tabs, and then one character per site. A digit '0'..'9' is
// taken as the genotype value; any other character is recorded as missing.
// "-" as the file name reads from stdin.
//
// Pipeline:
//   1. read the genotype matrix C (n_rows samples x n_cols sites);
//   2. centre/scale every site into the double matrix M;
//   3. form the symmetric n_rows x n_rows matrix X = M * M^T / n_cols;
//   4. hand X to n_eigen_symm() and print one line per sample to stdout.
//
// Returns 0 on success, 1 when no input file is given, 2 when the input
// cannot be opened. Progress messages go to stderr.
int main(int argc, char *argv[])
{
	int c, dret, lineno = 0, n_rows = 0, m_rows = 0, n_cols = 0, max_hap = 0;
	int64_t n_missing = 0, n_tot = 0;
	gzFile fp;
	kstream_t *ks;
	kstring_t str = {0,0,0};
	int8_t **C = 0;
	double **M, *X, min_maf = 0.0;
	char **names = 0;

	// Clear the INVALID and DIV_ZERO mask bits so that those floating-point
	// exceptions trap instead of silently producing NaN/Inf. Every division
	// below must therefore be guarded against a zero denominator.
//	_MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() & ~(_MM_MASK_INVALID | _MM_MASK_OVERFLOW | _MM_MASK_UNDERFLOW | _MM_MASK_DIV_ZERO));
	_MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() & ~(_MM_MASK_INVALID | _MM_MASK_DIV_ZERO));
	while ((c = getopt(argc, argv, "m:")) >= 0) {
		if (c == 'm') min_maf = atof(optarg);
	}
	if (argc - optind == 0) {
		fprintf(stderr, "Usage: naivepca [-m min_maf] <in.txt>\n");
		return 1;
	}

	fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
	if (fp == 0) {
		fprintf(stderr, "[E::%s] failed to open file '%s'. Abort.\n", __func__, argv[optind]);
		return 2;
	}
	ks = ks_init(fp);

	// read the matrix into C
	while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) {
		int8_t *q;
		char *p, *name = str.s;
		int i;
		++lineno;
		// Terminate the name at the first blank/tab, then skip the separator run
		// so that p points at the first genotype character.
		for (p = str.s; *p && *p != '\t' && *p != ' '; ++p);
		if (*p) {
			*p++ = 0;
			for (; *p && (*p == '\t' || *p == ' '); ++p);
		}
		if (*p == 0) {
			fprintf(stderr, "[W::%s] line %d has one field; skipped.\n", __func__, lineno);
			continue;
		}
		// The first accepted line fixes the number of sites; later lines must match.
		if (n_cols != 0) {
			if (n_cols != str.s + str.l - p) {
				fprintf(stderr, "[W::%s] line %d has a different number of columns; skipped.\n", __func__, lineno);
				continue;
			}
		} else n_cols = str.s + str.l - p;
		// Grow the row tables geometrically (16, 32, 64, ...).
		if (n_rows == m_rows) {
			m_rows = m_rows? m_rows<<1 : 16;
			C = (int8_t**)realloc(C, m_rows * sizeof(int8_t*));
			names = (char**)realloc(names, m_rows * sizeof(char*));
		}
		names[n_rows] = strdup(name);
		// One int8_t per site (the buffer used to be sized with sizeof(double)).
		q = C[n_rows++] = (int8_t*)calloc(n_cols, sizeof(int8_t));
		for (i = 0; i < n_cols; ++i) {
			if (p[i] >= '0' && p[i] <= '9') q[i] = p[i] - '0';
			else q[i] = -1, ++n_missing; // -1 marks a missing genotype
			max_hap = max_hap > q[i]? max_hap : q[i];
		}
		n_tot += n_cols;
	}
	free(str.s);
	fprintf(stderr, "[M::%s] read %d samples and %d sites; ploidy is %d\n", __func__, n_rows, n_cols, max_hap);
	// Report a true percentage, and avoid 0/0 (which would trap) on empty input.
	fprintf(stderr, "[M::%s] %.3f%% of genotypes are missing\n", __func__,
			n_tot > 0? 100.0 * (double)n_missing / (double)n_tot : 0.0);

	{ // normalize the matrix into M
		int i, j, *sum, *cnt, n_dropped = 0;
		double *mu, *pp;
		sum = (int*)calloc(n_cols, sizeof(int));
		cnt = (int*)calloc(n_cols, sizeof(int));
		mu = (double*)calloc(n_cols, sizeof(double));
		pp = (double*)calloc(n_cols, sizeof(double));
		// Per-site sum and count over the non-missing genotypes.
		for (i = 0; i < n_rows; ++i) {
			int8_t *q = C[i];
			for (j = 0; j < n_cols; ++j)
				if (q[j] >= 0) sum[j] += q[j], ++cnt[j];
		}
		for (j = 0; j < n_cols; ++j) {
			if (cnt[j] > 0) {
				mu[j] = (double)sum[j] / cnt[j];
				// max_hap is 0 when every genotype is '0' or missing; mu[j] is then 0
				// as well, so use 0 directly instead of evaluating a trapping 0/0.
				pp[j] = max_hap > 0? mu[j] / max_hap : 0.;
				if (pp[j] < min_maf || 1. - pp[j] < min_maf) ++n_dropped;
			} else ++n_dropped;
		}
		fprintf(stderr, "[M::%s] %d rare sites are dropped\n", __func__, n_dropped);
		M = (double**)calloc(n_rows, sizeof(double*));
		for (i = 0; i < n_rows; ++i) {
			int8_t *q = C[i];
			double *r;
			r = M[i] = (double*)calloc(n_cols, sizeof(double));
			// Missing genotypes, sites below min_maf and monomorphic sites (pp of 0 or 1)
			// contribute 0; otherwise store (q - mu) / sqrt(pp * (1 - pp)).
			for (j = 0; j < n_cols; ++j)
				r[j] = q[j] < 0 || pp[j] < min_maf || 1. - pp[j] < min_maf || pp[j] == 0. || 1 - pp[j] == 0. ? 0. : (q[j] - mu[j]) / sqrt(pp[j] * (1. - pp[j]));
		}
		free(sum); free(cnt); free(mu); free(pp);
		for (i = 0; i < n_rows; ++i) free(C[i]);
		free(C);
	}

	{ // multiplication
		int i, j, k;
		X = (double*)calloc(n_rows * n_rows, sizeof(double));
		for (i = 0; i < n_rows; ++i) {
			double *zi = M[i];
			// Only the lower triangle is computed; each value is mirrored into the upper one.
			for (j = 0; j <= i; ++j) {
				double t = 0., *zj = M[j];
				for (k = 0; k < n_cols; ++k)
					t += zi[k] * zj[k];
				// n_cols is at least 1 whenever a row exists, so this division is safe.
				X[i*n_rows + j] = X[j*n_rows + i] = t / n_cols;
			}
		}
		for (i = 0; i < n_rows; ++i) free(M[i]);
		free(M);
	}

	{ // print eigen vectors
		double *ev;
		int i, j;
		evsrt_t *evsrt;
		ev = (double*)calloc(n_rows, sizeof(double));
		evsrt = (evsrt_t*)calloc(n_rows, sizeof(evsrt_t));
		// NOTE(review): n_eigen_symm() is defined elsewhere; presumably it overwrites X
		// with the eigenvectors and fills ev with the eigenvalues - confirm.
		n_eigen_symm(X, n_rows, ev);
		for (i = 0; i < n_rows; ++i)
			evsrt[i].ev = ev[i], evsrt[i].i = i;
		// Sort the (value, original index) pairs; the ordering comes from the
		// ksort instantiation named "ev", which is declared outside this block.
		ks_introsort(ev, n_rows, evsrt);
		for (i = 0; i < n_rows; ++i) {
			printf("%s", names[i]);
			// One column per sorted component: the X entry scaled by its ev value.
			for (j = 0; j < n_rows; ++j)
				printf("\t%.6f", X[i*n_rows + evsrt[j].i] * evsrt[j].ev);
			putchar('\n');
			free(names[i]);
		}
		free(ev); free(evsrt);
		free(X); free(names);
	}
	
	ks_destroy(ks);
	gzclose(fp);
	return 0;
}
// Feeds one set of images to the tracker and updates the keyframe bookkeeping.
//
// Steps, in order: bump the frame counter, derive a motion guess from the
// previous relative pose, add the images to the tracker, optionally optimise
// and prune tracks, refresh the current pose, decide whether this frame is a
// keyframe, and finally log a summary to std::cerr.
//
// When CHECK_NANS is defined, the INVALID / OVERFLOW / DIV_ZERO floating-point
// exceptions are unmasked for the duration of the call and masked again on exit.
void ProcessImage(std::vector<cv::Mat>& images) {
#ifdef CHECK_NANS
  const unsigned int trapped_fp_bits =
      _MM_MASK_INVALID | _MM_MASK_OVERFLOW | _MM_MASK_DIV_ZERO;
  _MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() & ~trapped_fp_bits);
#endif

  ++frame_count;

  // Relative motion since the previous frame; must be taken before prev_t_ba
  // is overwritten below.
  prev_delta_t_ba = tracker.t_ba() * prev_t_ba.inverse();

  if (!is_prev_keyframe) {
    prev_t_ba = tracker.t_ba();
  } else {
    // The previous frame was a keyframe: restart from the identity transform
    // and append a new pose (plus its GL axis) to the pose list.
    prev_t_ba = Sophus::SE3d();

    std::shared_ptr<sdtrack::TrackerPose> new_pose(new sdtrack::TrackerPose);
    if (!poses.empty()) {
      new_pose->t_wp = poses.back()->t_wp * last_t_ba.inverse();
    }
    poses.push_back(new_pose);

    axes.push_back(
        std::unique_ptr<SceneGraph::GLAxis>(new SceneGraph::GLAxis(0.05)));
    gui_vars.scene_graph.AddChild(axes.back().get());
  }

  // Initial guess for the tracker; an exactly-zero translation is replaced by
  // a small offset along z once more than one pose exists.
  Sophus::SE3d guess;
  guess = prev_delta_t_ba * prev_t_ba;
  if (poses.size() > 1 && guess.translation() == Eigen::Vector3d(0, 0, 0)) {
    guess.translation() = Eigen::Vector3d(0, 0, 0.01);
  }

  tracker.AddImage(images, guess);
  tracker.EvaluateTrackResiduals(0, tracker.GetImagePyramid(),
                                 tracker.GetCurrentTracks());

  if (!is_manual_mode) {
    tracker.OptimizeTracks(-1, optimize_landmarks, optimize_pose);
    tracker.PruneTracks();
  }
  // Pull the tracker's result into the current pose.
  UpdateCurrentPose();

  if (!do_keyframing) {
    // Keyframing disabled: every frame is registered as a keyframe.
    tracker.AddKeyframe();
  } else {
    const double track_ratio =
        static_cast<double>(tracker.num_successful_tracks()) /
        static_cast<double>(keyframe_tracks);
    const double total_trans = tracker.t_ba().translation().norm();
    const double total_rot = tracker.t_ba().so3().log().norm();

    std::cerr << "\tRatio: " << track_ratio << " trans: " << total_trans
              << " rot: " << total_rot << std::endl;

    // The keyframe flag is only re-evaluated once a keyframe track count
    // exists; otherwise it keeps its previous value.
    if (keyframe_tracks != 0) {
      is_keyframe = track_ratio < 0.8 || total_trans > 0.2 || total_rot > 0.1;
    }

    // Refresh the relative motion with the post-optimisation tracker pose.
    prev_delta_t_ba = tracker.t_ba() * prev_t_ba.inverse();

    if (is_keyframe) {
      tracker.AddKeyframe();
    }
    is_prev_keyframe = is_keyframe;
  }

  std::cerr << "Num successful : " << tracker.num_successful_tracks()
            << " keyframe tracks: " << keyframe_tracks << std::endl;

  if (!is_manual_mode) {
    BaAndStartNewLandmarks();
  }

  if (!is_keyframe) {
    std::cerr << "NOT KEYFRAME." << std::endl;
  } else {
    std::cerr << "KEYFRAME." << std::endl;
    keyframe_tracks = tracker.GetCurrentTracks().size();
    std::cerr << "New keyframe tracks: " << keyframe_tracks << std::endl;
  }

  current_tracks = &tracker.GetCurrentTracks();

#ifdef CHECK_NANS
  // Mask the exceptions again before returning to the caller.
  _MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() | trapped_fp_bits);
#endif

  std::cerr << "FRAME : " << frame_count << " KEYFRAME: " << poses.size()
            << std::endl;
}
Example #14
0
/*
 * Masks (i.e. disables trapping of) the INVALID, DIV_ZERO, OVERFLOW and
 * UNDERFLOW SSE floating-point exceptions. All other bits of the current
 * exception mask are left as they are.
 */
void disable_fpexcept(void)
{
	const unsigned int current_mask = _MM_GET_EXCEPTION_MASK();

	_MM_SET_EXCEPTION_MASK(current_mask
	                       | _MM_MASK_INVALID
	                       | _MM_MASK_DIV_ZERO
	                       | _MM_MASK_OVERFLOW
	                       | _MM_MASK_UNDERFLOW);
}
int main(int argc, char *argv[])
{
#ifdef ENABLE_INTEL_FLOATING_POINT_EXCEPTIONS
  cout << "NOTE: enabling floating point exceptions for divide by zero.\n";
  _MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() & ~_MM_MASK_INVALID);
#endif

  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
  int rank = Teuchos::GlobalMPISession::getRank();

  Teuchos::CommandLineProcessor cmdp(false,true); // false: don't throw exceptions; true: do return errors for unrecognized options

  bool useCondensedSolve = false; // condensed solve not yet compatible with minimum rule meshes

  int numGridPoints = 32; // in x,y -- idea is to keep the overall order of approximation constant
  int k = 4; // poly order for u
  double theta = 0.5;
  int numTimeSteps = 2000;
  int numCells = -1; // in x, y (-1 so we can set a default if unset from the command line.)
  int numFrames = 50;
  int delta_k = 2;   // test space enrichment: should be 2 for 2D
  bool useMumpsIfAvailable  = true;
  bool convertSolutionsToVTK = false; // when true assumes we've already run with precisely the same options, except without VTK support (so we have a bunch of .soln files)
  bool usePeriodicBCs = false;
  bool useConstantConvection = false;

  cmdp.setOption("polyOrder",&k,"polynomial order for field variable u");
  cmdp.setOption("delta_k", &delta_k, "test space polynomial order enrichment");

  cmdp.setOption("numCells",&numCells,"number of cells in x and y directions");
  cmdp.setOption("theta",&theta,"theta weight for time-stepping");
  cmdp.setOption("numTimeSteps",&numTimeSteps,"number of time steps");
  cmdp.setOption("numFrames",&numFrames,"number of frames for export");

  cmdp.setOption("usePeriodicBCs", "useDirichletBCs", &usePeriodicBCs);
  cmdp.setOption("useConstantConvection", "useVariableConvection", &useConstantConvection);

  cmdp.setOption("useCondensedSolve", "useUncondensedSolve", &useCondensedSolve, "use static condensation to reduce the size of the global solve");
  cmdp.setOption("useMumps", "useKLU", &useMumpsIfAvailable, "use MUMPS (if available)");
  cmdp.setOption("convertPreComputedSolutionsToVTK", "computeSolutions", &convertSolutionsToVTK);

  if (cmdp.parse(argc,argv) != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL)
  {
#ifdef HAVE_MPI
    MPI_Finalize();
#endif
    return -1;
  }

  bool saveSolutionFiles = true;

  if (numCells==-1) numCells = numGridPoints / k;

  if (rank==0)
  {
    cout << "solving on " << numCells << " x " << numCells << " mesh " << "of order " << k << ".\n";
  }

  set<int> timeStepsToExport;
  timeStepsToExport.insert(numTimeSteps);

  int timeStepsPerFrame = numTimeSteps / (numFrames - 1);
  if (timeStepsPerFrame==0) timeStepsPerFrame = 1;
  for (int n=0; n<numTimeSteps; n += timeStepsPerFrame)
  {
    timeStepsToExport.insert(n);
  }

  int H1Order = k + 1;

  const static double PI  = 3.141592653589793238462;

  double dt = 2 * PI / numTimeSteps;

  VarFactory varFactory;
  // traces:
  VarPtr qHat = varFactory.fluxVar("\\widehat{q}");

  // fields:
  VarPtr u = varFactory.fieldVar("u", L2);

  // test functions:
  VarPtr v = varFactory.testVar("v", HGRAD);

  FunctionPtr x = Function::xn(1);
  FunctionPtr y = Function::yn(1);

  FunctionPtr c;
  if (useConstantConvection)
  {
    c = Function::vectorize(Function::constant(0.5), Function::constant(0.5));
  }
  else
  {
    c = Function::vectorize(y-0.5, 0.5-x);
  }
//  FunctionPtr c = Function::vectorize(y, x);
  FunctionPtr n = Function::normal();

  BFPtr bf = Teuchos::rcp( new BF(varFactory) );

  bf->addTerm(u / dt, v);
  bf->addTerm(- theta * u, c * v->grad());
//  bf->addTerm(theta * u_hat, (c * n) * v);
  bf->addTerm(qHat, v);

  double width = 2.0, height = 2.0;
  int horizontalCells = numCells, verticalCells = numCells;
  double x0 = -0.5;
  double y0 = -0.5;

  if (usePeriodicBCs)
  {
    x0 = 0.0;
    y0 = 0.0;
    width = 1.0;
    height = 1.0;
  }

  BCPtr bc = BC::bc();

  SpatialFilterPtr inflowFilter  = Teuchos::rcp( new InflowFilterForClockwisePlanarRotation (x0,x0+width,y0,y0+height,0.5,0.5));

  vector< PeriodicBCPtr > periodicBCs;
  if (! usePeriodicBCs)
  {
    //  bc->addDirichlet(u_hat, SpatialFilter::allSpace(), Function::zero());
    bc->addDirichlet(qHat, inflowFilter, Function::zero()); // zero BCs enforced at the inflow boundary.
  }
  else
  {
    periodicBCs.push_back(PeriodicBC::xIdentification(x0, x0+width));
    periodicBCs.push_back(PeriodicBC::yIdentification(y0, y0+height));
  }

  MeshPtr mesh = MeshFactory::quadMeshMinRule(bf, H1Order, delta_k, width, height,
                 horizontalCells, verticalCells, false, x0, y0, periodicBCs);

  FunctionPtr u0 = Teuchos::rcp( new Cone_U0(0.0, 0.25, 0.1, 1.0, usePeriodicBCs) );

  RHSPtr initialRHS = RHS::rhs();
  initialRHS->addTerm(u0 / dt * v);
  initialRHS->addTerm((1-theta) * u0 * c * v->grad());

  IPPtr ip;
//  ip = Teuchos::rcp( new IP );
//  ip->addTerm(v);
//  ip->addTerm(c * v->grad());
  ip = bf->graphNorm();

  // create two Solution objects; we'll switch between these for time steps
  SolutionPtr soln0 = Solution::solution(mesh, bc, initialRHS, ip);
  soln0->setCubatureEnrichmentDegree(5);
  FunctionPtr u_soln0 = Function::solution(u, soln0);
  FunctionPtr qHat_soln0 = Function::solution(qHat, soln0);

  RHSPtr rhs1 = RHS::rhs();
  rhs1->addTerm(u_soln0 / dt * v);
  rhs1->addTerm((1-theta) * u_soln0 * c * v->grad());

  SolutionPtr soln1 = Solution::solution(mesh, bc, rhs1, ip);
  soln1->setCubatureEnrichmentDegree(5);
  FunctionPtr u_soln1 = Function::solution(u, soln1);
  FunctionPtr qHat_soln1 = Function::solution(qHat, soln1);

  RHSPtr rhs2 = RHS::rhs(); // after the first solve on soln0, we'll swap out initialRHS for rhs2
  rhs2->addTerm(u_soln1 / dt * v);
  rhs2->addTerm((1-theta) * u_soln1 * c * v->grad());

  Teuchos::RCP<Solver> solver = Teuchos::rcp( new KluSolver );

#ifdef HAVE_AMESOS_MUMPS
  if (useMumpsIfAvailable) solver = Teuchos::rcp( new MumpsSolver );
#endif

//  double energyErrorSum = 0;

  ostringstream filePrefix;
  filePrefix << "convectingCone_k" << k << "_t";
  int frameNumber = 0;

#ifdef USE_HDF5
  ostringstream dir_name;
  dir_name << "convectingCone_k" << k;
  HDF5Exporter exporter(mesh,dir_name.str());
#endif

#ifdef USE_VTK
  VTKExporter soln0Exporter(soln0,mesh,varFactory);
  VTKExporter soln1Exporter(soln1,mesh,varFactory);
#endif

  if (convertSolutionsToVTK)
  {
#ifdef USE_VTK
    if (rank==0)
    {
      cout << "Converting .soln files to VTK.\n";
      for (int frameNumber=0; frameNumber<=numFrames; frameNumber++)
      {
        ostringstream filename;
        filename << filePrefix.str() << frameNumber << ".soln";
        soln0->readFromFile(filename.str());
        filename.str("");
        filename << filePrefix.str() << frameNumber;
        soln0Exporter.exportFields(filename.str());
      }
    }
#else
    if (rank==0) cout << "Driver was built without USE_VTK defined.  This must be defined to convert solution files to VTK files.\n";
#endif
    exit(0);
  }

  if (timeStepsToExport.find(0) != timeStepsToExport.end())
  {
    map<int,FunctionPtr> solnMap;
    solnMap[u->ID()] = u0; // project field variables
    if (rank==0) cout << "About to project initial solution onto mesh.\n";
    soln0->projectOntoMesh(solnMap);
    if (rank==0) cout << "...projected initial solution onto mesh.\n";
    ostringstream filename;
    filename << filePrefix.str() << frameNumber++;
    if (rank==0) cout << "About to export initial solution.\n";
#ifdef USE_VTK
    if (rank==0) soln0Exporter.exportFields(filename.str());
#endif
#ifdef USE_HDF5
    exporter.exportSolution(soln0, varFactory,0);
#endif
    if (saveSolutionFiles)
    {
      if (rank==0)
      {
        filename << ".soln";
        soln0->writeToFile(filename.str());
        cout << endl << "wrote " << filename.str() << endl;
      }
    }
    if (rank==0) cout << "...exported initial solution.\n";
  }

  if (rank==0) cout << "About to solve initial time step.\n";
  // first time step:
  soln0->setReportTimingResults(true); // added to gain insight into why MPI blocks in some cases on the server...
  if (useCondensedSolve) soln0->condensedSolve(solver);
  else soln0->solve(solver);
  soln0->setReportTimingResults(false);
//  energyErrorSum += soln0->energyErrorTotal();
  soln0->setRHS(rhs2);
  if (rank==0) cout << "Solved initial time step.\n";

  if (timeStepsToExport.find(1) != timeStepsToExport.end())
  {
    ostringstream filename;
    filename << filePrefix.str() << frameNumber++;
#ifdef USE_VTK
    if (rank==0) soln0Exporter.exportFields(filename.str());
#endif
#ifdef USE_HDF5
    exporter.exportSolution(soln0, varFactory);
#endif
    if (saveSolutionFiles)
    {
      if (rank==0)
      {
        filename << ".soln";
        soln0->writeToFile(filename.str());
        cout << endl << "wrote " << filename.str() << endl;
      }
    }
  }

  bool reportTimings = false;

  for (int n=1; n<numTimeSteps; n++)
  {
    bool odd = (n%2)==1;
    SolutionPtr soln_n = odd ? soln1 : soln0;
    if (useCondensedSolve) soln_n->solve(solver);
    else soln_n->solve(solver);
    if (reportTimings)
    {
      if (rank==0) cout << "time step " << n << ", timing report:\n";
      soln_n->reportTimings();
    }
    if (rank==0)
    {
      cout << "\x1B[2K"; // Erase the entire current line.
      cout << "\x1B[0E"; // Move to the beginning of the current line.
      cout << "Solved time step: " << n;
      flush(cout);
    }
    if (timeStepsToExport.find(n+1)!=timeStepsToExport.end())
    {
      ostringstream filename;
      filename << filePrefix.str() << frameNumber++;
#ifdef USE_VTK
      if (rank==0)
      {
        if (odd)
        {
          soln1Exporter.exportFields(filename.str());
        }
        else
        {
          soln0Exporter.exportFields(filename.str());
        }
      }
#endif
#ifdef USE_HDF5
      double t = n * dt;
      if (odd)
      {
        exporter.exportSolution(soln1, varFactory, t);
      }
      else
      {
        exporter.exportSolution(soln0, varFactory, t);
      }
#endif
      if (saveSolutionFiles)
      {
        if (rank==0)
        {
          filename << ".soln";
          if (odd)
          {
            soln1->writeToFile(filename.str());
          }
          else
          {
            soln0->writeToFile(filename.str());
          }
          cout << endl << "wrote " << filename.str() << endl;
        }
      }
    }
//    energyErrorSum += soln_n->energyErrorTotal();
  }

//  if (rank==0) cout << "energy error, sum over all time steps: " << energyErrorSum << endl;

  return 0;
}
int main(int argc, char *argv[])
{
#ifdef ENABLE_INTEL_FLOATING_POINT_EXCEPTIONS
  cout << "NOTE: enabling floating point exceptions for divide by zero.\n";
  _MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() & ~_MM_MASK_INVALID);
#endif

  Teuchos::GlobalMPISession mpiSession(&argc, &argv);
  int rank = Teuchos::GlobalMPISession::getRank();

  Teuchos::CommandLineProcessor cmdp(false,true); // false: don't throw exceptions; true: do return errors for unrecognized options

  const static double PI  = 3.141592653589793238462;

  bool useCondensedSolve = true; // condensed solve not yet compatible with minimum rule meshes

  int k = 2; // poly order for u in every direction, including temporal
  int numCells = 32; // in x, y
  int numTimeCells = 1;
  int numTimeSlabs = -1;
  int numFrames = 201;
  int delta_k = 3;   // test space enrichment: should be 3 for 3D
  int maxRefinements = 0; // maximum # of refinements on each time slab
  bool useMumpsIfAvailable  = true;
  bool useConstantConvection = false;
  double refinementTolerance = 0.1;

  int checkPointFrequency = 50; // output solution and mesh every 50 time slabs

  int previousSolutionTimeSlabNumber = -1;
  string previousSolutionFile = "";
  string previousMeshFile = "";

  cmdp.setOption("polyOrder",&k,"polynomial order for field variable u");
  cmdp.setOption("delta_k", &delta_k, "test space polynomial order enrichment");

  cmdp.setOption("numCells",&numCells,"number of cells in x and y directions");
  cmdp.setOption("numTimeCells",&numTimeCells,"number of time axis cells");
  cmdp.setOption("numTimeSlabs",&numTimeSlabs,"number of time slabs");
  cmdp.setOption("numFrames",&numFrames,"number of frames for export");

  cmdp.setOption("useConstantConvection", "useVariableConvection", &useConstantConvection);

  cmdp.setOption("useCondensedSolve", "useUncondensedSolve", &useCondensedSolve, "use static condensation to reduce the size of the global solve");
  cmdp.setOption("useMumps", "useKLU", &useMumpsIfAvailable, "use MUMPS (if available)");

  cmdp.setOption("refinementTolerance", &refinementTolerance, "relative error beyond which to stop refining");
  cmdp.setOption("maxRefinements", &maxRefinements, "maximum # of refinements on each time slab");

  cmdp.setOption("previousSlabNumber", &previousSolutionTimeSlabNumber, "time slab number of previous solution");
  cmdp.setOption("previousSolution", &previousSolutionFile, "file with previous solution");
  cmdp.setOption("previousMesh", &previousMeshFile, "file with previous mesh");

  if (cmdp.parse(argc,argv) != Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL)
  {
#ifdef HAVE_MPI
    MPI_Finalize();
#endif
    return -1;
  }

  int H1Order = k + 1;

  VarFactory varFactory;
  // traces:
  VarPtr qHat = varFactory.fluxVar("\\widehat{q}");

  // fields:
  VarPtr u = varFactory.fieldVar("u", L2);

  // test functions:
  VarPtr v = varFactory.testVar("v", HGRAD);

  FunctionPtr x = Function::xn(1);
  FunctionPtr y = Function::yn(1);

  FunctionPtr c;
  if (useConstantConvection)
  {
    c = Function::vectorize(Function::constant(0.5), Function::constant(0.5), Function::constant(1.0));
  }
  else
  {
    c = Function::vectorize(y-0.5, 0.5-x, Function::constant(1.0));
  }
  FunctionPtr n = Function::normal();

  BFPtr bf = Teuchos::rcp( new BF(varFactory) );

  bf->addTerm( u, c * v->grad());
  bf->addTerm(qHat, v);

  double width = 2.0, height = 2.0;
  int horizontalCells = numCells, verticalCells = numCells;
  int depthCells = numTimeCells;
  double x0 = -0.5;
  double y0 = -0.5;
  double t0 = 0;

  double totalTime = 2.0 * PI;

  vector<double> frameTimes;
  for (int i=0; i<numFrames; i++)
  {
    frameTimes.push_back((totalTime*i) / (numFrames-1));
  }

  if (numTimeSlabs==-1)
  {
    // want the number of grid points in temporal direction to be about 2000.  The temporal length is 2 * PI
    numTimeSlabs = (int) 2000 / k;
  }
  double timeLengthPerSlab = totalTime / numTimeSlabs;

  if (rank==0)
  {
    cout << "solving on " << numCells << " x " << numCells << " x " << numTimeCells << " mesh " << "of order " << k << ".\n";
    cout << "numTimeSlabs: " << numTimeSlabs << endl;
  }

  SpatialFilterPtr inflowFilter  = Teuchos::rcp( new InflowFilterForClockwisePlanarRotation (x0,x0+width,y0,y0+height,0.5,0.5));

  vector<double> dimensions;
  dimensions.push_back(width);
  dimensions.push_back(height);
  dimensions.push_back(timeLengthPerSlab);

  vector<int> elementCounts(3);
  elementCounts[0] = horizontalCells;
  elementCounts[1] = verticalCells;
  elementCounts[2] = depthCells;

  vector<double> origin(3);
  origin[0] = x0;
  origin[1] = y0;
  origin[2] = t0;

  Teuchos::RCP<Solver> solver = Teuchos::rcp( new KluSolver );

#ifdef HAVE_AMESOS_MUMPS
  if (useMumpsIfAvailable) solver = Teuchos::rcp( new MumpsSolver );
#endif

//  double errorPercentage = 0.5; // for mesh refinements: ask to refine elements that account for 80% of the error in each step
//  Teuchos::RCP<RefinementStrategy> refinementStrategy;
//  refinementStrategy = Teuchos::rcp( new ErrorPercentageRefinementStrategy( soln, errorPercentage ));

  if (maxRefinements != 0)
  {
    cout << "Warning: maxRefinements is not 0, but the slice exporter implicitly assumes there won't be any refinements.\n";
  }

  MeshPtr mesh;

  MeshPtr prevMesh;
  SolutionPtr prevSoln;

  mesh = MeshFactory::rectilinearMesh(bf, dimensions, elementCounts, H1Order, delta_k, origin);

  if (rank==0) cout << "Initial mesh has " << mesh->getTopology()->activeCellCount() << " active (leaf) cells " << "and " << mesh->globalDofCount() << " degrees of freedom.\n";

  FunctionPtr sideParity = Function::sideParity();

  int lastFrameOutputted = -1;

  SolutionPtr soln;

  IPPtr ip;
  ip = bf->graphNorm();

  FunctionPtr u0 = Teuchos::rcp( new Cone_U0(0.0, 0.25, 0.1, 1.0, false) );

  BCPtr bc = BC::bc();
  bc->addDirichlet(qHat, inflowFilter, Function::zero()); // zero BCs enforced at the inflow boundary.
  bc->addDirichlet(qHat, SpatialFilter::matchingZ(t0), u0);

  MeshPtr initialMesh = mesh;

  int startingSlabNumber;
  if (previousSolutionTimeSlabNumber != -1)
  {
    startingSlabNumber = previousSolutionTimeSlabNumber + 1;

    if (rank==0) cout << "Loading mesh from " << previousMeshFile << endl;

    prevMesh = MeshFactory::loadFromHDF5(bf, previousMeshFile);
    prevSoln = Solution::solution(mesh, bc, RHS::rhs(), ip); // include BC and IP objects for sake of condensed dof interpreter setup...
    prevSoln->setUseCondensedSolve(useCondensedSolve);

    if (rank==0) cout << "Loading solution from " << previousSolutionFile << endl;
    prevSoln->loadFromHDF5(previousSolutionFile);

    double tn = (previousSolutionTimeSlabNumber+1) * timeLengthPerSlab;
    origin[2] = tn;
    mesh = MeshFactory::rectilinearMesh(bf, dimensions, elementCounts, H1Order, delta_k, origin);

    FunctionPtr q_prev = Function::solution(qHat, prevSoln);
    FunctionPtr q_transfer = Teuchos::rcp( new MeshTransferFunction(-q_prev, prevMesh, mesh, tn) ); // negate because the normals go in opposite directions

    bc = BC::bc();
    bc->addDirichlet(qHat, inflowFilter, Function::zero()); // zero BCs enforced at the inflow boundary.
    bc->addDirichlet(qHat, SpatialFilter::matchingZ(tn), q_transfer);

    double t_slab_final = (previousSolutionTimeSlabNumber+1) * timeLengthPerSlab;
    int frameOrdinal = 0;

    while (frameTimes[frameOrdinal] < t_slab_final)
    {
      lastFrameOutputted = frameOrdinal++;
    }
  }
  else
  {
    startingSlabNumber = 0;
  }


#ifdef HAVE_EPETRAEXT_HDF5
  ostringstream dir_name;
  dir_name << "spacetime_slice_convectingCone_k" << k << "_startSlab" << startingSlabNumber;
  map<GlobalIndexType,GlobalIndexType> cellMap;
  MeshPtr meshSlice = MeshTools::timeSliceMesh(initialMesh, 0, cellMap, H1Order);
  HDF5Exporter sliceExporter(meshSlice,dir_name.str());
#endif

  soln = Solution::solution(mesh, bc, RHS::rhs(), ip);
  soln->setUseCondensedSolve(useCondensedSolve);

  for(int timeSlab = startingSlabNumber; timeSlab<numTimeSlabs; timeSlab++)
  {
    double energyThreshold = 0.2; // for mesh refinements: ask to refine elements that account for 80% of the error in each step
    Teuchos::RCP<RefinementStrategy> refinementStrategy;
    refinementStrategy = Teuchos::rcp( new RefinementStrategy( soln, energyThreshold ));

    FunctionPtr u_spacetime = Function::solution(u, soln);

    double relativeEnergyError;
    int refNumber = 0;

//    {
//      // DEBUGGING: just to try running the time slicing:
//      double t_slab_final = (timeStep+1) * timeLengthPerSlab;
//      int frameOrdinal = lastFrameOutputted + 1;
//      while (frameTimes[frameOrdinal] < t_slab_final) {
//        FunctionPtr u_spacetime = Function::solution(u, soln);
//        ostringstream dir_name;
//        dir_name << "spacetime_slice_convectingCone_k" << k;
//        MeshTools::timeSliceExport(dir_name.str(), mesh, u_spacetime, frameTimes[frameOrdinal], "u_slice");
//
//        cout << "Exported frame " << frameOrdinal << ", t=" << frameTimes[frameOrdinal] << endl;
//        frameOrdinal++;
//      }
//    }

    do
    {
      soln->solve(solver);
      soln->reportTimings();

#ifdef HAVE_EPETRAEXT_HDF5
      ostringstream dir_name;
      dir_name << "spacetime_convectingCone_k" << k << "_t" << timeSlab;
      HDF5Exporter exporter(soln->mesh(),dir_name.str());
      exporter.exportSolution(soln, varFactory);

      if (rank==0) cout << "Exported HDF solution for time slab to directory " << dir_name.str() << endl;
//      string u_name = "u_spacetime";
//      exporter.exportFunction(u_spacetime, u_name);

      ostringstream file_name;
      file_name << dir_name.str();

      bool saveSolutionAndMeshForThisSlab = ((timeSlab + 1) % checkPointFrequency == 0); // +1 so that first output is nth, not first
      if (saveSolutionAndMeshForThisSlab)
      {
        dir_name << ".soln";
        soln->saveToHDF5(dir_name.str());
        if (rank==0) cout << endl << "wrote " << dir_name.str() << endl;

        file_name << ".mesh";
        soln->mesh()->saveToHDF5(file_name.str());
      }
#endif
      FunctionPtr u_soln = Function::solution(u, soln);

      double solnNorm = u_soln->l2norm(mesh);

      double energyError = soln->energyErrorTotal();
      relativeEnergyError = energyError / solnNorm;

      if (rank==0)
      {
        cout << "Relative energy error for refinement " << refNumber++ << ": " << relativeEnergyError << endl;
      }

      if ((relativeEnergyError > refinementTolerance) && (refNumber < maxRefinements))
      {
        refinementStrategy->refine();
        if (rank==0)
        {
          cout << "After refinement, mesh has " << mesh->getTopology()->activeCellCount() << " active (leaf) cells " << "and " << mesh->globalDofCount() << " degrees of freedom.\n";
        }
      }

    }
    while ((relativeEnergyError > refinementTolerance) && (refNumber < maxRefinements));

    double t_slab_final = (timeSlab+1) * timeLengthPerSlab;
    int frameOrdinal = lastFrameOutputted + 1;
    vector<double> timesForSlab;
    while (frameTimes[frameOrdinal] < t_slab_final)
    {
      double t = frameTimes[frameOrdinal];
      if (rank==0) cout << "exporting t=" << t << " on slab " << timeSlab << endl;
      FunctionPtr sliceFunction = MeshTools::timeSliceFunction(mesh, cellMap, u_spacetime, t);
      sliceExporter.exportFunction(sliceFunction, "u_slice", t);
      lastFrameOutputted = frameOrdinal++;
    }

    // set up next mesh/solution:
    FunctionPtr q_prev = Function::solution(qHat, soln);

//    cout << "Error in setup of q_prev: simple solution doesn't know about the map from the previous time slab to the current one. (TODO: fix this.)\n";

    double tn = (timeSlab+1) * timeLengthPerSlab;
    origin[2] = tn;
    mesh = MeshFactory::rectilinearMesh(bf, dimensions, elementCounts, H1Order, delta_k, origin);

    FunctionPtr q_transfer = Teuchos::rcp( new MeshTransferFunction(-q_prev, soln->mesh(), mesh, tn) ); // negate because the normals go in opposite directions

    bc = BC::bc();
    bc->addDirichlet(qHat, inflowFilter, Function::zero()); // zero BCs enforced at the inflow boundary.
    bc->addDirichlet(qHat, SpatialFilter::matchingZ(tn), q_transfer);

    // IMPORTANT: now that we are ready to step to next soln, nullify BC.  If we do not do this, then we have an RCP chain
    //            that extends back to the first time slab, effectively a memory leak.
    soln->setBC(BC::bc());

    soln = Solution::solution(mesh, bc, RHS::rhs(), ip);
    soln->setUseCondensedSolve(useCondensedSolve);
  }

  return 0;
}