void axpby(const AV& a,
           const Kokkos::View< XT,XL,XD,XM,Kokkos::Impl::ViewMPVectorContiguous >& x,
           const BV& b,
           const Kokkos::View< YT,YL,YD,YM,Kokkos::Impl::ViewMPVectorContiguous >& y)
{
  // axpby overload for views specialized on ViewMPVectorContiguous.
  //
  // Both coefficients must be constant according to Sacado::is_constant;
  // otherwise an error is raised.  The views are then converted to their
  // flat_array_type, the coefficients are evaluated with Sacado::Value, and
  // the work is handed to whichever axpby overload accepts those arguments.
  using Spec  = Kokkos::Impl::ViewMPVectorContiguous;
  using XFlat = typename Kokkos::View< XT,XL,XD,XM,Spec >::flat_array_type;
  using YFlat = typename Kokkos::View< YT,YL,YD,YM,Spec >::flat_array_type;

  const bool coefficients_constant =
    Sacado::is_constant(a) && Sacado::is_constant(b);
  if (!coefficients_constant) {
    Kokkos::Impl::raise_error("axpby not implemented for non-constant a or b");
  }

  // Flat aliases of the two views.
  XFlat flat_x = x;
  YFlat flat_y = y;

  // Evaluated coefficient values.
  auto a_value = Sacado::Value<AV>::eval(a);
  auto b_value = Sacado::Value<BV>::eval(b);

  axpby( a_value, flat_x, b_value, flat_y );
}
typename std::enable_if< Kokkos::is_view_mp_vector< Kokkos::View<XD,XP...> >::value &&
                         Kokkos::is_view_mp_vector< Kokkos::View<YD,YP...> >::value >::type
axpby(const AV& a,
      const Kokkos::View<XD,XP...>& x,
      const BV& b,
      const Kokkos::View<YD,YP...>& y)
{
  // axpby overload enabled only when both view types satisfy
  // Kokkos::is_view_mp_vector.
  //
  // Both coefficients must be constant according to Sacado::is_constant;
  // otherwise an error is raised.  The views are then converted to the type
  // named by Kokkos::FlatArrayType, the coefficients are evaluated with
  // Sacado::Value, and the work is handed to whichever axpby overload accepts
  // those arguments.
  using XFlat = typename Kokkos::FlatArrayType< Kokkos::View<XD,XP...> >::type;
  using YFlat = typename Kokkos::FlatArrayType< Kokkos::View<YD,YP...> >::type;

  const bool coefficients_constant =
    Sacado::is_constant(a) && Sacado::is_constant(b);
  if (!coefficients_constant) {
    Kokkos::Impl::raise_error("axpby not implemented for non-constant a or b");
  }

  // Flat aliases of the two views.
  XFlat flat_x = x;
  YFlat flat_y = y;

  // Evaluated coefficient values.
  auto a_value = Sacado::Value<AV>::eval(a);
  auto b_value = Sacado::Value<BV>::eval(b);

  axpby( a_value, flat_x, b_value, flat_y );
}
//============================================================================= void FluidNavierStokes( const int order, ///< order of the time discretization const mesh_t* mesh, ///< mesh structure const bc_t* bc, ///< boundary conditions #ifdef VERSION_Z mesh_t* mesh_o, ///< mesh structure of outer domain double* Vel_o[4], ///< fluid velocity of outer domain #endif const fluid_t* fluid, ///< fluid parameters const particle_t particle[], ///< particles const double tau[3], ///< time discretization coefficients const double dt, ///< time step FluidVar_t* FluidVar ) ///< fluid variables { double *VelTilde1[4] = {NULL,NULL,NULL,NULL}, // Convected velocity from level n *VelTilde2[4] = {NULL,NULL,NULL,NULL}, // Convected velocity from level n-1 FrameVel[4] = { 0.,0.,0.,0. }; // frame velocity for ( int dir = 1 ; dir <= 3 ; dir++ ) { AllocVdouble(mesh->NbOfNodes, VelTilde1[dir]); AllocVdouble(mesh->NbOfNodes, VelTilde2[dir]); } #ifdef VERSION_Z if ( UsingMicroGrid() ) { // find nodes of inner domain that are outside outer domain FindNodesOutside(particle, mesh_o, mesh); // set those nodes velocity to 0. 
for ( int NodeId = 1 ; NodeId <= mesh->NbOfNodes ; NodeId++ ) if ( mesh->OutsideNodes[NodeId] == true ) for ( int dir = 1 ; dir <= 3 ; dir++ ) FluidVar->Vel[dir][NodeId] = 0.; } #endif //---------------------------------------------------------------------------- // Compute all the velocity rhs contributions //---------------------------------------------------------------------------- // rhs = 0 for ( int dir = 1 ; dir <= 3 ; dir++ ) scal( mesh->NbOfNodes+1, 0., FluidVar->VelRHS[dir] ); // compute frame velocity FrameVelSet( particle, fluid->FrameVelDir, FrameVel ); //---------------------------------------------------------------------------- // Pressure gradient contribution //---------------------------------------------------------------------------- // compute and substract pressure gradient to rhs ApplyOperator("gradient", &FluidVar->Pre, FluidVar->VelRHS); //---------------------------------------------------------------------------- // Convection terms contribution //---------------------------------------------------------------------------- #ifdef VERSION_Z const double *ParticlePos = (order == 1) ? 
particle[1].Pos1 : particle[1].Pos2; // const double *ParticlePos = particle[1].Pos; if ( UsingMicroGrid() ) ConvectionMicroGrid(order, mesh, dt, FrameVel, ParticlePos, mesh_o, Vel_o, FluidVar->VelOld1, FluidVar->VelOld2, VelTilde1, VelTilde2); else ConvectionMacroGrid(order, mesh, dt, FrameVel, ParticlePos, mesh_o, Vel_o, FluidVar->VelOld1, FluidVar->VelOld2, VelTilde1, VelTilde2); #else Convection(order, mesh, dt, FrameVel, FluidVar->VelOld1, FluidVar->VelOld2, VelTilde1, VelTilde2); #endif // compute the convection terms in Acc for ( int dir = 1 ; dir <= 3 ; dir++ ) axpby( mesh->NbOfNodes+1, -tau[1], VelTilde1[dir], 0., FluidVar->Acc[dir] ); if ( order == 2 ) for ( int dir = 1 ; dir <= 3 ; dir++ ) axpby( mesh->NbOfNodes+1, -tau[2], VelTilde2[dir], 1., FluidVar->Acc[dir]); // weight by mass matrix and substract them from rhs ApplyOperator("convection", FluidVar->Acc, FluidVar->VelRHS); // set convected velocity at previous timestep as initial guess for conjugate gradient // WARNING : this has to be done before we set boundary conditions, otherwise those latter could get overwritten for ( int dir = 1 ; dir <= 3 ; dir++ ) copy(mesh->NbOfNodes+1, VelTilde1[dir], FluidVar->Vel[dir]); //---------------------------------------------------------------------------- // Particle weight contribution //---------------------------------------------------------------------------- if ( UsingMicroGrid() ) GetParticleMomentumContribution(mesh, particle, fluid, FluidVar->VelRHS); //---------------------------------------------------------------------------- // Boundary conditions contribution //---------------------------------------------------------------------------- SetVelBC( mesh->NbOfNodes, bc, FluidVar->Vel ); #ifdef VERSION_Z // prescribe dirichlet bc by interpolating outer domain velocity, this has to be done after SetVelBC ! 
if ( UsingMicroGrid() ) MassConserveBC( particle[1].Pos, mesh_o, Vel_o, mesh, FluidVar->Vel ); #endif // weight with stifness matrix and substract to rhs ApplyOperator("v_bc", FluidVar->Vel, FluidVar->VelRHS); //---------------------------------------------------------------------------- // Solve for the velocity: advection-diffusion step //---------------------------------------------------------------------------- debug( "\nVelocity diffusion step\n" ); SolveOperator("v_stiffness", mesh->OutsideNodes, FluidVar->VelRHS, FluidVar->Vel); //---------------------------------------------------------------------------- // Solve for the pressure star: projection step //---------------------------------------------------------------------------- debug( "\nPressure prediction step\n" ); // initialize pressure rhs double *PreStar = NULL, // Predicted presssure *PreRHS = NULL; // Presssure RHS AllocVdouble( mesh->NbOfPressureNodes,PreStar ); AllocVdouble( mesh->NbOfPressureNodes,PreRHS ); // compute and add velocity divergence to pressure rhs ApplyOperator("divergence", FluidVar->Vel, &PreRHS); // mutliply pressure rhs by -tau_0 scal( mesh->NbOfPressureNodes+1, -tau[0], PreRHS ); // set previous step pressure as initial guess for conjugate gradient copy( mesh->NbOfFreePressureNodes+1, FluidVar->Pre, PreStar ); SolveOperator("p_stiffness", mesh->OutsideNodes, &PreRHS, &PreStar); //---------------------------------------------------------------------------- // Solve for the velocity: projection step //---------------------------------------------------------------------------- debug( "\nVelocity projection step\n" ); // rhs = 0 for ( int dir = 1 ; dir <= 3 ; dir++ ) scal( mesh->NbOfNodes+1, 0., FluidVar->VelRHS[dir] ); // compute and substract pressure star gradient to velocity rhs ApplyOperator("gradient", &PreStar, FluidVar->VelRHS); // VelTilde2 = 0, used here as temporary array, it will store the un-weigthed gradient for ( int dir = 1 ; dir <= 3 ; dir++ ) scal( 
mesh->NbOfNodes+1, 0., VelTilde2[dir] ); // compute the un-weigthed gradient SolveOperator("v_mass", mesh->OutsideNodes, FluidVar->VelRHS, VelTilde2); // Remove the non solenoidal part of the velocity, i.e. the un-weigthed gradient for ( int dir = 1 ; dir <= 3 ; dir++ ) axpby(mesh->NbOfNodes+1, 1. / tau[0], VelTilde2[dir], 1., FluidVar->Vel[dir]); //---------------------------------------------------------------------------- // Solve for the pressure: correction step //---------------------------------------------------------------------------- debug( "\nPressure correction step\n" ); // p = p + p* axpby( mesh->NbOfPressureNodes+1, 1., PreStar, 1., FluidVar->Pre ); #ifdef PRE_INCREMENTAL_ROTATIONAL // then solve for - 1/Re Div(vel) = PreMass^{-1} * PreRHS, solve in p* again // solve for PreStar for ( int i = 1 ; i <= mesh->NbOfPressureNodes ; i++ ) PreStar[i] = PreRHS[i] * FluidOperators->PreMassPrec[i]; // p = p + p* = p - 1/Re Div(vel) axpby( mesh->NbOfPressureNodes+1, 1., PreStar, 1., FluidVar->Pre ); #endif // Compute the acceleration field of the fluid, Acc = tau0 * Vel - Acc for ( int dir = 1 ; dir <= 3 ; dir++ ) axpby(mesh->NbOfNodes+1, tau[0], FluidVar->Vel[dir], -1., FluidVar->Acc[dir]); // free local arrays for ( int dir = 1 ; dir <= 3 ; dir++ ) { free(VelTilde1[dir]); free(VelTilde2[dir]); } free(PreStar); free(PreRHS); }
bool use_case_blas_driver(MPI_Comm comm, int num_threads, int num_trials, const std::string &working_directory, const std::string &mesh_filename, const std::string &mesh_type, const std::string &thread_runner, int bucket_size, bool performance_test) { bool output = !performance_test; // If running for performance measurements, turn off output if (stk::parallel_machine_rank(comm) == 0) { std::cout << " stk_mesh Use Case Blas - fill, axpby, dot, norm , begin" << std::endl ; std::cout << "Running '" << mesh_filename << "' case, num_trials = " << num_trials << std::endl; } const AlgorithmRunnerInterface* alg_runner = NULL ; if ( thread_runner.empty() || thread_runner == std::string("NonThreaded") ) { alg_runner = stk::algorithm_runner_non_thread(); } else if ( thread_runner == std::string("TPI") ) { alg_runner = stk::algorithm_runner_tpi(num_threads); } else if ( thread_runner == std::string("TBB") ) { alg_runner = stk::algorithm_runner_tbb(num_threads); } if (alg_runner != NULL) { if (stk::parallel_machine_rank(comm) == 0) std::cout << "Using " << thread_runner << " algorithm runner, num_threads = " << num_threads << std::endl; } else { std::cout << "ERROR, failed to obtain requested AlgorithmRunner '" << thread_runner << "'." 
<< std::endl; return false; } //---------------------------------- // Timing: // [0] = stk::mesh::MetaData creation // [1] = stk::mesh::BulkData creation // [2] = Initialization // [3] = fill and axpby // [4] = dot and norm2 double time_min[9] = { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 }; double time_max[9] = { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 }; double wtime = 0 ; //-------------------------------------------------------------------- reset_malloc_stats(); if ( 0 == stk::parallel_machine_rank( comm ) ) { std::cout << "stk_mesh performance use case BLAS" << std::endl << " Number Processes = " << stk::parallel_machine_size( comm ) << std::endl ; std::cout.flush(); } //-------------------------------------------------------------------- // Initialize IO system. Registers all element types and storage // types and the exodusII default database type. Ioss::Init::Initializer init_db; { wtime = stk::wall_time(); //------------------------------------------------------------------ // Declare the mesh meta data: element blocks and associated fields stk::mesh::fem::FEMMetaData meta_data( spatial_dimension ); stk::io::MeshData mesh_data; std::string filename = working_directory + mesh_filename; stk::io::create_input_mesh(mesh_type, filename, comm, meta_data, mesh_data); stk::io::define_input_fields(mesh_data, meta_data); Fields fields; use_case_14_declare_fields(fields, meta_data.get_meta_data(meta_data)); //-------------------------------- // Commit (finalize) the meta data. Is now ready to be used // in the creation and management of mesh bulk data. meta_data.commit(); //------------------------------------------------------------------ time_max[0] = stk::wall_dtime( wtime ); //------------------------------------------------------------------ // stk::mesh::BulkData bulk data conforming to the meta data. 
stk::mesh::BulkData bulk_data(meta_data.get_meta_data(meta_data) , comm, bucket_size); stk::io::populate_bulk_data(bulk_data, mesh_data); //------------------------------------------------------------------ // Create output mesh... (input filename + ".out14") if (output) { filename = working_directory + mesh_filename + ".blas"; stk::io::create_output_mesh(filename, comm, bulk_data, mesh_data); stk::io::define_output_fields(mesh_data, meta_data, true); } stk::app::use_case_14_initialize_nodal_data(bulk_data , *fields.model_coordinates , *fields.coordinates_field , *fields.velocity_field, 1.0 /*dt*/); time_max[1] = stk::wall_dtime( wtime ); //------------------------------------------------------------------ // Ready to run the algorithms: //------------------------------------------------------------------ //------------------------------------------------------------------ time_max[2] = stk::wall_dtime( wtime ); //------------------------------------------------------------------ wtime = stk::wall_time(); double dot1 = 0; for(int n=0; n<num_trials; ++n) { // // Call BLAS algs. 
// wtime = stk::wall_time(); fill( *alg_runner, bulk_data , stk::mesh::fem::FEMMetaData::NODE_RANK , *fields.velocity_field, 0.2 ); fill( *alg_runner, bulk_data , stk::mesh::fem::FEMMetaData::NODE_RANK , *fields.fint_field, 1.0 ); axpby( *alg_runner, bulk_data , stk::mesh::fem::FEMMetaData::NODE_RANK , 0.01, *fields.model_coordinates , 1.0 , *fields.coordinates_field ); axpby( *alg_runner, bulk_data , stk::mesh::fem::FEMMetaData::NODE_RANK , 0.1, *fields.coordinates_field, 1.0 , *fields.velocity_field ); time_max[3] += stk::wall_dtime( wtime ); dot1 = dot( *alg_runner, bulk_data, stk::mesh::fem::FEMMetaData::NODE_RANK , *fields.velocity_field, *fields.coordinates_field ); double dot2 = dot( *alg_runner, bulk_data, stk::mesh::fem::FEMMetaData::NODE_RANK, *fields.velocity_field, *fields.fint_field ); double norm_1 = norm2(*alg_runner, bulk_data, stk::mesh::fem::FEMMetaData::NODE_RANK, *fields.velocity_field ); double norm_2 = norm2(*alg_runner, bulk_data, stk::mesh::fem::FEMMetaData::NODE_RANK, *fields.coordinates_field ); if ( stk::parallel_machine_rank( comm ) == 0 ) { std::cout << " " << dot1 << " " << dot2 << " " << norm_1 << " " << norm_2 << std::endl; } time_max[4] += stk::wall_dtime( wtime ); if (output) { stk::io::process_output_request(mesh_data, bulk_data, n); } }//end for(..num_trials... if ( stk::parallel_machine_rank( comm ) == 0 ) { //Try to make sure the number gets printed out just the way we want it, //so we can use it as a pass/fail check for a regression test... std::cout.precision(6); std::cout.setf(std::ios_base::scientific, std::ios_base::floatfield); std::cout << "Final dot1: " << dot1 << std::endl; } //------------------------------------------------------------------ #ifdef USE_GNU_MALLOC_HOOKS if (parallel_machine_rank(comm) == 0) { double net_alloc = alloc_MB() - freed_MB(); std::cout << "Mesh creation:" << "\n Total allocated: " << alloc_MB()<<"MB in "<<alloc_blks() << " blocks." 
<< "\n Total freed: " << freed_MB() << "MB in " << freed_blks() << " blocks." << "\n Net allocated: "<<net_alloc << "MB."<<std::endl; } #endif //------------------------------------------------------------------ } time_max[8] = stk::wall_dtime( wtime ); time_min[0] = time_max[0] ; time_min[1] = time_max[1] ; time_min[2] = time_max[2] ; time_min[3] = time_max[3] ; time_min[4] = time_max[4] ; time_min[5] = time_max[5] ; time_min[6] = time_max[6] ; time_min[7] = time_max[7] ; time_min[8] = time_max[8] ; stk::all_reduce( comm , stk::ReduceMax<9>( time_max ) & stk::ReduceMin<9>( time_min ) ); time_max[3] /= num_trials ; time_max[4] /= num_trials ; time_max[5] /= num_trials ; time_max[6] /= num_trials ; time_min[3] /= num_trials ; time_min[4] /= num_trials ; time_min[5] /= num_trials ; time_min[6] /= num_trials ; // [0] = stk::mesh::MetaData creation // [1] = stk::mesh::BulkData creation // [2] = Initialization // [3] = Internal force if ( ! stk::parallel_machine_rank( comm ) ) { std::cout << "stk_mesh performance use case results:" << std::endl << " Number of trials = " << num_trials << std::endl << " Meta-data setup = " << time_min[0] << " : " << time_max[0] << " sec, min : max" << std::endl << " Bulk-data generation = " << time_min[1] << " : " << time_max[1] << " sec, min : max" << std::endl << " Initialization = " << time_min[2] << " : " << time_max[2] << " sec, min : max" << std::endl << " fill & axpby (per-trial) = " << time_min[3] << " : " << time_max[3] << " sec, min : max" << std::endl << " dot & norm2 (per-trial) = " << time_min[4] << " : " << time_max[4] << " sec, min : max" << std::endl << " Mesh destruction = " << time_min[8] << " : " << time_max[8] << " sec, min : max" << std::endl << std::endl ; } return true; }