void * msg_alloc(size_t msg_i ,size_t total_msg, size_t total_p, size_t i,size_t ri, void * ptr) { // convert the void pointer argument into a pointer to receiving buffers openfpm::vector<openfpm::vector<unsigned char>> * v = static_cast<openfpm::vector<openfpm::vector<unsigned char>> *>(ptr); /////////////////////// IGNORE THESE LINES IN VCLUSTER DOCUMENTATION //////////////////////// /////////////////////// THEY COME FROM UNIT TESTING ///////////////////////////////////////// if (create_vcluster().getProcessingUnits() <= 8) {if (totp_check) BOOST_REQUIRE_EQUAL(total_p,create_vcluster().getProcessingUnits()-1);} else {if (totp_check) BOOST_REQUIRE_EQUAL(total_p,(size_t)8);} BOOST_REQUIRE_EQUAL(msg_i, global_step); ////////////////////////////////////////////////////////////////////////// // Create the memory to receive the message // msg_i contain the size of the message to receive // i contain the processor id v->get(i).resize(msg_i); // return the pointer of the allocated memory return &(v->get(i).get(0)); }
/*! \brief constructor * * \param dom Box domain * \param n_sub number of sub-domain to create (it is approximated to the biggest 2^n number) * \param lp Local position of the particles * */ ORB(Box dom, size_t n_sub, loc_pos & lp) :v_cl(create_vcluster()),lp(lp) { typedef ORB<dim,T,loc_wg,loc_pos,Box,Tree> ORB_class; dim_div = 0; n_sub = openfpm::math::round_big_2(n_sub); size_t nsub = log2(n_sub); // number of center or mass needed cm.resize(2 << nsub); cm_cnt.resize(2 << nsub); // Every time we divide from 0 to dim-1, calculate how many cycle from 0 to dim-1 we have size_t dim_cycle = nsub / dim; // Create the root node in the graph grp.addVertex(); // unroll bisection cycle bisect_unroll<dim,ORB_class> bu(*this); for (size_t i = 0 ; i < dim_cycle ; i++) { boost::mpl::for_each< boost::mpl::range_c<int,0,dim> >(bu); // bu is recreated several time internaly } // calculate and execute the remaining cycles switch(nsub - dim_cycle * dim) { case 1: do_when_dim_gr_i<dim,1,ORB_class>::bisect_loop(bu); break; case 2: do_when_dim_gr_i<dim,2,ORB_class>::bisect_loop(bu); break; case 3: do_when_dim_gr_i<dim,3,ORB_class>::bisect_loop(bu); break; case 4: do_when_dim_gr_i<dim,4,ORB_class>::bisect_loop(bu); break; case 5: do_when_dim_gr_i<dim,5,ORB_class>::bisect_loop(bu); break; case 6: do_when_dim_gr_i<dim,6,ORB_class>::bisect_loop(bu); break; case 7: do_when_dim_gr_i<dim,7,ORB_class>::bisect_loop(bu); break; default: // To extend on higher dimension create other cases or create runtime // version of local_cm and bisect std::cerr << "Error: " << __FILE__ << ":" << __LINE__ << " ORB is not working for dimension bigger than 8"; } }
/*! \brief Create an empty Matrix * * \param N1 number of row * \param N2 number of colums * \param N1_loc number of local row * */ SparseMatrix(size_t N1, size_t N2, size_t n_row_local) :l_row(n_row_local),l_col(n_row_local) { PETSC_SAFE_CALL(MatCreate(PETSC_COMM_WORLD,&mat)); PETSC_SAFE_CALL(MatSetType(mat,MATMPIAIJ)); PETSC_SAFE_CALL(MatSetSizes(mat,n_row_local,n_row_local,N1,N2)); Vcluster & v_cl = create_vcluster(); openfpm::vector<size_t> vn_row_local; v_cl.allGather(l_row,vn_row_local); v_cl.execute(); // Calculate the starting row for this processor start_row = 0; for (size_t i = 0 ; i < v_cl.getProcessUnitID() ; i++) start_row += vn_row_local.get(i); }
/*! * * \brief Write a set of particle position and properties into HDF5 * * \tparam Pos Vector of positions type * \taparam Prp Vector of properties type * \tparam prp list of properties to output * * \param pos Vector with the positions * \param prp Vector with the properties * \param stop size of the vector to output * */ template<typename VPos, typename VPrp, int ... prp > bool write(const std::string & file, openfpm::vector<VPos> & v_pos, openfpm::vector<VPrp> & v_prp, size_t stop) { Vcluster & v_cl = create_vcluster(); // Open and HDF5 file in parallel hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); H5Pset_fapl_mpio(plist_id, v_cl.getMPIComm(), MPI_INFO_NULL); file_id = H5Fcreate(file.c_str(), H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); H5Pclose(plist_id); // Single coordinate positional vector openfpm::vector<typename VPos::coord_type> x_n; x_n.resize(stop); //for each component, fill x_n for (size_t i = 0 ; i < VPos::dims ; i++) { // for (size_t j = 0 ; j < stop ; j++) x_n.get(j) = v_pos.template get<0>(j)[i]; std::stringstream str; str << "x" << i; HDF5CreateDataSet<typename VPos::coord_type>(file_id,str.str(),x_n.getPointer(),stop*sizeof(typename VPos::coord_type)); } // Now we write the properties typedef typename to_boost_vmpl<prp ... >::type v_prp_seq; H5_prop_out<openfpm::vector<VPrp>,has_attributes<VPrp>::value> f(file_id,v_prp,stop); boost::mpl::for_each_ref<v_prp_seq>(f); H5Fclose(file_id); return true; }
/*! \brief Resize the Sparse Matrix * * \param row number for row * \param col number of colums * \param local number of row * \param local number of colums * */ void resize(size_t row, size_t col, size_t l_row, size_t l_col) { this->g_row = row; this->g_col = col; this->l_row = l_row; this->l_col = l_col; PETSC_SAFE_CALL(MatSetSizes(mat,l_row,l_col,g_row,g_col)); Vcluster & v_cl = create_vcluster(); openfpm::vector<size_t> vn_row_local; v_cl.allGather(l_row,vn_row_local); v_cl.execute(); // Calculate the starting row for this processor start_row = 0; for (size_t i = 0 ; i < v_cl.getProcessUnitID() ; i++) start_row += vn_row_local.get(i); }
void create_decomposition2x2(openfpm::vector<openfpm::vector<long unsigned int>> & box_nn_processor, openfpm::vector<SpaceBox<2,float>> & sub_domains) { Vcluster & v_cl = create_vcluster(); box_nn_processor.add(); if (v_cl.getProcessUnitID() == 0) { box_nn_processor.get(0).add(1); box_nn_processor.get(0).add(2); box_nn_processor.get(0).add(3); sub_domains.add(Box<2,float>({0.0,0.0},{0.5,0.5})); } else if (v_cl.getProcessUnitID() == 1) { box_nn_processor.get(0).add(0); box_nn_processor.get(0).add(2); box_nn_processor.get(0).add(3); sub_domains.add(Box<2,float>({0.5,0.0},{1.0,0.5})); } else if (v_cl.getProcessUnitID() == 2) { box_nn_processor.get(0).add(1); box_nn_processor.get(0).add(0); box_nn_processor.get(0).add(3); sub_domains.add(Box<2,float>({0.0,0.5},{0.5,1.0})); } else if (v_cl.getProcessUnitID() == 3) { box_nn_processor.get(0).add(1); box_nn_processor.get(0).add(2); box_nn_processor.get(0).add(0); sub_domains.add(Box<2,float>({0.5,0.5},{1.0,1.0})); } }
int main(int argc, char ** argv) { // Initialize the global VCluster openfpm_init(&argc,&argv); // Vcluster Vcluster & vcl = create_vcluster(); //! [Create CartDecomposition vtk gen] CartDecomposition<2,float> dec(vcl); // Physical domain Box<2,float> box({0.0,0.0},{1.0,1.0}); // division on each direction size_t div[2] = {20,20}; // Define ghost Ghost<2,float> g(0.01); // boundary conditions size_t bc[2] = {PERIODIC,PERIODIC}; // Decompose and write the decomposed graph dec.setParameters(div,box,bc,g); dec.decompose(); // create a ghost border dec.calculateGhostBoxes(); // Write the decomposition dec.write("CartDecomposition/out_"); //! [Create CartDecomposition] // deinitialize the library openfpm_finalize(); }
template<unsigned int ip> void test_no_send_some_peer() { Vcluster & vcl = create_vcluster(); size_t n_proc = vcl.getProcessingUnits(); // Check long communication with some peer not comunication size_t j = 4567; global_step = j; // Processor step long int ps = n_proc / (8 + 1); // send message openfpm::vector<openfpm::vector<unsigned char>> message; // recv message openfpm::vector<openfpm::vector<unsigned char>> recv_message(n_proc); openfpm::vector<size_t> prc; // only even communicate if (vcl.getProcessUnitID() % 2 == 0) { for (size_t i = 0 ; i < 8 && i < n_proc ; i++) { size_t p_id = ((i+1) * ps + vcl.getProcessUnitID()) % n_proc; if (p_id != vcl.getProcessUnitID()) { prc.add(p_id); message.add(); std::ostringstream msg; msg << "Hello from " << vcl.getProcessUnitID() << " to " << p_id; std::string str(msg.str()); message.last().resize(j); memset(message.last().getPointer(),0,j); std::copy(str.c_str(),&(str.c_str())[msg.str().size()],&(message.last().get(0))); } } } recv_message.resize(n_proc); #ifdef VERBOSE_TEST timer t; t.start(); #endif commFunc_null_odd<ip>(vcl,prc,message,msg_alloc,&recv_message); #ifdef VERBOSE_TEST t.stop(); double clk = t.getwct(); double clk_max = clk; size_t size_send_recv = 2 * j * (prc.size()); vcl.sum(size_send_recv); vcl.max(clk_max); vcl.execute(); if (vcl.getProcessUnitID() == 0) std::cout << "(Long pattern: " << method<ip>() << ")Buffer size: " << j << " Bandwidth (Average): " << size_send_recv / vcl.getProcessingUnits() / clk / 1e6 << " MB/s " << " Bandwidth (Total): " << size_send_recv / clk / 1e6 << " MB/s Clock: " << clk << " Clock MAX: " << clk_max <<"\n"; #endif // Check the message for (long int i = 0 ; i < 8 && i < (long int)n_proc ; i++) { long int p_id = (- (i+1) * ps + (long int)vcl.getProcessUnitID()); if (p_id < 0) p_id += n_proc; else p_id = p_id % n_proc; if (p_id != (long int)vcl.getProcessUnitID()) { // only even processor communicate if (p_id % 2 == 1) continue; std::ostringstream msg; msg << "Hello from " << p_id << " to " << vcl.getProcessUnitID(); std::string str(msg.str()); BOOST_REQUIRE_EQUAL(std::equal(str.c_str(),str.c_str() + str.size() ,&(recv_message.get(p_id).get(0))),true); } else { BOOST_REQUIRE_EQUAL((size_t)0,recv_message.get(p_id).size()); } } }
int main(int argc, char* argv[]) { /*! * \page Vector_5_md_vl_sym_crs Vector 5 molecular dynamic with symmetric Verlet list crossing scheme * * ## Simulation ## {#md_e5_sym_sim_crs} * * The simulation is equal to the simulation explained in the example molecular dynamic * * \see \ref md_e5_sym * * The difference is that we create a symmetric Verlet-list for crossing scheme instead of a normal one * \snippet Vector/5_molecular_dynamic_sym_crs/main.cpp sim verlet * * The rest of the code remain unchanged * * \snippet Vector/5_molecular_dynamic_sym_crs/main.cpp simulation * */ //! \cond [simulation] \endcond double dt = 0.00025; double sigma = 0.1; double r_cut = 3.0*sigma; double r_gskin = 1.3*r_cut; double sigma12 = pow(sigma,12); double sigma6 = pow(sigma,6); openfpm::vector<double> x; openfpm::vector<openfpm::vector<double>> y; openfpm_init(&argc,&argv); Vcluster & v_cl = create_vcluster(); // we will use it do place particles on a 10x10x10 Grid like size_t sz[3] = {10,10,10}; // domain Box<3,float> box({0.0,0.0,0.0},{1.0,1.0,1.0}); // Boundary conditions size_t bc[3]={PERIODIC,PERIODIC,PERIODIC}; // ghost, big enough to contain the interaction radius Ghost<3,float> ghost(r_gskin); ghost.setLow(0,0.0); ghost.setLow(1,0.0); ghost.setLow(2,0.0); vector_dist<3,double, aggregate<double[3],double[3]> > vd(0,box,bc,ghost,BIND_DEC_TO_GHOST); size_t k = 0; size_t start = vd.accum(); auto it = vd.getGridIterator(sz); while (it.isNext()) { vd.add(); auto key = it.get(); vd.getLastPos()[0] = key.get(0) * it.getSpacing(0); vd.getLastPos()[1] = key.get(1) * it.getSpacing(1); vd.getLastPos()[2] = key.get(2) * it.getSpacing(2); vd.template getLastProp<velocity>()[0] = 0.0; vd.template getLastProp<velocity>()[1] = 0.0; vd.template getLastProp<velocity>()[2] = 0.0; vd.template getLastProp<force>()[0] = 0.0; vd.template getLastProp<force>()[1] = 0.0; vd.template getLastProp<force>()[2] = 0.0; k++; ++it; } vd.map(); vd.ghost_get<>(); timer tsim; tsim.start(); //! \cond [sim verlet] \endcond // Get the Cell list structure auto NN = vd.getVerletCrs(r_gskin);; //! \cond [sim verlet] \endcond // calculate forces calc_forces(vd,NN,sigma12,sigma6,r_cut); unsigned long int f = 0; int cnt = 0; double max_disp = 0.0; // MD time stepping for (size_t i = 0; i < 10000 ; i++) { // Get the iterator auto it3 = vd.getDomainIterator(); double max_displ = 0.0; // integrate velicity and space based on the calculated forces (Step1) while (it3.isNext()) { auto p = it3.get(); // here we calculate v(tn + 0.5) vd.template getProp<velocity>(p)[0] += 0.5*dt*vd.template getProp<force>(p)[0]; vd.template getProp<velocity>(p)[1] += 0.5*dt*vd.template getProp<force>(p)[1]; vd.template getProp<velocity>(p)[2] += 0.5*dt*vd.template getProp<force>(p)[2]; Point<3,double> disp({vd.template getProp<velocity>(p)[0]*dt,vd.template getProp<velocity>(p)[1]*dt,vd.template getProp<velocity>(p)[2]*dt}); // here we calculate x(tn + 1) vd.getPos(p)[0] += disp.get(0); vd.getPos(p)[1] += disp.get(1); vd.getPos(p)[2] += disp.get(2); if (disp.norm() > max_displ) max_displ = disp.norm(); ++it3; } if (max_disp < max_displ) max_disp = max_displ; // Because we moved the particles in space we have to map them and re-sync the ghost if (cnt % 10 == 0) { vd.map(); vd.template ghost_get<>(); // Get the Cell list structure vd.updateVerlet(NN,r_gskin,VL_CRS_SYMMETRIC); } else { vd.template ghost_get<>(SKIP_LABELLING); } cnt++; // calculate forces or a(tn + 1) Step 2 calc_forces(vd,NN,sigma12,sigma6,r_cut); // Integrate the velocity Step 3 auto it4 = vd.getDomainIterator(); while (it4.isNext()) { auto p = it4.get(); // here we calculate v(tn + 1) vd.template getProp<velocity>(p)[0] += 0.5*dt*vd.template getProp<force>(p)[0]; vd.template getProp<velocity>(p)[1] += 0.5*dt*vd.template getProp<force>(p)[1]; vd.template getProp<velocity>(p)[2] += 0.5*dt*vd.template getProp<force>(p)[2]; ++it4; } // After every iteration collect some statistic about the confoguration if (i % 100 == 0) { // We write the particle position for visualization (Without ghost) vd.deleteGhost(); vd.write("particles_",f); // we resync the ghost vd.ghost_get<>(); // We calculate the energy double energy = calc_energy(vd,NN,sigma12,sigma6,r_cut); auto & vcl = create_vcluster(); vcl.sum(energy); vcl.max(max_disp); vcl.execute(); // we save the energy calculated at time step i c contain the time-step y contain the energy x.add(i); y.add({energy}); // We also print on terminal the value of the energy // only one processor (master) write on terminal if (vcl.getProcessUnitID() == 0) std::cout << "Energy: " << energy << " " << max_disp << " " << std::endl; max_disp = 0.0; f++; } } tsim.stop(); std::cout << "Time: " << tsim.getwct() << std::endl; //! \cond [simulation] \endcond // Google charts options, it store the options to draw the X Y graph GCoptions options; // Title of the graph options.title = std::string("Energy with time"); // Y axis name options.yAxis = std::string("Energy"); // X axis name options.xAxis = std::string("iteration"); // width of the line options.lineWidth = 1.0; // Object that draw the X Y graph GoogleChart cg; // Add the graph // The graph that it produce is in svg format that can be opened on browser cg.AddLinesGraph(x,y,options); // Write into html format cg.write("gc_plot2_out.html"); //! \cond [google chart] \endcond /*! * \page Vector_5_md_vl_sym_crs Vector 5 molecular dynamic with symmetric Verlet list crossing scheme * * ## Finalize ## {#finalize_v_e5_md_sym_crs} * * At the very end of the program we have always to de-initialize the library * * \snippet Vector/5_molecular_dynamic_sym_crs/main.cpp finalize * */ //! \cond [finalize] \endcond openfpm_finalize(); //! \cond [finalize] \endcond /*! * \page Vector_5_md_vl_sym_crs Vector 5 molecular dynamic with symmetric Verlet list crossing scheme * * ## Full code ## {#full_code_v_e5_md_sym_crs} * * \include Vector/5_molecular_dynamic_sym_crs/main.cpp * */ }
int main(int argc, char* argv[]) { /*! * * \page Vector_4_complex_prop Vector 4 complex properties * * * ## Initialization and vector creation ## * * We first initialize the library and define useful constants * * \see \ref e0_s_init * * \snippet Vector/4_complex_prop/main.cpp lib init * * We also define a custom structure * * \snippet Vector/4_complex_prop/main.cpp struct A * * After we initialize the library we can create a vector with complex properties * with the following line * * \snippet Vector/4_complex_prop/main.cpp vect create * * In this this particular case every particle carry a scalar, * a vector in form of float[3], a Point, a list * in form of vector of float and a list of custom structures, and a vector of vector. * In general particles can have properties of arbitrary complexity. * * \warning For arbitrary complexity mean that we can use any openfpm data structure with and arbitrary nested complexity. * For example a openfpm::vector<aggregate<grid_cpu<openfpm::vector<aggregate<double,double[3]>>>,openfpm::vector<float>> is valid * \verbatim particle * vector / \ / \ grid vector<float> /\ / \ double double[3] * \endverbatim * * Our custom data-structure A is defined below. Note that this data-structure * does not have pointers * * \snippet Vector/4_complex_prop/main.cpp struct A * * * \warning custom data structure are allowed only if they does not have pointer. * In case they have pointer we have to define how to serialize our data-structure * * \see \ref vector_example_cp_ser * */ //! \cond [lib init] \endcond // initialize the library openfpm_init(&argc,&argv); // Here we define our domain a 2D box with internals from 0 to 1.0 for x and y Box<2,float> domain({0.0,0.0},{1.0,1.0}); // Here we define the boundary conditions of our problem size_t bc[2]={PERIODIC,PERIODIC}; // extended boundary around the domain, and the processor domain Ghost<2,float> g(0.01); // the scalar is the element at position 0 in the aggregate constexpr int scalar = 0; // the vector is the element at position 1 in the aggregate constexpr int vector = 1; // the tensor is the element at position 2 in the aggregate constexpr int point = 2; // A list1 constexpr int list = 3; // A listA constexpr int listA = 4; // A list of list constexpr int listlist = 5; //! \cond [lib init] \endcond //! \cond [struct A] \endcond // The custom structure struct A { float p1; int p2; A() {}; A(float p1, int p2) :p1(p1),p2(p2) {} }; //! \cond [struct A] \endcond //! \cond [vect create] \endcond vector_dist<2,float, aggregate<float, float[3], Point<3,double>, openfpm::vector<float>, openfpm::vector<A>, openfpm::vector<openfpm::vector<float>>> > vd(4096,domain,bc,g); //! \cond [vect create] \endcond /*! * * \page Vector_4_complex_prop Vector 4 complex properties * * * ## Assign values to properties ## * * Assign values to properties does not changes, from the simple case. Consider * now that each particle has a list, so when we can get the property listA for particle p * and resize such list with **vd.getProp<listA>(p).resize(...)**. We can add new elements at the * end with **vd.getProp<listA>(p).add(...)** and get some element of this list with **vd.getProp<listA>(p).get(i)**. * More in general vd.getProp<listA>(p) return a reference to the openfpm::vector contained by the particle. * * \snippet Vector/4_complex_prop/main.cpp vect assign * */ //! \cond [vect assign] \endcond auto it = vd.getDomainIterator(); while (it.isNext()) { auto p = it.get(); // we define x, assign a random position between 0.0 and 1.0 vd.getPos(p)[0] = (float)rand() / RAND_MAX; // we define y, assign a random position between 0.0 and 1.0 vd.getPos(p)[1] = (float)rand() / RAND_MAX; vd.getProp<scalar>(p) = 1.0; vd.getProp<vector>(p)[0] = 1.0; vd.getProp<vector>(p)[1] = 1.0; vd.getProp<vector>(p)[2] = 1.0; vd.getProp<point>(p).get(0) = 1.0; vd.getProp<point>(p).get(1) = 1.0; vd.getProp<point>(p).get(2) = 1.0; size_t n_cp = (float)10.0 * rand()/RAND_MAX; vd.getProp<listA>(p).resize(n_cp); for (size_t i = 0 ; i < n_cp ; i++) { vd.getProp<list>(p).add(i + 10); vd.getProp<list>(p).add(i + 20); vd.getProp<list>(p).add(i + 30); vd.getProp<listA>(p).get(i) = A(i+10.0,i+20.0); } vd.getProp<listlist>(p).resize(2); vd.getProp<listlist>(p).get(0).resize(2); vd.getProp<listlist>(p).get(1).resize(2); vd.getProp<listlist>(p).get(0).get(0) = 1.0; vd.getProp<listlist>(p).get(0).get(1) = 2.0; vd.getProp<listlist>(p).get(1).get(0) = 3.0; vd.getProp<listlist>(p).get(1).get(1) = 4.0; // next particle ++it; } //! \cond [vect assign] \endcond /*! * * \page Vector_4_complex_prop Vector 4 complex properties * * * ## Mapping and ghost_get ## * * Particles are redistributed across processors all properties are communicated but instead of * using map we use **map_list** that we can use to select properties. * A lot of time complex properties can be recomputed and communicate them is not a good idea. * The same concept also apply for ghost_get. In general we choose which properties to communicate * * * \see \ref e0_s_map * * \see \ref e1_part_ghost * * \snippet Vector/4_complex_prop/main.cpp vect map ghost * */ //! \cond [vect map ghost] \endcond // Particles are redistribued across the processors but only the scalar,vector, and point properties // are transfert vd.map_list<scalar,vector,point,list,listA,listlist>(); // Synchronize the ghost vd.ghost_get<scalar,vector,point,listA,listlist>(); //! \cond [vect map ghost] \endcond /*! * * \page Vector_4_complex_prop Vector 4 complex properties * * * ## Output and VTK visualization ## * * Vector with complex properties can be still be visualized, because unknown properties are * automatically excluded * * \see \ref e0_s_vis_vtk * * \snippet Vector/4_complex_prop/main.cpp vtk * */ //! \cond [vtk] \endcond vd.write("particles"); //! \cond [vtk] \endcond /*! * * \page Vector_4_complex_prop Vector 4 complex properties * * ## Print 4 particles in the ghost area ## * * Here we print that the first 4 particles to show that the list of A and the list of list are filled * and the ghosts contain the correct information * * \snippet Vector/4_complex_prop/main.cpp print ghost info * */ //! \cond [print ghost info] \endcond size_t fg = vd.size_local(); Vcluster & v_cl = create_vcluster(); if (v_cl.getProcessUnitID() == 0) { for ( ; fg < vd.size_local()+4 ; fg++) { std::cout << "List of A" << std::endl; for (size_t i = 0 ; i < vd.getProp<listA>(fg).size() ; i++) std::cout << "Element: " << i << " p1=" << vd.getProp<listA>(fg).get(i).p1 << " p2=" << vd.getProp<listA>(fg).get(i).p2 << std::endl; std::cout << "List of list" << std::endl; for (size_t i = 0 ; i < vd.getProp<listlist>(fg).size() ; i++) { for (size_t j = 0 ; j < vd.getProp<listlist>(fg).get(i).size() ; j++) std::cout << "Element: " << i << " " << j << " " << vd.getProp<listlist>(fg).get(i).get(j) << std::endl; } } } //! \cond [print ghost info] \endcond /*! * \page Vector_4_complex_prop Vector 4 complex properties * * ## Finalize ## {#finalize} * * At the very end of the program we have always to de-initialize the library * * \snippet Vector/4_complex_prop/main.cpp finalize * */ //! \cond [finalize] \endcond openfpm_finalize(); //! \cond [finalize] \endcond /*! * \page Vector_4_complex_prop Vector 4 complex properties * * # Full code # {#code} * * \include Vector/4_complex_prop/main.cpp * */ }
/*! \brief solve complex use Krylov solver in combination with Block-Jacobi + Nested * * * Solve the system using block-jacobi preconditioner + nested solver for each block * */ void solve_complex(Mat & A_, const Vec & b_, Vec & x_) { // We get the size of the Matrix A PetscInt row; PetscInt col; PetscInt row_loc; PetscInt col_loc; PetscInt * blks; PetscInt nlocal; PetscInt first; KSP *subksp; PC subpc; PETSC_SAFE_CALL(MatGetSize(A_,&row,&col)); PETSC_SAFE_CALL(MatGetLocalSize(A_,&row_loc,&col_loc)); // Set the preconditioner to Block-jacobi PC pc; KSPGetPC(ksp,&pc); PCSetType(pc,PCBJACOBI); PETSC_SAFE_CALL(KSPSetType(ksp,KSPGMRES)); PetscInt m = 1; PetscMalloc1(m,&blks); for (size_t i = 0 ; i < m ; i++) blks[i] = row_loc; PCBJacobiSetLocalBlocks(pc,m,blks); PetscFree(blks); KSPSetUp(ksp); PCSetUp(pc); PCBJacobiGetSubKSP(pc,&nlocal,&first,&subksp); for (size_t i = 0; i < nlocal; i++) { KSPGetPC(subksp[i],&subpc); PCSetType(subpc,PCLU); // PCSetType(subpc,PCJACOBI); KSPSetType(subksp[i],KSPPREONLY); // PETSC_SAFE_CALL(KSPSetConvergenceTest(ksp,KSPConvergedDefault,NULL,NULL)); // KSPSetTolerances(subksp[i],1.e-6,PETSC_DEFAULT,PETSC_DEFAULT,PETSC_DEFAULT); /* if (!rank) { if (i%2) { PCSetType(subpc,PCILU); } else { PCSetType(subpc,PCNONE); KSPSetType(subksp[i],KSPBCGS); KSPSetTolerances(subksp[i],1.e-6,PETSC_DEFAULT,PETSC_DEFAULT,PETSC_DEFAULT); } } else { PCSetType(subpc,PCJACOBI); KSPSetType(subksp[i],KSPGMRES); KSPSetTolerances(subksp[i],1.e-6,PETSC_DEFAULT,PETSC_DEFAULT,PETSC_DEFAULT); }*/ } KSPSolve(ksp,b_,x_); auto & v_cl = create_vcluster(); if (try_solve == true) { // calculate error statistic about the solution solError err = statSolutionError(A_,b_,x_); if (v_cl.getProcessUnitID() == 0) { std::cout << "Method: " << s_type << " " << " pre-conditoner: " << PCJACOBI << " iterations: " << err.its << std::endl; std::cout << "Norm of error: " << err.err_norm << " Norm infinity: " << err.err_inf << std::endl; } } }
/*! \brief solve simple use a Krylov solver + Simple selected Parallel Pre-conditioner * */ void solve_Krylov_simple(Mat & A_, const Vec & b_, Vec & x_) { PETSC_SAFE_CALL(KSPSetType(ksp,s_type)); // We set the size of x according to the Matrix A PetscInt row; PetscInt col; PetscInt row_loc; PetscInt col_loc; PETSC_SAFE_CALL(MatGetSize(A_,&row,&col)); PETSC_SAFE_CALL(MatGetLocalSize(A_,&row_loc,&col_loc)); PC pc; // We set the Matrix operators PETSC_SAFE_CALL(KSPSetOperators(ksp,A_,A_)); // We set the pre-conditioner PETSC_SAFE_CALL(KSPGetPC(ksp,&pc)); // PETSC_SAFE_CALL(PCSetType(pc,PCJACOBI)); PETSC_SAFE_CALL(PCSetType(pc,PCHYPRE)); // PCGAMGSetNSmooths(pc,0); PCFactorSetShiftType(pc, MAT_SHIFT_NONZERO); PCFactorSetShiftAmount(pc, PETSC_DECIDE); PCHYPRESetType(pc, "boomeramg"); MatSetBlockSize(A_,4); PetscOptionsSetValue("-pc_hypre_boomeramg_print_statistics","2"); PetscOptionsSetValue("-pc_hypre_boomeramg_max_iter","1000"); PetscOptionsSetValue("-pc_hypre_boomeramg_nodal_coarsen","true"); PetscOptionsSetValue("-pc_hypre_boomeramg_relax_type_all","SOR/Jacobi"); PetscOptionsSetValue("-pc_hypre_boomeramg_coarsen_type","Falgout"); PetscOptionsSetValue("-pc_hypre_boomeramg_cycle_type","W"); PetscOptionsSetValue("-pc_hypre_boomeramg_max_levels","10"); KSPSetFromOptions(ksp); // if we are on on best solve set-up a monitor function if (try_solve == true) { // for bench-mark we are interested in non-preconditioned norm PETSC_SAFE_CALL(KSPMonitorSet(ksp,monitor,&vres,NULL)); // Disable convergence check PETSC_SAFE_CALL(KSPSetConvergenceTest(ksp,KSPConvergedSkip,NULL,NULL)); } // Solve the system PETSC_SAFE_CALL(KSPSolve(ksp,b_,x_)); auto & v_cl = create_vcluster(); // if (try_solve == true) // { // calculate error statistic about the solution solError err = statSolutionError(A_,b_,x_); if (v_cl.getProcessUnitID() == 0) { std::cout << "Method: " << s_type << " " << " pre-conditoner: " << PCJACOBI << " iterations: " << err.its << std::endl; std::cout << "Norm of error: " << err.err_norm << " Norm infinity: " << err.err_inf << std::endl; } // } }
void solve_ASM(Mat & A_, const Vec & b_, Vec & x_) { PETSC_SAFE_CALL(KSPSetType(ksp,s_type)); // We set the size of x according to the Matrix A PetscInt row; PetscInt col; PetscInt row_loc; PetscInt col_loc; PETSC_SAFE_CALL(MatGetSize(A_,&row,&col)); PETSC_SAFE_CALL(MatGetLocalSize(A_,&row_loc,&col_loc)); PC pc; // We set the Matrix operators PETSC_SAFE_CALL(KSPSetOperators(ksp,A_,A_)); // We set the pre-conditioner PETSC_SAFE_CALL(KSPGetPC(ksp,&pc)); PETSC_SAFE_CALL(PCSetType(pc,PCASM)); //////////////// ///////////////////// // PCASMSetLocalSubdomains(pc); PCASMSetOverlap(pc,5); KSP *subksp; /* array of KSP contexts for local subblocks */ PetscInt nlocal,first; /* number of local subblocks, first local subblock */ PC subpc; /* PC context for subblock */ PetscBool isasm; /* Set runtime options */ // KSPSetFromOptions(ksp); /* Flag an error if PCTYPE is changed from the runtime options */ // PetscObjectTypeCompare((PetscObject)pc,PCASM,&isasm); // if (!isasm) SETERRQ(PETSC_COMM_WORLD,1,"Cannot Change the PCTYPE when manually changing the subdomain solver settings"); /* Call KSPSetUp() to set the block Jacobi data structures (including creation of an internal KSP context for each block). Note: KSPSetUp() MUST be called before PCASMGetSubKSP(). */ KSPSetUp(ksp); /* Extract the array of KSP contexts for the local blocks */ PCASMGetSubKSP(pc,&nlocal,&first,&subksp); /* Loop over the local blocks, setting various KSP options for each block. */ for (size_t i = 0 ; i < nlocal ; i++) { KSPGetPC(subksp[i],&subpc); // PCFactorSetFill(subpc,30); PCFactorSetLevels(subpc,5); PCSetType(subpc,PCILU); KSPSetType(subksp[i],KSPRICHARDSON); KSPSetTolerances(subksp[i],1.e-3,0.1,PETSC_DEFAULT,PETSC_DEFAULT); } // if we are on on best solve set-up a monitor function if (try_solve == true) { // for bench-mark we are interested in non-preconditioned norm PETSC_SAFE_CALL(KSPMonitorSet(ksp,monitor,&vres,NULL)); // Disable convergence check PETSC_SAFE_CALL(KSPSetConvergenceTest(ksp,KSPConvergedSkip,NULL,NULL)); } // KSPGMRESSetRestart(ksp,100); // Solve the system PETSC_SAFE_CALL(KSPSolve(ksp,b_,x_)); KSPConvergedReason reason; KSPGetConvergedReason(ksp,&reason); std::cout << "Reason: " << reason << std::endl; auto & v_cl = create_vcluster(); // if (try_solve == true) // { // calculate error statistic about the solution solError err = statSolutionError(A_,b_,x_); if (v_cl.getProcessUnitID() == 0) { std::cout << "Method: " << s_type << " " << " pre-conditoner: " << PCJACOBI << " iterations: " << err.its << std::endl; std::cout << "Norm of error: " << err.err_norm << " Norm infinity: " << err.err_inf << std::endl; } // } }
template<unsigned int ip> void test_known() { Vcluster & vcl = create_vcluster(); // send/recv messages global_rank = vcl.getProcessUnitID(); size_t n_proc = vcl.getProcessingUnits(); // Checking short communication pattern for (size_t s = 0 ; s < N_TRY ; s++) { for (size_t j = 32 ; j < N_LOOP ; j*=2) { global_step = j; // send message openfpm::vector<openfpm::vector<unsigned char>> message; // recv message openfpm::vector<openfpm::vector<unsigned char>> recv_message(n_proc); recv_message.reserve(n_proc); openfpm::vector<size_t> prc_recv; openfpm::vector<size_t> recv_sz; openfpm::vector<size_t> prc; for (size_t i = 0 ; i < 8 && i < n_proc ; i++) { size_t p_id = (i + 1 + vcl.getProcessUnitID()) % n_proc; if (p_id != vcl.getProcessUnitID()) { prc.add(p_id); message.add(); std::ostringstream msg; msg << "Hello from " << vcl.getProcessUnitID() << " to " << p_id; std::string str(msg.str()); message.last().resize(j); memset(message.last().getPointer(),0,j); std::copy(str.c_str(),&(str.c_str())[msg.str().size()],&(message.last().get(0))); } } recv_message.resize(n_proc); // The pattern is not really random preallocate the receive buffer for (size_t i = 0 ; i < 8 && i < n_proc ; i++) { long int p_id = vcl.getProcessUnitID() - i - 1; if (p_id < 0) p_id += n_proc; else p_id = p_id % n_proc; if (p_id != (long int)vcl.getProcessUnitID()) { prc_recv.add(p_id); recv_message.get(p_id).resize(j); recv_sz.add(j); } } #ifdef VERBOSE_TEST timer t; t.start(); #endif vcl.sendrecvMultipleMessagesNBX(prc,message,prc_recv,recv_sz,msg_alloc,&recv_message); #ifdef VERBOSE_TEST t.stop(); double clk = t.getwct(); double clk_max = clk; size_t size_send_recv = 2 * j * (prc.size()); vcl.sum(size_send_recv); vcl.max(clk_max); vcl.execute(); if (vcl.getProcessUnitID() == 0) std::cout << "(Short pattern: " << method<ip>() << ")Buffer size: " << j << " Bandwidth (Average): " << size_send_recv / vcl.getProcessingUnits() / clk / 1e6 << " MB/s " << " Bandwidth (Total): " << size_send_recv / clk / 1e6 << " MB/s Clock: " << clk << " Clock MAX: " << clk_max <<"\n"; #endif // Check the message for (size_t i = 0 ; i < 8 && i < n_proc ; i++) { long int p_id = vcl.getProcessUnitID() - i - 1; if (p_id < 0) p_id += n_proc; else p_id = p_id % n_proc; if (p_id != (long int)vcl.getProcessUnitID()) { std::ostringstream msg; msg << "Hello from " << p_id << " to " << vcl.getProcessUnitID(); std::string str(msg.str()); BOOST_REQUIRE_EQUAL(std::equal(str.c_str(),str.c_str() + str.size() ,&(recv_message.get(p_id).get(0))),true); } else { BOOST_REQUIRE_EQUAL((size_t)0,recv_message.get(p_id).size()); } } } } }
int main(int argc, char* argv[]) { /*! * * \page Vector_4_complex_prop_ser Vector 4 property serialization * * * ## Initialization and vector creation ## * * After we initialize the library we can create a vector with complex properties * with the following line * * \snippet Vector/4_complex_prop/main.cpp vect create * * In this this particular case every particle carry two my_struct object * */ // initialize the library openfpm_init(&argc,&argv); // Here we define our domain a 2D box with internals from 0 to 1.0 for x and y Box<2,float> domain({0.0,0.0},{1.0,1.0}); // Here we define the boundary conditions of our problem size_t bc[2]={PERIODIC,PERIODIC}; // extended boundary around the domain, and the processor domain Ghost<2,float> g(0.01); // my_struct at position 0 in the aggregate constexpr int my_s1 = 0; // my_struct at position 1 in the aggregate constexpr int my_s2 = 1; //! \cond [vect create] \endcond vector_dist<2,float, aggregate<my_struct,my_struct>> vd(4096,domain,bc,g); std::cout << "HAS PACK: " << has_pack_agg<aggregate<my_struct,my_struct>>::result::value << std::endl; //! \cond [vect create] \endcond /*! * * \page Vector_4_complex_prop_ser Vector 4 property serialization * * * ## Assign values to properties ## * * In this loop we assign position to particles and we fill the two my_struct * that each particle contain. As demostration the first my_struct is filled * with the string representation of the particle coordinates. The second my struct * is filled with the string representation of the particle position multiplied by 2.0. * The the vectors of the two my_struct are filled respectively with the sequence * 1,2,3 and 1,2,3,4 * * * * \snippet Vector/4_complex_prop/main_ser.cpp vect assign * */ //! \cond [vect assign] \endcond auto it = vd.getDomainIterator(); while (it.isNext()) { auto p = it.get(); // we define x, assign a random position between 0.0 and 1.0 vd.getPos(p)[0] = (float)rand() / RAND_MAX; // we define y, assign a random position between 0.0 and 1.0 vd.getPos(p)[1] = (float)rand() / RAND_MAX; // Get the particle position as point Point<2,float> pt = vd.getPos(p); // create a C string from the particle coordinates // and copy into my struct vd.getProp<my_s1>(p).size = 32; vd.getProp<my_s1>(p).ptr = new char[32]; strcpy(vd.getProp<my_s1>(p).ptr,pt.toString().c_str()); // create a C++ string from the particle coordinates vd.getProp<my_s1>(p).str = std::string(pt.toString()); vd.getProp<my_s1>(p).v.add(1); vd.getProp<my_s1>(p).v.add(2); vd.getProp<my_s1>(p).v.add(3); pt = pt * 2.0; // create a C string from the particle coordinates multiplied by 2.0 // and copy into my struct vd.getProp<my_s2>(p).size = 32; vd.getProp<my_s2>(p).ptr = new char[32]; strcpy(vd.getProp<my_s2>(p).ptr,pt.toString().c_str()); // create a C++ string from the particle coordinates vd.getProp<my_s2>(p).str = std::string(pt.toString()); vd.getProp<my_s2>(p).v.add(1); vd.getProp<my_s2>(p).v.add(2); vd.getProp<my_s2>(p).v.add(3); vd.getProp<my_s2>(p).v.add(4); // next particle ++it; } //! \cond [vect assign] \endcond /*! * * \page Vector_4_complex_prop_ser Vector 4 property serialization * * * ## Mapping and ghost_get ## * * Particles are redistributed across processors and we also synchronize the ghost * * \see \ref e0_s_map * * \see \ref e1_part_ghost * * \snippet Vector/4_complex_prop/main_ser.cpp vect map ghost * */ //! \cond [vect map ghost] \endcond // Particles are redistribued across the processors vd.map(); // Synchronize the ghost vd.ghost_get<my_s1,my_s2>(); //! \cond [vect map ghost] \endcond /*! * * \page Vector_4_complex_prop_ser Vector 4 property serialization * * * ## Output and VTK visualization ## * * Vector with complex properties can be still be visualized, because unknown properties are * automatically excluded * * \see \ref e0_s_vis_vtk * * \snippet Vector/4_complex_prop/main.cpp vtk * */ //! \cond [vtk] \endcond vd.write("particles"); //! \cond [vtk] \endcond /*! * * \page Vector_4_complex_prop_ser Vector 4 property serialization * * ## Print 4 particles in the ghost area ## * * Here we print that the first 4 particles to show that the two my_struct contain the * right information * * \snippet Vector/4_complex_prop/main_ser.cpp print ghost info * */ //! \cond [print ghost info] \endcond size_t fg = vd.size_local(); Vcluster & v_cl = create_vcluster(); // Only the master processor print if (v_cl.getProcessUnitID() == 0) { // Print 4 particles for ( ; fg < vd.size_local()+4 ; fg++) { // Print my struct1 information std::cout << "my_struct1:" << std::endl; std::cout << "C-string: " << vd.getProp<my_s1>(fg).ptr << std::endl; std::cout << "Cpp-string: " << vd.getProp<my_s1>(fg).str << std::endl; for (size_t i = 0 ; i < vd.getProp<my_s1>(fg).v.size() ; i++) std::cout << "Element: " << i << " " << vd.getProp<my_s1>(fg).v.get(i) << std::endl; // Print my struct 2 information std::cout << "my_struct2" << std::endl; std::cout << "C-string: " << vd.getProp<my_s2>(fg).ptr << std::endl; std::cout << "Cpp-string: " << vd.getProp<my_s2>(fg).str << std::endl; for (size_t i = 0 ; i < vd.getProp<my_s2>(fg).v.size() ; i++) std::cout << "Element: " << i << " " << vd.getProp<my_s2>(fg).v.get(i) << std::endl; } } //! \cond [print ghost info] \endcond /*! * \page Vector_4_complex_prop_ser Vector 4 property serialization * * ## Finalize ## {#finalize} * * At the very end of the program we have always to de-initialize the library * * \snippet Vector/4_complex_prop/main_ser.cpp finalize * */ //! \cond [finalize] \endcond openfpm_finalize(); //! \cond [finalize] \endcond /*! * \page Vector_4_complex_prop_ser Vector 4 property serialization * * # Full code # {#code} * * \include Vector/4_complex_prop/main_ser.cpp * */ }
template<typename solver_type,typename lid_nn> void lid_driven_cavity_2d() { Vcluster & v_cl = create_vcluster(); if (v_cl.getProcessingUnits() > 3) return; //! [lid-driven cavity 2D] // velocity in the grid is the property 0, pressure is the property 1 constexpr int velocity = 0; constexpr int pressure = 1; // Domain, a rectangle Box<2,float> domain({0.0,0.0},{3.0,1.0}); // Ghost (Not important in this case but required) Ghost<2,float> g(0.01); // Grid points on x=256 and y=64 long int sz[] = {256,64}; size_t szu[2]; szu[0] = (size_t)sz[0]; szu[1] = (size_t)sz[1]; // We need one more point on the left and down part of the domain // This is given by the boundary conditions that we impose, the // reason is mathematical in order to have a well defined system // and cannot be discussed here Padding<2> pd({1,1},{0,0}); // Distributed grid that store the solution grid_dist_id<2,float,aggregate<float[2],float>,CartDecomposition<2,float>> g_dist(szu,domain,g); // It is the maximum extension of the stencil Ghost<2,long int> stencil_max(1); // Finite difference scheme FDScheme<lid_nn> fd(pd, stencil_max, domain, g_dist.getGridInfo(), g_dist); // Here we impose the equation, we start from the incompressibility Eq imposed in the bulk with the // exception of the first point {0,0} and than we set P = 0 in {0,0}, why we are doing this is again // mathematical to have a well defined system, an intuitive explanation is that P and P + c are both // solution for the incompressibility equation, this produce an ill-posed problem to make it well posed // we set one point in this case {0,0} the pressure to a fixed constant for convenience P = 0 fd.impose(ic_eq(),0.0, EQ_3, {0,0},{sz[0]-2,sz[1]-2},true); fd.impose(Prs(), 0.0, EQ_3, {0,0},{0,0}); // Here we impose the Eq1 and Eq2 fd.impose(vx_eq(),0.0, EQ_1, {1,0},{sz[0]-2,sz[1]-2}); fd.impose(vy_eq(),0.0, EQ_2, {0,1},{sz[0]-2,sz[1]-2}); // v_x and v_y // Imposing B1 fd.impose(v_x(),0.0, EQ_1, {0,0},{0,sz[1]-2}); fd.impose(avg_vy_f(),0.0, EQ_2 , {-1,0},{-1,sz[1]-1}); // Imposing B2 fd.impose(v_x(),0.0, EQ_1, {sz[0]-1,0},{sz[0]-1,sz[1]-2}); fd.impose(avg_vy(),1.0, EQ_2, {sz[0]-1,0},{sz[0]-1,sz[1]-1}); // Imposing B3 fd.impose(avg_vx_f(),0.0, EQ_1, {0,-1},{sz[0]-1,-1}); fd.impose(v_y(), 0.0, EQ_2, {0,0},{sz[0]-2,0}); // Imposing B4 fd.impose(avg_vx(),0.0, EQ_1, {0,sz[1]-1},{sz[0]-1,sz[1]-1}); fd.impose(v_y(), 0.0, EQ_2, {0,sz[1]-1},{sz[0]-2,sz[1]-1}); // When we pad the grid, there are points of the grid that are not // touched by the previous condition. Mathematically this lead // to have too many variables for the conditions that we are imposing. // Here we are imposing variables that we do not touch to zero // // Padding pressure fd.impose(Prs(), 0.0, EQ_3, {-1,-1},{sz[0]-1,-1}); fd.impose(Prs(), 0.0, EQ_3, {-1,sz[1]-1},{sz[0]-1,sz[1]-1}); fd.impose(Prs(), 0.0, EQ_3, {-1,0},{-1,sz[1]-2}); fd.impose(Prs(), 0.0, EQ_3, {sz[0]-1,0},{sz[0]-1,sz[1]-2}); // Impose v_x Padding Impose v_y padding fd.impose(v_x(), 0.0, EQ_1, {-1,-1},{-1,sz[1]-1}); fd.impose(v_y(), 0.0, EQ_2, {-1,-1},{sz[0]-1,-1}); solver_type solver; auto x = solver.solve(fd.getA(),fd.getB()); //! [lid-driven cavity 2D] //! [Copy the solution to grid] fd.template copy<velocity,pressure>(x,{0,0},{sz[0]-1,sz[1]-1},g_dist); std::string s = std::string(demangle(typeid(solver_type).name())); s += "_"; //! [Copy the solution to grid] g_dist.write(s + "lid_driven_cavity_p" + std::to_string(v_cl.getProcessingUnits()) + "_grid"); #ifdef HAVE_OSX std::string file1 = std::string("test/") + s + "lid_driven_cavity_p" + std::to_string(v_cl.getProcessingUnits()) + "_grid_" + std::to_string(v_cl.getProcessUnitID()) + "_test_osx.vtk"; std::string file2 = s + "lid_driven_cavity_p" + std::to_string(v_cl.getProcessingUnits()) + "_grid_" + std::to_string(v_cl.getProcessUnitID()) + ".vtk"; #else #if __GNUC__ == 5 std::string file1 = std::string("test/") + s + "lid_driven_cavity_p" + std::to_string(v_cl.getProcessingUnits()) + "_grid_" + std::to_string(v_cl.getProcessUnitID()) + "_test_GCC5.vtk"; std::string file2 = s + "lid_driven_cavity_p" + std::to_string(v_cl.getProcessingUnits()) + "_grid_" + std::to_string(v_cl.getProcessUnitID()) + ".vtk"; #else std::string file1 = std::string("test/") + s + "lid_driven_cavity_p" + std::to_string(v_cl.getProcessingUnits()) + "_grid_" + std::to_string(v_cl.getProcessUnitID()) + "_test_GCC4.vtk"; std::string file2 = s + "lid_driven_cavity_p" + std::to_string(v_cl.getProcessingUnits()) + "_grid_" + std::to_string(v_cl.getProcessUnitID()) + ".vtk"; #endif #endif std::cout << "File1: " << file1 << std::endl; std::cout << "File2: " << file2 << std::endl; // Check that match bool test = compare(file1,file2); BOOST_REQUIRE_EQUAL(test,true); }
int main(int argc, char* argv[]) { // // ### WIKI 2 ### // // Here we Initialize the library, than we create a uniform random generator between 0 and 1 to to generate particles // randomly in the domain, we create a Box that define our domain, boundary conditions, and ghost // // initialize the library openfpm_init(&argc,&argv); // Here we define our domain a 2D box with internals from 0 to 1.0 for x and y Box<2,float> domain({0.0,0.0},{1.0,1.0}); // Here we define the boundary conditions of our problem size_t bc[2]={PERIODIC,PERIODIC}; // extended boundary around the domain, and the processor domain Ghost<2,float> g(0.01); // // ### WIKI 3 ### // // Here we are creating a distributed vector defined by the following parameters // // * 2 is the Dimensionality of the space where the objects live // * float is the type used for the spatial coordinate of the particles // * float,float[3],float[3][3] is the information stored by each particle a scalar float, a vector float[3] and a tensor of rank 2 float[3][3] // the list of properties must be put into an aggregate data astructure aggregate<prop1,prop2,prop3, ... > // // vd is the instantiation of the object // // The Constructor instead require: // // * Number of particles 4096 in this case // * Domain where is defined this structure // * bc boundary conditions // * g Ghost // // The following construct a vector where each processor has 4096 / N_proc (N_proc = number of processor) // objects with an undefined position in space. This non-space decomposition is also called data-driven // decomposition // vector_dist<2,float, aggregate<float,float[3],float[3][3]> > vd(4096,domain,bc,g); // the scalar is the element at position 0 in the aggregate const int scalar = 0; // the vector is the element at position 1 in the aggregate const int vector = 1; // the tensor is the element at position 2 in the aggregate const int tensor = 2; // // ### WIKI 5 ### // // Get an iterator that go through the 4096 particles, in an undefined position state and define its position // auto it = vd.getDomainIterator(); while (it.isNext()) { auto key = it.get(); // we define x, assign a random position between 0.0 and 1.0 vd.getPos(key)[0] = rand() / RAND_MAX; // we define y, assign a random position between 0.0 and 1.0 vd.getPos(key)[1] = rand() / RAND_MAX; // next particle ++it; } // // ### WIKI 6 ### // // Once we define the position, we distribute them according to the default space decomposition // The default decomposition is created even before assigning the position to the object. It determine // which part of space each processor manage // vd.map(); // // ### WIKI 7 ### // // We get the object that store the decomposition, than we iterate again across all the objects, we count them // and we confirm that all the particles are local // //Counter we use it later size_t cnt = 0; // Get the space decomposition auto & ct = vd.getDecomposition(); // Get a particle iterator it = vd.getDomainIterator(); // For each particle ... while (it.isNext()) { // ... p auto p = it.get(); // we set the properties of the particle p // the scalar property vd.template getProp<scalar>(p) = 1.0; vd.template getProp<vector>(p)[0] = 1.0; vd.template getProp<vector>(p)[1] = 1.0; vd.template getProp<vector>(p)[2] = 1.0; vd.template getProp<tensor>(p)[0][0] = 1.0; vd.template getProp<tensor>(p)[0][1] = 1.0; vd.template getProp<tensor>(p)[0][2] = 1.0; vd.template getProp<tensor>(p)[1][0] = 1.0; vd.template getProp<tensor>(p)[1][1] = 1.0; vd.template getProp<tensor>(p)[1][2] = 1.0; vd.template getProp<tensor>(p)[2][0] = 1.0; vd.template getProp<tensor>(p)[2][1] = 1.0; vd.template getProp<tensor>(p)[2][2] = 1.0; // increment the counter cnt++; // next particle ++it; } // // ### WIKI 8 ### // // cnt contain the number of object the local processor contain, if we are interested to count the total number across the processor // we can use the function add, to sum across processors. First we have to get an instance of Vcluster, queue an operation of add with // the variable count and finaly execute. All the operations are asynchronous, execute work like a barrier and ensure that all the // queued operations are executed // auto & v_cl = create_vcluster(); v_cl.sum(cnt); v_cl.execute(); // // ### WIKI 9 ### // // Output the particle position for each processor // vd.write("output",VTK_WRITER); // // ### WIKI 10 ### // // Deinitialize the library // openfpm_finalize(); }
int main(int argc, char* argv[]) { /*! * * \page Vector_4_comp_reo Vector 4 computational reordering and cache friendliness * * ## Initialization ## * * The initialization is the same as the molecular dynamic example. The differences are in the * parameters. We will use a bigger system, with more particles. The delta time for integration * is chosen in order to keep the system stable. * * \see \ref e3_md_init * * \snippet Vector/4_reorder/main_comp_ord.cpp vect create * */ //! \cond [vect create] \endcond double dt = 0.0001; float r_cut = 0.03; double sigma = r_cut/3.0; double sigma12 = pow(sigma,12); double sigma6 = pow(sigma,6); openfpm::vector<double> x; openfpm::vector<openfpm::vector<double>> y; openfpm_init(&argc,&argv); Vcluster & v_cl = create_vcluster(); // we will use it do place particles on a 40x40x40 Grid like size_t sz[3] = {40,40,40}; // domain Box<3,float> box({0.0,0.0,0.0}, {1.0,1.0,1.0}); // Boundary conditions size_t bc[3]= {PERIODIC,PERIODIC,PERIODIC}; // ghost, big enough to contain the interaction radius Ghost<3,float> ghost(r_cut); vector_dist<3,double, aggregate<double[3],double[3]> > vd(0,box,bc,ghost); //! \cond [vect create] \endcond /*! * \page Vector_4_comp_reo Vector 4 computational reordering and cache friendliness * * ## Particles on a grid like position ## * * Here we place the particles on a grid like manner * * \see \ref e3_md_gl * * \snippet Vector/4_reorder/main_comp_ord.cpp vect grid * */ //! \cond [vect grid] \endcond auto it = vd.getGridIterator(sz); while (it.isNext()) { vd.add(); auto key = it.get(); vd.getLastPos()[0] = key.get(0) * it.getSpacing(0); vd.getLastPos()[1] = key.get(1) * it.getSpacing(1); vd.getLastPos()[2] = key.get(2) * it.getSpacing(2); vd.template getLastProp<velocity>()[0] = 0.0; vd.template getLastProp<velocity>()[1] = 0.0; vd.template getLastProp<velocity>()[2] = 0.0; vd.template getLastProp<force>()[0] = 0.0; vd.template getLastProp<force>()[1] = 0.0; vd.template getLastProp<force>()[2] = 0.0; ++it; } //! \cond [vect grid] \endcond /*! * * \page Vector_4_comp_reo Vector 4 computational reordering and cache friendliness * * ## Molecular dynamic steps ## * * Here we do 30000 MD steps using verlet integrator the cycle is the same as the * molecular dynamic example. with the following changes. * * ### Cell lists ### * * Instead of getting the normal cell list we get an hilbert curve cell-list. Such cell list has a * function called **getIterator** used inside the function **calc_forces** and **calc_energy** * that iterate across all the particles but in a smart-way. In practice * given an r-cut a cell-list is constructed with the provided spacing. Suppose to have a cell-list * \f$ m \times n \f$, an hilbert curve \f$ 2^k \times 2^k \f$ is contructed with \f$ k = ceil(log_2(max(m,n))) \f$. * Cell-lists are explored according to this Hilbert curve, If a cell does not exist is simply skipped. * * * \verbatim +------+------+------+------+ Example of Hilbert curve running on a 3 x 3 Cell | | | | | An hilbert curve of k = ceil(log_2(3)) = 4 | X+---->X | X +---> X | | ^ | + | ^ | + | ***|******|******|****---|--+ ******* * + | v | + * v | * * * 7 | 8+---->9 * X | * * = Domain * ^ | | * + | * * *--|-----------------*---|--+ ******* * + | | * v | * 4<----+5 | 6<---+ X | * | ^ | + * | *---------|-------|--*------+ * | + | v * | * 1+---->2 | 3+---> X | * | | * | **********************------+ this mean that we will iterate the following cells 1,2,5,4,7,8,9,6,3 Suppose now that the particles are ordered like described Particles id Cell 0 1 1 7 2 8 3 1 4 9 5 9 6 6 7 7 8 3 9 2 10 4 11 3 The iterator of the cell-list will explore the particles in the following way Cell 1 2 5 4 7 8 9 6 3 | | | | | | | | | | 0,3,9,,10,1,7,2,4,5,6,8 * \endverbatim * * We cannot explain here what is a cache, but in practice is a fast memory in the CPU able * to store chunks of memory. The cache in general is much smaller than RAM, but the big advantage * is its speed. Retrieve data from the cache is much faster than RAM. Unfortunately the factors * that determine what is on cache and what is not are multiples: Type of cache, algorithm ... . * Qualitatively all caches will tend to load chunks of data that you read multiple-time, or chunks * of data that probably you will read based on pattern analysis. A small example is a linear memory copy where * you read consecutively memory and you write on consecutive memory. * Modern CPU recognize such pattern and decide to load on cache the consecutive memory before * you actually require it. * * * Iterating the vector in the way described above has the advantage that when we do computation on particles * and its neighborhood with the sequence described above it will happen that: * * * If to process a particle A we read some neighborhood particles to process the next particle A+1 * we will probably read most of the previous particles. * * * In order to show in practice what happen we first show the graph when we do not reorder * * \htmlinclude Vector/4_reorder/no_reorder.html * * The measure has oscillation but we see an asymptotic behavior from 0.04 in the initial condition to * 0.124 . Below we show what happen when we use iterator from the Cell list hilbert * * \htmlinclude Vector/4_reorder/comp_reord.html * * In cases where particles does not move or move very slowly consider to use data-reordering, because it can * give **8-10% speedup** * * \see \ref e4_reo * * ## Timers ## * * In order to collect the time of the force calculation we insert two timers around the function * calc_force. The overall performance is instead calculated with another timer around the time stepping * * \snippet Vector/4_reorder/main_data_ord.cpp timer start * \snippet Vector/4_reorder/main_data_ord.cpp timer stop * * \see \ref e3_md_vi * * */ //! \cond [md steps] \endcond // Get the Cell list structure auto NN = vd.getCellList_hilb(r_cut); // calculate forces calc_forces(vd,NN,sigma12,sigma6); unsigned long int f = 0; timer time2; time2.start(); #ifndef TEST_RUN size_t Nstep = 30000; #else size_t Nstep = 300; #endif // MD time stepping for (size_t i = 0; i < Nstep ; i++) { // Get the iterator auto it3 = vd.getDomainIterator(); // integrate velicity and space based on the calculated forces (Step1) while (it3.isNext()) { auto p = it3.get(); // here we calculate v(tn + 0.5) vd.template getProp<velocity>(p)[0] += 0.5*dt*vd.template getProp<force>(p)[0]; vd.template getProp<velocity>(p)[1] += 0.5*dt*vd.template getProp<force>(p)[1]; vd.template getProp<velocity>(p)[2] += 0.5*dt*vd.template getProp<force>(p)[2]; // here we calculate x(tn + 1) vd.getPos(p)[0] += vd.template getProp<velocity>(p)[0]*dt; vd.getPos(p)[1] += vd.template getProp<velocity>(p)[1]*dt; vd.getPos(p)[2] += vd.template getProp<velocity>(p)[2]*dt; ++it3; } // Because we mooved the particles in space we have to map them and re-sync the ghost vd.map(); vd.template ghost_get<>(); timer time; if (i % 10 == 0) time.start(); // calculate forces or a(tn + 1) Step 2 calc_forces(vd,NN,sigma12,sigma6); if (i % 10 == 0) { time.stop(); x.add(i); y.add({time.getwct()}); } // Integrate the velocity Step 3 auto it4 = vd.getDomainIterator(); while (it4.isNext()) { auto p = it4.get(); // here we calculate v(tn + 1) vd.template getProp<velocity>(p)[0] += 0.5*dt*vd.template getProp<force>(p)[0]; vd.template getProp<velocity>(p)[1] += 0.5*dt*vd.template getProp<force>(p)[1]; vd.template getProp<velocity>(p)[2] += 0.5*dt*vd.template getProp<force>(p)[2]; ++it4; } // After every iteration collect some statistic about the confoguration if (i % 100 == 0) { // We write the particle position for visualization (Without ghost) vd.deleteGhost(); vd.write("particles_",f); // we resync the ghost vd.ghost_get<>(); // We calculate the energy double energy = calc_energy(vd,NN,sigma12,sigma6); auto & vcl = create_vcluster(); vcl.sum(energy); vcl.execute(); // We also print on terminal the value of the energy // only one processor (master) write on terminal if (vcl.getProcessUnitID() == 0) std::cout << std::endl << "Energy: " << energy << std::endl; f++; } } time2.stop(); std::cout << "Performance: " << time2.getwct() << std::endl; //! \cond [md steps] \endcond /*! * \page Vector_4_comp_reo Vector 4 computational reordering and cache friendliness * * ## Plotting graphs ## * * After we terminate the MD steps our vector x contains at which iteration we benchmark the force * calculation time, while y contains the measured time at that time-step. We can produce a graph X Y * * \note The graph produced is an svg graph that can be view with a browser. From the browser we can * also easily save the graph into pure svg format * * \snippet Vector/4_reorder/main_comp_ord.cpp google chart * */ //! \cond [google chart] \endcond // Google charts options, it store the options to draw the X Y graph GCoptions options; // Title of the graph options.title = std::string("Force calculation time"); // Y axis name options.yAxis = std::string("Time"); // X axis name options.xAxis = std::string("iteration"); // width of the line options.lineWidth = 1.0; // Object that draw the X Y graph GoogleChart cg; // Add the graph // The graph that it produce is in svg format that can be opened on browser cg.AddLinesGraph(x,y,options); // Write into html format cg.write("gc_plot2_out.html"); //! \cond [google chart] \endcond /*! * * \page Vector_4_comp_reo Vector 4 computational reordering and cache friendliness * * ## Finalize ## * * At the very end of the program we have always to de-initialize the library * * \snippet Vector/4_reorder/main_comp_ord.cpp finalize * */ //! \cond [finalize] \endcond openfpm_finalize(); //! \cond [finalize] \endcond /*! * * \page Vector_4_comp_reo Vector 4 computational reordering and cache friendliness * * # Full code # * * \include Vector/4_reorder/main_comp_ord.cpp * */ }
int main(int argc, char* argv[]) { /*! * \page Plot_0_cg Plot 0 Google Chart * * ## Initialization ## * * Here we Initialize the library, and we check the we are on a single processor. GoogleChart * cannot do parallel IO or write big files. So or we collect all data on one processor, or each * processor write a distinct file. In this particular example we simply stop if the program start * on more than one processor * * \snippet Plot/0_simple_graph/main.cpp initialize * */ //! \cond [initialize] \endcond openfpm_init(&argc,&argv); auto & v_cl = create_vcluster(); // Google chart is only single processor if (v_cl.getProcessingUnits() > 1) { std::cerr << "Error: only one processor is allowed" << "\n"; return 1; } //! \cond [initialize] \endcond /*! * \page Plot_0_cg Plot 0 Google Chart * * ## Graph data ## * * Here we have the vectors that will contain the information about the graph. * * \snippet Plot/0_simple_graph/main.cpp datas vector * */ //! \cond [datas vector] \endcond openfpm::vector<std::string> x; openfpm::vector<openfpm::vector<double>> y; openfpm::vector<std::string> yn; //! \cond [datas vector] \endcond /*! * \page Plot_0_cg Plot 0 Google Chart * * We will try now to produce the following situation. Six values on **x** each of them having 4 values on **y** * * This mean that for each x value we have to define 4 y values. Having multiple values on on x can be used for * several purpose. * * * Define multiple lines. For example if we connect all the points # we obtain one line. If we connect * all the @ points we obtain another line, an so on ... (figure below) * * * Define error bands * * * Visualize different observables/parameters for the same value x * * * \verbatim y ^ $ dataset1 | * dataset2 0.9 | # dataset3 | @ @ dataset4 | # 0.6 | * * @ * | $ # @ # # | @ $ $ @ @ 0.3 | # * $ # | $ * | * $ 0 |_________________________________ o t t f f s x n w h o i i e o r u v x e r e e \endverbatim * * We start from the first case (Define multiple lines) * * \snippet Plot/0_simple_graph/main.cpp data fill * */ //! \cond [data fill] \endcond // Fill the x values x.add("one"); x.add("two"); x.add("three"); x.add("four"); x.add("five"); x.add("six"); // we have 4 dataset or lines yn.add("dataset1"); yn.add("dataset2"); yn.add("dataset3"); yn.add("dataset4"); // Because we have 6 points on x each containing 4 lines or dataset, we have to provides // 6 point with 4 values at each x point y.add({2,3,5,6}); y.add({5,6,1,6}); y.add({2,1,6,9}); y.add({1,6,3,2}); y.add({3,3,0,6}); y.add({2,1,4,6}); //! \cond [data fill] \endcond /*! * \page Plot_0_cg Plot 0 Google Chart * * ## Graph options ## * * We can specify several options for the graphs. * * * Title of the graph * * Title of the y axis * * Title of the x axis * * * \snippet Plot/0_simple_graph/main.cpp google chart * */ //! \cond [google chart] \endcond GCoptions options; options.title = std::string("Example"); options.yAxis = std::string("Y Axis"); options.xAxis = std::string("X Axis"); options.lineWidth = 5; //! \cond [google chart] \endcond /*! * \page Plot_0_cg Plot 0 Google Chart * * ## Graph write ## * * We create the object to create plots with Google Charts * * A writer can produce several graphs optionally interleaved with HTML code. * Here we write in HTML a description of the graph, than we output the graph * * AddLinesGraph create a typical graph with lines * * \snippet Plot/0_simple_graph/main.cpp google chart write1 * * \htmlonly <div id="chart_div0" style="width: 900px; height: 500px;"></div> \endhtmlonly * */ //! \cond [google chart write1] \endcond GoogleChart cg; // cg.addHTML("<h2>First graph</h2>"); cg.AddLinesGraph(x,y,yn,options); //! \cond [google chart write1] \endcond /*! * * \page Plot_0_cg Plot 0 Google Chart * * ## Hist graph ## * * Hist graph is instead a more flexible Graph writer. In particular we can specify * how to draw each dataset. With the option * * * **stype** specify how to draw each dataset * * **stypeext** we can override the default stype option. In this case we say that the third dataset * in must be reppresented as a line instead of a bars * * To note that we can reuse the same Google chart writer to write multiple * Graph on the same page, interleaved with HTML code * * \snippet Plot/0_simple_graph/main.cpp google chart write2 * * \htmlonly <div id="chart_div1" style="width: 900px; height: 500px;"></div> \endhtmlonly * * */ //! \cond [google chart write2] \endcond options.stype = std::string("bars"); // it say that the dataset4 must me represented with a line options.stypeext = std::string("{3: {type: 'line'}}"); cg.addHTML("<h2>Second graph</h2>"); cg.AddHistGraph(x,y,yn,options); //! \cond [google chart write2] \endcond /*! * * \page Plot_0_cg Plot 0 Google Chart * * ## %Error bars ## * * Here we show how to draw error bars. %Error bars are drawn specifying intervals with a min and a max. * Intervals in general does not have to encapsulate any curve. First we construct the vector y with 3 * values the first value contain the curve points, the second and third contain the min,max interval. * * \snippet Plot/0_simple_graph/main.cpp google chart write3 * * \htmlonly <div id="chart_div2" style="width: 900px; height: 500px;"></div> \endhtmlonly * * */ //! \cond [google chart write3] \endcond cg.addHTML("<h2>Third graph</h2>"); // The first colum are the values of a line while the other 2 values // are the min and max of an interval, as we can see interval does not // have to encapsulate any curve y.clear(); y.add({0.10,0.20,0.19}); y.add({0.11,0.21,0.18}); y.add({0.12,0.22,0.21}); y.add({0.15,0.25,0.20}); y.add({0.09,0.29,0.25}); y.add({0.08,0.28,0.27}); // Here we mark that the the colum 2 and 3 are intervals yn.clear(); yn.add("line1"); yn.add("interval"); yn.add("interval"); cg.AddLinesGraph(x,y,yn,options); //! \cond [google chart write3] \endcond /*! * * \page Plot_0_cg Plot 0 Google Chart * * The style of each interval can be controlled, and the definition of intervals can be interleaved with definition of * other lines. In this example we show how to define 3 lines and 3 intervals, controlling the style of the last interval * * \snippet Plot/0_simple_graph/main.cpp google chart write4 * * \htmlonly <div id="chart_div3" style="width: 900px; height: 500px;"></div> \endhtmlonly * * */ //! \cond [google chart write4] \endcond cg.addHTML("<h2>Four graph</h2>"); // again 6 point but 9 values y.clear(); y.add({0.10,0.20,0.19,0.22,0.195,0.215,0.35,0.34,0.36}); y.add({0.11,0.21,0.18,0.22,0.19,0.215,0.36,0.35,0.37}); y.add({0.12,0.22,0.21,0.23,0.215,0.225,0.35,0.34,0.36}); y.add({0.15,0.25,0.20,0.26,0.22,0.255,0.36,0.35,0.37}); y.add({0.09,0.29,0.25,0.30,0.26,0.295,0.35,0.34,0.36}); y.add({0.08,0.28,0.27,0.29,0.275,0.285,0.36,0.35,0.37}); // colum 0 and 1 are lines // colums 2-3 and 4-5 are intervals // colum 6 is a line // colum 7-8 is an interval yn.add("line1"); yn.add("line2"); yn.add("interval"); yn.add("interval"); yn.add("interval"); yn.add("interval"); yn.add("line3"); yn.add("interval"); yn.add("interval"); // Intervals are enumerated with iX, for example in this case with 3 intervals we have i0,i1,i2 // with this line we control the style of the intervals. In particular we change from the default // values options.intervalext = std::string("{'i2': { 'color': '#4374E0', 'style':'bars', 'lineWidth':4, 'fillOpacity':1 } }"); cg.AddLinesGraph(x,y,yn,options); //! \cond [google chart write4] \endcond /*! * * \page Plot_0_cg Plot 0 Google Chart * * ## More options ## * * In this last example we also show how to: * * * * Make the graph bigger, setting **width** and **height** options * * Give the possibility to to zoom-in and zoom-out with **GC_EXPLORER** * * Use lines instead a smooth function to connect points * * Use logaritmic scale * * \note For more options refer to doxygen and Google Charts * * \snippet Plot/0_simple_graph/main.cpp google chart write5 * * * \htmlonly <div id="chart_div4" style="width: 1280px; height: 700px;"></div> \endhtmlonly * */ //! \cond [google chart write5] \endcond openfpm::vector<double> xn; xn.add(1.0); xn.add(2.0); xn.add(3.0); xn.add(4.0); xn.add(5.0); xn.add(6.0); options.intervalext = ""; options.width = 1280; options.heigh = 720; options.curveType = "line"; options.more = GC_ZOOM + "," + GC_X_LOG + "," + GC_Y_LOG; cg.AddLinesGraph(xn,y,yn,options); cg.write("gc_out.html"); //! \cond [google chart write5] \endcond /*! * \page Plot_0_cg Plot 0 Google Chart * * * ## Finalize ## {#finalize} * * At the very end of the program we have always de-initialize the library * * \snippet Plot/0_simple_graph/main.cpp finalize * */ //! \cond [finalize] \endcond openfpm_finalize(); //! \cond [finalize] \endcond }
int main(int argc, char* argv[]) { // // ### WIKI 2 ### // // Initialize the library and several objects // openfpm_init(&argc,&argv); // // ### WIKI 3 ### // // Get the vcluster object and the number of processor // Vcluster & v_cl = create_vcluster(); size_t N_prc = v_cl.getProcessingUnits(); // // ### WIKI 3 ### // // We find the maximum of the processors rank, that should be the Number of // processora minus one, only processor 0 print on terminal // size_t id = v_cl.getProcessUnitID(); v_cl.max(id); v_cl.execute(); if (v_cl.getProcessUnitID() == 0) std::cout << "Maximum processor rank: " << id << "\n"; // // ### WIKI 4 ### // // We sum all the processor ranks the maximum, the result should be that should // be $\frac{(n-1)n}{2}$, only processor 0 print on terminal // size_t id2 = v_cl.getProcessUnitID(); v_cl.sum(id2); v_cl.execute(); if (v_cl.getProcessUnitID() == 0) std::cout << "Sum of all processors rank: " << id2 << "\n"; // // ### WIKI 5 ### // // we can collect information from all processors using the function gather // size_t id3 = v_cl.getProcessUnitID(); openfpm::vector<size_t> v; v_cl.allGather(id3,v); v_cl.execute(); if (v_cl.getProcessUnitID() == 0) { std::cout << "Collected ids: "; for(size_t i = 0 ; i < v.size() ; i++) std::cout << " " << v.get(i) << " "; std::cout << "\n"; } // // ### WIKI 5 ### // // we can also send messages to specific processors, with the condition that the receiving // processors know we want to communicate with them, if you are searching for a more // free way to communicate where the receiving processors does not know which one processor // want to communicate with us, see the example 1_dsde // std::stringstream ss_message_1; std::stringstream ss_message_2; ss_message_1 << "Hello from " << std::setw(8) << v_cl.getProcessUnitID() << "\n"; ss_message_2 << "Hello from " << std::setw(8) << v_cl.getProcessUnitID() << "\n"; std::string message_1 = ss_message_1.str(); std::string message_2 = ss_message_2.str(); size_t msg_size = message_1.size(); // Processor 0 send to processors 1,2 , 1 to 2,1, 2 to 0,1 v_cl.send(((id3+1)%N_prc + N_prc)%N_prc,0,message_1.c_str(),msg_size); v_cl.send(((id3+2)%N_prc + N_prc)%N_prc,0,message_2.c_str(),msg_size); openfpm::vector<char> v_one; v_one.resize(msg_size); openfpm::vector<char> v_two(msg_size); v_two.resize(msg_size); v_cl.recv(((id3-1)%N_prc + N_prc)%N_prc,0,(void *)v_one.getPointer(),msg_size); v_cl.recv(((id3-2)%N_prc + N_prc)%N_prc,0,(void *)v_two.getPointer(),msg_size); v_cl.execute(); if (v_cl.getProcessUnitID() == 0) { for (size_t i = 0 ; i < msg_size ; i++) std::cout << v_one.get(i); for (size_t i = 0 ; i < msg_size ; i++) std::cout << v_two.get(i); } // // ### WIKI 5 ### // // we can also do what we did before in one shot // id = v_cl.getProcessUnitID(); id2 = v_cl.getProcessUnitID(); id3 = v_cl.getProcessUnitID(); v.clear(); // convert the string into a vector openfpm::vector<char> message_1_v(msg_size); openfpm::vector<char> message_2_v(msg_size); for (size_t i = 0 ; i < msg_size ; i++) message_1_v.get(i) = message_1[i]; for (size_t i = 0 ; i < msg_size ; i++) message_2_v.get(i) = message_2[i]; v_cl.max(id); v_cl.sum(id2); v_cl.allGather(id3,v); // in the case of vector we have special functions that avoid to specify the size v_cl.send(((id+1)%N_prc + N_prc)%N_prc,0,message_1_v); v_cl.send(((id+2)%N_prc + N_prc)%N_prc,0,message_2_v); v_cl.recv(((id-1)%N_prc + N_prc)%N_prc,0,v_one); v_cl.recv(((id-2)%N_prc + N_prc)%N_prc,0,v_two); v_cl.execute(); if (v_cl.getProcessUnitID() == 0) { std::cout << "Maximum processor rank: " << id << "\n"; std::cout << "Sum of all processors rank: " << id << "\n"; std::cout << "Collected ids: "; for(size_t i = 0 ; i < v.size() ; i++) std::cout << " " << v.get(i) << " "; std::cout << "\n"; for (size_t i = 0 ; i < msg_size ; i++) std::cout << v_one.get(i); for (size_t i = 0 ; i < msg_size ; i++) std::cout << v_two.get(i); } openfpm_finalize(); }
template<unsigned int ip> void test() { Vcluster & vcl = create_vcluster(); // send/recv messages global_rank = vcl.getProcessUnitID(); size_t n_proc = vcl.getProcessingUnits(); // Checking short communication pattern for (size_t s = 0 ; s < N_TRY ; s++) { for (size_t j = 32 ; j < N_LOOP ; j*=2) { global_step = j; //! [dsde] // We send one message for each processor (one message is an openfpm::vector<unsigned char>) // or an array of bytes openfpm::vector<openfpm::vector<unsigned char>> message; // receving messages. Each receiving message is an openfpm::vector<unsigned char> // or an array if bytes openfpm::vector<openfpm::vector<unsigned char>> recv_message(n_proc); // each processor communicate based on a list of processor openfpm::vector<size_t> prc; // We construct the processor list in particular in this case // each processor communicate with the 8 next (in id) processors for (size_t i = 0 ; i < 8 && i < n_proc ; i++) { size_t p_id = (i + 1 + vcl.getProcessUnitID()) % n_proc; // avoid to communicate with yourself if (p_id != vcl.getProcessUnitID()) { // Create an hello message prc.add(p_id); message.add(); std::ostringstream msg; msg << "Hello from " << vcl.getProcessUnitID() << " to " << p_id; std::string str(msg.str()); message.last().resize(j); memset(message.last().getPointer(),0,j); std::copy(str.c_str(),&(str.c_str())[msg.str().size()],&(message.last().get(0))); } } // For simplicity we create in advance a receiving buffer for all processors recv_message.resize(n_proc); // The pattern is not really random preallocate the receive buffer for (size_t i = 0 ; i < 8 && i < n_proc ; i++) { long int p_id = vcl.getProcessUnitID() - i - 1; if (p_id < 0) p_id += n_proc; else p_id = p_id % n_proc; if (p_id != (long int)vcl.getProcessUnitID()) recv_message.get(p_id).resize(j); } // Send and receive vcl.sendrecvMultipleMessagesNBX(prc,message,msg_alloc,&recv_message); //! [dsde] #ifdef VERBOSE_TEST timer t; t.start(); #endif #ifdef VERBOSE_TEST t.stop(); double clk = t.getwct(); double clk_max = clk; size_t size_send_recv = 2 * j * (prc.size()); vcl.sum(size_send_recv); vcl.max(clk_max); vcl.execute(); if (vcl.getProcessUnitID() == 0) std::cout << "(Short pattern: " << method<ip>() << ")Buffer size: " << j << " Bandwidth (Average): " << size_send_recv / vcl.getProcessingUnits() / clk / 1e6 << " MB/s " << " Bandwidth (Total): " << size_send_recv / clk / 1e6 << " MB/s Clock: " << clk << " Clock MAX: " << clk_max <<"\n"; #endif // Check the message for (size_t i = 0 ; i < 8 && i < n_proc ; i++) { long int p_id = vcl.getProcessUnitID() - i - 1; if (p_id < 0) p_id += n_proc; else p_id = p_id % n_proc; if (p_id != (long int)vcl.getProcessUnitID()) { std::ostringstream msg; msg << "Hello from " << p_id << " to " << vcl.getProcessUnitID(); std::string str(msg.str()); BOOST_REQUIRE_EQUAL(std::equal(str.c_str(),str.c_str() + str.size() ,&(recv_message.get(p_id).get(0))),true); } else { BOOST_REQUIRE_EQUAL((size_t)0,recv_message.get(p_id).size()); } } } std::srand(create_vcluster().getProcessUnitID()); std::default_random_engine eg; std::uniform_int_distribution<int> d(0,n_proc/8); // Check random pattern (maximum 16 processors) for (size_t j = 32 ; j < N_LOOP && n_proc < 16 ; j*=2) { global_step = j; // original send openfpm::vector<size_t> o_send; // send message openfpm::vector<openfpm::vector<unsigned char>> message; // recv message openfpm::vector<openfpm::vector<unsigned char>> recv_message; // recv_message.reserve(n_proc); openfpm::vector<size_t> prc; for (size_t i = 0 ; i < n_proc ; i++) { // randomly with which processor communicate if (d(eg) == 0) { prc.add(i); o_send.add(i); message.add(); message.last().fill(0); std::ostringstream msg; msg << "Hello from " << vcl.getProcessUnitID() << " to " << i; std::string str(msg.str()); message.last().resize(str.size()); std::copy(str.c_str(),&(str.c_str())[msg.str().size()],&(message.last().get(0))); message.last().resize(j); } } id = 0; prc_recv.clear(); #ifdef VERBOSE_TEST timer t; t.start(); #endif commFunc<ip>(vcl,prc,message,msg_alloc3,&recv_message); #ifdef VERBOSE_TEST t.stop(); double clk = t.getwct(); double clk_max = clk; size_t size_send_recv = (prc.size() + recv_message.size()) * j; vcl.sum(size_send_recv); vcl.sum(clk); vcl.max(clk_max); vcl.execute(); clk /= vcl.getProcessingUnits(); if (vcl.getProcessUnitID() == 0) std::cout << "(Random Pattern: " << method<ip>() << ") Buffer size: " << j << " Bandwidth (Average): " << size_send_recv / vcl.getProcessingUnits() / clk / 1e6 << " MB/s " << " Bandwidth (Total): " << size_send_recv / clk / 1e6 << " MB/s Clock: " << clk << " Clock MAX: " << clk_max << "\n"; #endif // Check the message for (size_t i = 0 ; i < recv_message.size() ; i++) { std::ostringstream msg; msg << "Hello from " << prc_recv.get(i) << " to " << vcl.getProcessUnitID(); std::string str(msg.str()); BOOST_REQUIRE_EQUAL(std::equal(str.c_str(),str.c_str() + str.size() ,&(recv_message.get(i).get(0))),true); } // Reply back // Create the message prc.clear(); message.clear(); for (size_t i = 0 ; i < prc_recv.size() ; i++) { prc.add(prc_recv.get(i)); message.add(); std::ostringstream msg; msg << "Hey from " << vcl.getProcessUnitID() << " to " << prc_recv.get(i); std::string str(msg.str()); message.last().resize(str.size()); std::copy(str.c_str(),&(str.c_str())[msg.str().size()],&(message.last().get(0))); message.last().resize(j); } id = 0; prc_recv.clear(); recv_message.clear(); commFunc<ip>(vcl,prc,message,msg_alloc3,&recv_message); // Check if the received hey message match the original send BOOST_REQUIRE_EQUAL(o_send.size(),prc_recv.size()); for (size_t i = 0 ; i < o_send.size() ; i++) { size_t j = 0; for ( ; j < prc_recv.size() ; j++) { if (o_send.get(i) == prc_recv.get(j)) { // found the message check it std::ostringstream msg; msg << "Hey from " << prc_recv.get(i) << " to " << vcl.getProcessUnitID(); std::string str(msg.str()); BOOST_REQUIRE_EQUAL(std::equal(str.c_str(),str.c_str() + str.size() ,&(recv_message.get(i).get(0))),true); break; } } // Check that we find always a match BOOST_REQUIRE_EQUAL(j != prc_recv.size(),true); } } // Check long communication pattern for (size_t j = 32 ; j < N_LOOP ; j*=2) { global_step = j; // Processor step long int ps = n_proc / (8 + 1); // send message openfpm::vector<openfpm::vector<unsigned char>> message; // recv message openfpm::vector<openfpm::vector<unsigned char>> recv_message(n_proc); openfpm::vector<size_t> prc; for (size_t i = 0 ; i < 8 && i < n_proc ; i++) { size_t p_id = ((i+1) * ps + vcl.getProcessUnitID()) % n_proc; if (p_id != vcl.getProcessUnitID()) { prc.add(p_id); message.add(); std::ostringstream msg; msg << "Hello from " << vcl.getProcessUnitID() << " to " << p_id; std::string str(msg.str()); message.last().resize(j); memset(message.last().getPointer(),0,j); std::copy(str.c_str(),&(str.c_str())[msg.str().size()],&(message.last().get(0))); } } recv_message.resize(n_proc); // The pattern is not really random preallocate the receive buffer for (size_t i = 0 ; i < 8 && i < n_proc ; i++) { long int p_id = (- (i+1) * ps + (long int)vcl.getProcessUnitID()); if (p_id < 0) p_id += n_proc; else p_id = p_id % n_proc; if (p_id != (long int)vcl.getProcessUnitID()) recv_message.get(p_id).resize(j); } #ifdef VERBOSE_TEST timer t; t.start(); #endif commFunc<ip>(vcl,prc,message,msg_alloc,&recv_message); #ifdef VERBOSE_TEST t.stop(); double clk = t.getwct(); double clk_max = clk; size_t size_send_recv = 2 * j * (prc.size()); vcl.sum(size_send_recv); vcl.max(clk_max); vcl.execute(); if (vcl.getProcessUnitID() == 0) std::cout << "(Long pattern: " << method<ip>() << ")Buffer size: " << j << " Bandwidth (Average): " << size_send_recv / vcl.getProcessingUnits() / clk / 1e6 << " MB/s " << " Bandwidth (Total): " << size_send_recv / clk / 1e6 << " MB/s Clock: " << clk << " Clock MAX: " << clk_max <<"\n"; #endif // Check the message for (long int i = 0 ; i < 8 && i < (long int)n_proc ; i++) { long int p_id = (- (i+1) * ps + (long int)vcl.getProcessUnitID()); if (p_id < 0) p_id += n_proc; else p_id = p_id % n_proc; if (p_id != (long int)vcl.getProcessUnitID()) { std::ostringstream msg; msg << "Hello from " << p_id << " to " << vcl.getProcessUnitID(); std::string str(msg.str()); BOOST_REQUIRE_EQUAL(std::equal(str.c_str(),str.c_str() + str.size() ,&(recv_message.get(p_id).get(0))),true); } else { BOOST_REQUIRE_EQUAL((size_t)0,recv_message.get(p_id).size()); } } } } }