/**
 * Replace the current mesh description with caller-supplied arrays.
 *
 * Any previously held mesh is released through clear_mesh(); the array
 * pointers passed in are stored as given, and a zero-filled per-vertex
 * byte array is allocated.
 *
 * \param coords_per_vertex          stored as the mesh dimension
 * \param num_vertices               number of vertices
 * \param interleaved_vertex_coords  vertex coordinate array
 * \param vertex_fixed_flags         per-vertex "fixed" flags
 * \param num_elements               number of elements
 * \param element_type               topology shared by all elements
 * \param element_connectivity_array element connectivity
 * \param one_based_conn_indices     true if connectivity indices start at 1
 * \param nodes_per_element          nodes per element; values below 2 select
 *                                   TopologyInfo::corners(element_type)
 * \param vertex_slaved_flags        per-vertex "slaved" flags
 */
void ArrayMesh::set_mesh( int coords_per_vertex,
                          unsigned long num_vertices,
                          double* interleaved_vertex_coords,
                          const int* vertex_fixed_flags,
                          unsigned long num_elements,
                          EntityTopology element_type,
                          const unsigned long* element_connectivity_array,
                          bool one_based_conn_indices,
                          unsigned nodes_per_element,
                          const int* vertex_slaved_flags ) 
{
  // Drop the previous mesh before adopting the new arrays.
  clear_mesh();

  // Scalar description of the mesh.
  mDimension     = coords_per_vertex;
  vertexCount    = num_vertices;
  elementCount   = num_elements;
  elementType    = element_type;
  oneBasedArrays = one_based_conn_indices;

  // Caller-supplied arrays are referenced as given.
  connArray   = element_connectivity_array;
  slavedFlags = vertex_slaved_flags;
  coordArray  = interleaved_vertex_coords;
  fixedFlags  = vertex_fixed_flags;

  // With one-based indices, shift the bases back by one vertex so that
  // index 1 addresses the first entry.
  // NOTE(review): slavedFlags is not shifted here -- confirm that is intended.
  if (one_based_conn_indices) {
    --fixedFlags;
    coordArray -= mDimension;
  }

  // Fewer than two nodes requested: fall back to the corner count of the type.
  nodesPerElement = (nodes_per_element < 2)
                  ? TopologyInfo::corners( element_type )
                  : nodes_per_element;

  // One byte per vertex, plus one extra slot when indices are one-based.
  const unsigned long byte_count = num_vertices + one_based_conn_indices;
  vertexByteArray = new unsigned char[byte_count];
  assert(valid());
  memset( vertexByteArray, 0, byte_count );
}
Exemple #2
0
  void system::set_problem(const bool init) 
  {
    if (myproc == 0)
      fprintf(stderr, " ********* Setting up Orszag-Tang vortex ************* \n");

    const real b0 = 1.0/sqrt(4.0*M_PI);
    const real d0 = 25.0/(36.0*M_PI);
    const real v0 = 1.0;
    const real p0 = 5.0/(12*M_PI);
    gamma_gas = 5.0/3;
    courant_no = 0.8;

    if (!init) return;

    U_local.resize(local_n);
    dU_local.resize(local_n);
    Wrec_local.resize(local_n);

    const real adv = 0.0;
    double dt_min = HUGE;
    for (int i = 0; i < (int)local_n; i++) 
    {
      const Particle &pi = ptcl_local[i];

      real x = pi.pos.x;
      real y = pi.pos.y;


      real d, p, vx, vy, vz, bx, by, bz;

      vx = -v0 * sin(2.0*M_PI*y) + adv;
      vy = +v0 * sin(2.0*M_PI*x) + adv;
      vz =  0.0;

      bx = -b0*sin(2*M_PI*y);
      by = +b0*sin(4*M_PI*x);
      bz = 0.0;

      //			bx = by = bz= 0;

      //			bz = b0;

      d = d0;
      p = p0;
      real scal = 1;

#ifdef __ADVECT_PULSE_TEST__

      vx = 0;
      vy = 0;
      vz = 0;
      bx = by = bz = 0;
      p = 1.0;
      d = 1.0;
      //     vx = 1; vy = 0;
      //     if (x > 0.3 && x < 0.7) d = 2;

      vx = 0; vy = 1;
      if (y > 0.25 && y < 0.75) {
        d = 10;
        //				p = 1;
      }

#endif

#if 0
      bx = by = bz = 0;
#endif


      Fluid m;

      m[Fluid::DENS] = d ;
      m[Fluid::ETHM] = p/(gamma_gas - 1.0);
      m[Fluid::VELX] = vx;
      m[Fluid::VELY] = vy;
      m[Fluid::VELZ] = vz;
      m[Fluid::BX  ] = bx;
      m[Fluid::BY  ] = by;
      m[Fluid::BZ  ] = bz;
      m[Fluid::PSI ] = 0.0;
      m[Fluid::ENTR] = compute_entropy_from_ethm(m);
      for (int k = 0 ; k < Fluid::NSCALARS; k++)
        m.scal(k) = scal;

      Wrec_local[i]        = Fluid_rec(m);
      U_local  [i]       = m.to_conservative(cell_local[i].Volume);
      dU_local[i] = 0.0;
      ptcl_local[i].volume = cell_local[i].Volume;

      const double L      = std::pow(cell_local[i].Volume, 1.0/3);
      const double cs_est = std::sqrt((p*gamma_gas + (bx*bx+by*by+bz*bz))/d);
      const double v_est  = std::sqrt(vx*vx + vy*vy + vz*vz);
      const double dt_est = 0.1 * courant_no * L/(cs_est + v_est);

      ptcl_local[i].tlast = 0.0;
      ptcl_local[i].rung = scheduler.get_rung(dt_est);

      dt_min = std::min(dt_min, dt_est);

    }

    double dt_min_glob;
    MPI_Allreduce(&dt_min, &dt_min_glob, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);

    MPI_Barrier(MPI_COMM_WORLD);
    if (myproc == 0)
      fprintf(stderr , " pvel ... \n");

    get_active_ptcl(true);

    MPI_Barrier(MPI_COMM_WORLD);
    if (myproc == 0)
      fprintf(stderr , " pvel ... \n");


    cell_list.swap(cell_local);
    ptcl_import.swap(ptcl_local);
    U_import.swap(U_local);
    site_active_list.swap(active_ptcl);


    compute_pvel();
    compute_timesteps(true);

    cell_list.swap(cell_local);
    ptcl_import.swap(ptcl_local);
    U_import.swap(U_local);
    site_active_list.swap(active_ptcl);

    for (int i = 0; i < (int)local_n; i++)
    {
      ptcl_local[i].rung += 1;
      ptcl_local[i].orig_vel = ptcl_local[i].vel;
    }
    all_active = true;
    scheduler.flush_list();
    for (int i = 0; i < (int)local_n; i++)
    {
      scheduler.push_particle(i, (int)ptcl_local[i].rung);
      ptcl_local[i].tend  = 0.0 + scheduler.get_dt(ptcl_local[i].rung);
    }
    boundary_n = 0;

    clear_mesh(true);

    MPI_Barrier(MPI_COMM_WORLD);
    if (myproc == 0) fprintf(stderr, " proc= %d: complete problem setup \n", myproc);
  }
Exemple #3
0
  void system::set_geometry(const bool init) 
  {
    const double dt_max = 1.0/64 ; //16; // 1.0/128;
    scheduler = Scheduler(dt_max);

    t_end   = 2.5;

    n_restart = 1;
    dt_restart = dt_max;

    dt_dump = dt_max / 16;
    dt_dump = dt_max ; //* 4;

    di_log = 100;

    global_n = local_n = 0;
    int nx = 16;
    int ny = 16;
    int nz = 16;

    //		nx = ny = nz = 32;

    //		nx = ny = nz = 64;

    nx = ny = 32; nz = 32;

    //		nx = ny = 32; nz = 16;
    nx = ny = 64; nz = 16;
     nx = ny = 128; nz = 16;
    // nx = ny = 256; nz = 16;


    //		nx = ny = 256;	nz = 256;


    //		nx = ny = nz = 128;

    //		eulerian = true;


#if 0
#if 1
#define __ADVECT_PULSE_TEST__
    nx = ny = 64;	nz = 16;
    dt_dump = dt_max;
    dt_restart = 1e10;
#endif
    //		nx = ny = 128;
#endif


    const double Lx = 1.0;
    const vec3 rmin(0.0);
    const vec3 rmax(Lx, (Lx/nx)*ny, (Lx/nx)*nz);
    global_domain = boundary(rmin, rmax);
    global_domain_size = global_domain.hsize() * 2.0;

    const vec3 Len3 = global_domain.hsize() * 2.0;
    pfloat<0>::set_scale(Len3.x);
    pfloat<1>::set_scale(Len3.y);
    pfloat<2>::set_scale(Len3.z);

    Distribute::int3 nt(1, 1, 1);
    switch(nproc) 
    {
      case 1: break;
      case 2: nt.x = 2; nt.y = 1; nt.z = 1; break;
      case 4: nt.x = 2; nt.y = 2; nt.z = 1; break;
      case 8: nt.x = 4; nt.y = 2; nt.z = 1; break;
      case 16: nt.x = 4; nt.y = 4; nt.z = 1; break;
      case 32: nt.x = 8; nt.y = 4; nt.z = 1; break;
      case 64: nt.x = 8; nt.y = 8; nt.z = 1; break;
      case 128: nt.x = 8; nt.y = 8; nt.z = 2; break;
      default: assert(false);
    }

    const Distribute::int3 nt_glb(nt);
    const pBoundary pglobal_domain(pfloat3(0.0), pfloat3(Len3));
    distribute_glb.set(nproc, nt, pglobal_domain);

    if (!init) return;

    if (myproc == 0) 
    {

      ptcl_local.clear();
      ptcl_local.reserve(128);

      const dvec3 dr = dvec3(Len3.x/nx, Len3.y/ny, Len3.z/nz);
      const real rmax = dr.abs() * 1.0;

      fprintf(stderr, "dr= %g %g %g \n", dr.x, dr.y, dr.z);
      fprintf(stderr, "rmin= %g %g %g \n", 
          global_domain.get_rmin().x,
          global_domain.get_rmin().y,
          global_domain.get_rmin().z);
      fprintf(stderr, "rmax= %g %g %g \n", 
          global_domain.get_rmax().x,
          global_domain.get_rmax().y,
          global_domain.get_rmax().z);

      for (int k = 0; k < nz; k++) {
        for (int j = 0; j < ny; j++) {
          for (int i = 0; i < nx; i++) {
            dvec3 pos = global_domain.get_rmin() + dvec3(i*dr.x, j*dr.y, k*dr.z) + 0.5*dr;
            const int ijk = (k*ny + j)*nx + i;
#if 0
            if (!eulerian)
            {
              const real f = 1.0e-6;
              pos += vec3(drand48()*dr.x*f, drand48()*dr.y*f, drand48()*dr.z*f);
            }
#endif


#if 1
            pos = global_domain.get_rmin() + dvec3(
                drand48()*Len3.x,
                drand48()*Len3.y,
                drand48()*Len3.z);
#else
#define _UNIFORM_MESH_
#endif
            dvec3 vel(0.0, 0.0, 0.0);
            Particle p;
            p.set_pos(pos);
            p.vel = vel;
            p.orig_vel = p.vel;
            p.boundary = 0;
            p.idx = ijk;
            p.rmax = rmax;
            ptcl_local.push_back(p);
          }
        }
      }
      local_n  = ptcl_local.size();
      global_n = local_n;

      fprintf(stderr, "  *** proc= %d : local_n= %llu  global_n= %llu \n", myproc, local_n, global_n);
    } // myproc == 0

    MPI_Bcast(&global_n,  1, MPI_INT, 0, MPI_COMM_WORLD);

    MPI_Barrier(MPI_COMM_WORLD);
    if (myproc == 0)
      fprintf(stderr, " ***  Distrubiting data \n");

    all_active = true;

    for (int k = 0; k < 5; k++)
      distribute_data(false,false,false);

#if 0
    std::vector< std::pair<int, TREAL> > rmax_list;
    local_tree.root.get_rmax(rmax_list);
    assert((int)rmax_list.size() == local_n);
    for (int i = 0; i < local_n; i++)
      ptcl[rmax_list[i].first].rmax = rmax_list[i].second;
#endif

    MPI_Barrier(MPI_COMM_WORLD);
    fprintf(stderr, " *** proc= %d : local_n= %llu  global_n= %llu \n", myproc, local_n, global_n);
    fprintf(stderr, " proc= %d  relax \n", myproc);

#ifndef _UNIFORM_MESH_
    relax_mesh(5);
#endif
    fprintf(stderr, " ---- done --- \n");
    {
      distribute_data(false, false, false);
      const double t10 = mytimer::get_wtime();
      clear_mesh(false);
      int nattempt = build_mesh_global();
      double dt10 = mytimer::get_wtime() - t10;

      double volume_loc = 0.0;
      {
        std::vector<TREAL> v(local_n);
        for (int i = 0; i < (int)local_n; i++)
          v[i] = cell_local[i].Volume;
        std::sort(v.begin(), v.end());  // sort volumes from low to high, to avoid roundoff errors
        for (int i = 0; i < (int)local_n; i++)
          volume_loc += v[i];
      }


      double dt10max;
      MPI_Allreduce(&dt10, &dt10max, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
      double volume_glob = 0.0;	
      int    nattempt_max, nattempt_min;
      MPI_Allreduce(&volume_loc, &volume_glob,  1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
      MPI_Allreduce(&nattempt,   &nattempt_max, 1, MPI_INT,    MPI_MAX, MPI_COMM_WORLD);
      MPI_Allreduce(&nattempt,   &nattempt_min, 1, MPI_INT,    MPI_MIN, MPI_COMM_WORLD);

      const double volume_exact = global_domain_size.x*global_domain_size.y*global_domain_size.z;
      if (myproc == 0)
      {
        fprintf(stderr, "first call build_mesh:[ %g  sec ::  %g cells/s/proc/thread ]\n",
            dt10max,
            global_n/nproc/dt10max);
        fprintf(stderr, "   computed_volume= %g  exact_volume= %g diff= %g [ %g ]  nattempt= %d %d \n",
            volume_glob, volume_exact, 
            volume_glob - volume_exact,	(volume_glob - volume_exact)/volume_exact,
            nattempt_min, nattempt_max);
      }
    }


    extract_ngb_from_mesh();
#if 0
    set_problem(true);
    iterate();

    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Finalize();
    exit(-1);
#endif
  }
/** Destructor: the only action taken is a call to clear_mesh(). */
ArrayMesh::~ArrayMesh()
{
  this->clear_mesh();
}
Exemple #5
0
	/**
	 * Restart from a binary snapshot.
	 *
	 * Rank 0 reads the whole snapshot: a sequence of per-writer sections, each
	 * made of a header (bracketed by an int equal to 20*4) followed by nloc
	 * particle records (each bracketed by an int equal to 26*4).  The global
	 * counters and times are then broadcast, the particles are distributed
	 * over the ranks (with mesh construction), conservative states are formed,
	 * and particle velocities, time steps and scheduler lists are rebuilt.
	 *
	 * \param filename  path of the snapshot file
	 * \param n_files   number of files the snapshot consists of; must be 1
	 */
	void system::read_binary(const char *filename, const int n_files) 
	{
#if 1
		assert(n_files == 1);

		vec3 rmin, rmax;
		if (myproc == 0) 
		{

			// The snapshot is raw binary data, so open it in binary mode.
			FILE *fin; 
			if (!(fin = fopen(filename, "rb"))) 
			{
				std::cerr << "Cannot open file " << filename << std::endl;
				exit(-1);
			}

			std::cerr << "proc= " << myproc << " read snapshot: " << filename << std::endl;

			int ival;
			float fval;

			// Read one float / one int from the file and store it in x.
#define fload(x) { myfread(&fval, sizeof(float), 1, fin); x = fval; }
#define iload(x) { myfread(&ival, sizeof(int),   1, fin); x = ival;}

			// ---- header of the first section ----
			float ftmp;
			int itmp, np0, npx, npy, npz;
			iload(itmp); // 20*4
			assert(itmp == 20*4);
			iload(itmp); // myid
			iload(np0);
			iload(npx);

			// The scheduler tick counter is stored as two consecutive 32-bit words.
			union 
			{
				unsigned long long uint_long;
				unsigned int       uint[2];
			} data;
			iload(data.uint[0]);
			iload(data.uint[1]);
			scheduler.tsysU = data.uint_long;

			float courant_No;
			int nglob, nloc, ndim;
			iload(nglob);
			iload(nloc);
			iload(ndim);  
			assert(ndim == 3);
			fload(t_global);
			fload(dt_global);
			iload(iteration);
			fload(courant_No);
			fload(gamma_gas);

			int periodic_on;
			iload(periodic_on);
			assert(periodic_on == -1);
			
			fload(rmin.x);
			fload(rmin.y);
			fload(rmin.z);
			fload(rmax.x);
			fload(rmax.y);
			fload(rmax.z);
			iload(itmp);     // 20*4
			assert(itmp == 20*4);

			ptcl_local.resize(nglob);
			U_local.resize(nglob);
			dU_local.resize(nglob);

			fprintf(stderr, "np =%d   nglob= %d \n", np0, nglob);

			// ---- particle records, one section per writer ----
			int pc = 0;   // number of particles stored so far
			for (int pr = 0; pr < np0; pr++) 
			{
				fprintf(stderr, " p= %d out of %d; nloc= %d\n", pr, np0, nloc);
				for (int i = 0; i < nloc; i++) 
				{
					Particle p;
					p.tend  = t_global;
					p.rung  = 0.0;
					p.new_dt = 0.0;
					p.local_id = i;
					Fluid W(0.0);

					iload(ival);    
					assert(ival == 26*4);
					iload(ival); p.idx = ival;
				
					fload(p.pos.x); 
					fload(p.pos.y);
					fload(p.pos.z);

					p.pos = periodic(p.pos);

					assert(rmin.x <= p.pos.x);
					assert(rmax.x >= p.pos.x);
					assert(rmin.y <= p.pos.y);
					assert(rmax.y >= p.pos.y);
					assert(rmin.z <= p.pos.z);
					assert(rmax.z >= p.pos.z);
					p.orig_pos = p.pos;
					p.pot = 0;


					fload(p.vel.x);
					fload(p.vel.y);
					fload(p.vel.z);
					p.orig_vel = p.vel;
					fload(W[Fluid::DENS]);
					fload(W[Fluid::ETHM]);
					fload(ftmp);       // pressure slot, discarded
					fload(p.rmax);     // slot previously held the magnetic energy
					iload(p.boundary); // slot previously held |v|
					fload(W[Fluid::VELX]);
					fload(W[Fluid::VELY]);
					fload(W[Fluid::VELZ]);
					fload(W[Fluid::BX]);
					fload(W[Fluid::BY]);
					fload(W[Fluid::BZ]);
					float h;
					fload(h);          // value is read to advance the file, then ignored
					fload(p.volume);
					p.volume_new = p.volume;
					fload(W[Fluid::PSI]);
					fload(ftmp); //L*divB_i[i]);
					fload(W[Fluid::ENTR]);
					fload(ftmp); // Jx
					fload(ftmp); // Jy
					fload(ftmp); // Jz
					iload(ival); 
					assert(ival == 26*4);

					p.tlast = t_global;

					// Guard against a file holding more particles than its header
					// announced, before writing into the pre-sized arrays.
					assert(pc < nglob);
					ptcl_local[pc] = p;
					U_local    [pc] = W;
					dU_local   [pc] = 0.0;
					pc++;
				}

				fprintf(stderr, "p= %d  np0= %d size= %d %d\n",
						pr, np0, (int)U_local.size(), (int)ptcl_local.size());
				if (!(pr < np0-1)) break;

				// ---- header of the next section ----
				iload(itmp); // 20*4
				assert(itmp == 20*4);
				iload(itmp); // myid
				iload(np0);
				iload(npx);
				iload(npy);
				iload(npz);

				int nglob1;
				iload(nglob1);
				if (nglob != nglob1) {
					fprintf(stderr, "np; npx, npy, npz = %d; %d %d %d \n", 
							np0, npx, npy, npz);
					fprintf(stderr, "nglob= %d  nglob1= %d\n", nglob, nglob1);
				}
				assert(nglob == nglob1);
				iload(nloc);
				iload(ndim);
				fload(t_global);
				fload(dt_global);
				iload(iteration);
				fload(courant_No);
				fload(gamma_gas);

				iload(periodic_on);

				fload(rmin.x);
				fload(rmin.y);
				fload(rmin.z);
				fload(rmax.x);
				fload(rmax.y);
				fload(rmax.z);
				iload(itmp);     // 20*4
			}
			assert(pc == nglob);
			assert(nglob == (int)U_local.size());
			fclose(fin);


			local_n = U_local.size();
		}

		// Executed on every rank; global_n is overwritten by the broadcast below.
		global_n = U_local.size();		      
		local_n = global_n;

		MPI_Bcast(&global_n,   1, MPI_UNSIGNED_LONG_LONG, 0, MPI_COMM_WORLD);
		MPI_Bcast(&iteration,  1, MPI_INT, 0, MPI_COMM_WORLD);

		// The times are broadcast through double temporaries.
		double dt_glob = dt_global;
		double  t_glob =  t_global;
		MPI_Bcast(& t_glob,  1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
		MPI_Bcast(&dt_glob,  1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
		MPI_Bcast(&scheduler.tsysU, 1, MPI_UNSIGNED_LONG_LONG, 0, MPI_COMM_WORLD);
		dt_global = dt_glob;
		t_global  =  t_glob;
		scheduler.set_tsys(t_global);
		assert(t_global == scheduler.get_tsys());


//		scheduler.tsysU = (unsigned long long)(t_global / scheduler.dt_tick);
		scheduler.min_rung = 0;

		dt_global = 0.0f;
		
		// Spread the particles and their fluid state over the ranks and build
		// the mesh / neighbour lists.
		distribute_data(true, false, true);
	
#if 1	
		fit_vec(ptcl_local);
		fit_vec(U_local);
		fit_vec(dU_local);
		fit_vec(Wrec_local);
#endif

		all_active = true;
		
		MPI_Barrier(MPI_COMM_WORLD);

		// U_local holds the values read from the file; keep a copy in
		// Wrec_local, then convert U_local to conservative form.
		for (int i = 0; i < (int)local_n; i++)
		{
			ptcl_local[i].tlast = t_global;
//			ptcl_local[i].volume = cell_local[i].Volume;

			Wrec_local[i] = Fluid_rec(U_local[i]);
			U_local[i] = U_local[i].to_conservative(ptcl_local[i].volume);
			dU_local[i] = 0.0;
		}
		
		MPI_Barrier(MPI_COMM_WORLD);
		if (myproc == 0)
			fprintf(stderr , " pvel ... \n");

		get_active_ptcl(true);

		MPI_Barrier(MPI_COMM_WORLD);
		if (myproc == 0)
			fprintf(stderr , " pvel ... \n");


		// The local arrays are swapped into the cell_list / import / active-site
		// containers for the two calls below and swapped back afterwards.
		cell_list.swap(cell_local);
		ptcl_import.swap(ptcl_local);
		U_import.swap(U_local);
		site_active_list.swap(active_ptcl);

		compute_pvel();
		compute_timesteps(true);

		cell_list.swap(cell_local);
		ptcl_import.swap(ptcl_local);
		U_import.swap(U_local);
		site_active_list.swap(active_ptcl);

		for (int i = 0; i < (int)local_n; i++)
		{
			ptcl_local[i].rung += 1;
			ptcl_local[i].tend  = ptcl_local[i].tlast + scheduler.get_dt(ptcl_local[i].rung);
			ptcl_local[i].orig_vel = ptcl_local[i].vel;
			ptcl_local[i].unset_active();
		}
		all_active = true;

		// Rebuild the scheduler lists and count the boundary particles.
		scheduler.flush_list();
		boundary_n = 0;
		for (int i = 0; i < (int)local_n; i++)
		{
			scheduler.push_particle(i, (int)ptcl_local[i].rung);
			if (ptcl_local[i].is_boundary())
				boundary_n++;
		}

		unsigned long long boundary_glb;
		MPI_Allreduce(&boundary_n, &boundary_glb, 1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, MPI_COMM_WORLD);

		// boundary_glb is unsigned, hence %llu.
		if (myproc == 0)
			fprintf(stderr, "boundary_glb= %llu\n", boundary_glb);

		clear_mesh(true);

		MPI_Barrier(MPI_COMM_WORLD);
		if (myproc == 0) fprintf(stderr, " proc= %d: complete read_binary \n", myproc);
#endif
	}
void system::distribute_data(const bool FLUID, const bool GRADS, const bool NGB)
{
    distribute_data_flag = true;

    ptcl_local.resize(local_n);
    std::vector<vec3> ptcl_pos(local_n);

    // compute integer coordinates for each position
    //
    for (int i = 0; i < (int)local_n; i++)
    {
        ptcl_local[i].local_id = i;
        ptcl_local[i].orig_pos = periodic(ptcl_local[i].orig_pos);
        ptcl_pos[i] = ptcl_local[i].orig_pos;
    }


    // determine domain decomposition
    //
    std::vector<vec3> sample_pos;
    determine_sampling_freq();
    collect_sample_data(sample_pos, ptcl_pos);
    DistributeNew<real, vec3, boundary> distribute(nproc, global_domain);
    if (myproc == 0)
    {
        //			distribute_glb.determine_division(sample_pos);
#if 1
        distribute.determine_division(sample_pos, nproc*32);
#else
        distribute.determine_division(sample_pos, nproc*8);
#endif
    }

    myMPI::Bcast(distribute.tiles, 0, nproc);
    myMPI::Bcast(distribute.procs, 0, nproc);

    compute_proc_domain(distribute.tiles, distribute.procs);

    if (FLUID && GRADS)
        for (int i = 0; i < (int)local_n; i++)
            Wrec_local[i].pos.x = divBi[i];

    int iloc = 0;

    std::vector<Particle> ptcl_send[NMAXPROC];
    std::vector<Particle> ptcl_recv[NMAXPROC];
    std::vector<ParticleFluidStruct> fluid_send[NMAXPROC];
    std::vector<ParticleFluidStruct> fluid_recv[NMAXPROC];
    std::vector<ParticleFluidStructLite> fluidlite_send[NMAXPROC];
    std::vector<ParticleFluidStructLite> fluidlite_recv[NMAXPROC];

#if 0
    std::vector<int> ngb_send[NMAXPROC];
    std::vector<int> ngb_recv[NMAXPROC];
#endif

    std::vector<int> remote_tiles;
    int nremove = 0;
    for (int i = 0; i < (int)local_n; i++)
    {
        remote_tiles.clear();
        proc_tree.root.walk_boundary(boundary(ptcl_pos[i]), remote_tiles, global_domain_size);
        assert(remote_tiles.size() > 0);
        const int proc  = proc_procs[remote_tiles[0]];

        assert(proc >= 0);
        assert(proc < nproc);

        if (proc == myproc && !ptcl_local[i].is_remove())
        {
            std::swap(ptcl_local[i], ptcl_local[iloc]);
            std::swap(ptcl_pos  [i], ptcl_pos  [iloc]);
            if (FLUID)
            {
                std::swap(   U_local[i],    U_local[iloc]);
                std::swap(  dU_local[i],   dU_local[iloc]);
                if (GRADS)
                    std::swap(Wrec_local[i], Wrec_local[iloc]);
            }
            iloc++;
        }
        else if (!ptcl_local[i].is_remove())
        {
            if (FLUID && GRADS)
                fluid_send[proc].push_back(ParticleFluidStruct(ptcl_local[i], U_local[i], dU_local[i], Wrec_local[i]));
            else if (FLUID)
                fluidlite_send[proc].push_back(ParticleFluidStructLite(ptcl_local[i], U_local[i], dU_local[i]));
            else
                ptcl_send[proc].push_back(ptcl_local[i]);
        }
        else
            nremove++;
    }


#if 0
    if (FLUID && GRADS)	myMPI::all2all(fluid_send,     fluid_recv,     myproc, nproc, mpi_debug_flag);
    else if (FLUID    )	myMPI::all2all(fluidlite_send, fluidlite_recv, myproc, nproc, mpi_debug_flag);
    else								myMPI::all2all(ptcl_send,      ptcl_recv,      myproc, nproc, mpi_debug_flag);
#else
    {
        static int nsend[NMAXPROC], nrecv[NMAXPROC];
        if (FLUID && GRADS)	myMPI::all2all<true>(fluid_send,     fluid_recv,     myproc, nproc, 1, nsend, nrecv);
        else if (FLUID    )	myMPI::all2all<true>(fluidlite_send, fluidlite_recv, myproc, nproc, 1, nsend, nrecv);
        else								myMPI::all2all<true>(ptcl_send,      ptcl_recv,      myproc, nproc, 1, nsend, nrecv);
    }
#endif

    int nrecv = 0;
    if (FLUID && GRADS)
        for (int p = 0; p < nproc; p++)
            nrecv += fluid_recv[p].size();
    else if (FLUID)
        for (int p = 0; p < nproc; p++)
            nrecv += fluidlite_recv[p].size();
    else
        for (int p = 0; p < nproc; p++)
            nrecv += ptcl_recv[p].size();

    {
        const int nloc = iloc + nrecv;

        ptcl_local.resize(nloc);
        fit_vec(ptcl_local);
        U_local   .resize(nloc);
        fit_vec(U_local);
        dU_local  .resize(nloc);
        fit_vec(dU_local);
        Wrec_local.resize(nloc);
        fit_vec(Wrec_local);
        divBi    .resize(nloc);
        fit_vec(divBi);
        Wextra_local.resize(nloc);
        fit_vec(Wextra_local);
    }

    for (int p = 0; p < nproc; p++)
        for (size_t q = 0; q < (FLUID ? (GRADS ? fluid_recv[p].size() : fluidlite_recv[p].size()) : ptcl_recv[p].size()); q++)
        {
            assert(p != myproc);

            if (FLUID && GRADS)
            {
                ptcl_local[iloc] = fluid_recv[p][q].p;
                U_local   [iloc] = fluid_recv[p][q].U;
                dU_local  [iloc] = fluid_recv[p][q].dU;
                Wrec_local[iloc] = fluid_recv[p][q].Wrec;
            }
            else if (FLUID)
            {
                ptcl_local[iloc] = fluidlite_recv[p][q].p;
                U_local   [iloc] = fluidlite_recv[p][q].U;
                dU_local  [iloc] = fluidlite_recv[p][q].dU;
            }
            else
                ptcl_local[iloc] = ptcl_recv[p][q];

            assert(!ptcl_local[iloc].is_remove());
            iloc++;
        }

    local_n  = iloc;

    assert(iloc = (int)ptcl_local.size());

    if (FLUID && GRADS)
        for (int i = 0; i < (int)local_n; i++)
        {
            divBi[i] = Wrec_local[i].pos.x;
            Wrec_local[i].pos.x = ptcl_local[i].pos.x;
        }


    unsigned long long nglob, nloc = local_n;
    unsigned long long nvirt_glob;
    virtual_n = nremove;
    MPI_Allreduce(&nloc, &nglob, 1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, MPI_COMM_WORLD);
    MPI_Allreduce(&virtual_n, &nvirt_glob, 1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, MPI_COMM_WORLD);

    unsigned long long local_n_min, local_n_max, local_n_mean;
    MPI_Allreduce(&local_n, &local_n_min,  1, MPI_UNSIGNED_LONG_LONG, MPI_MIN, MPI_COMM_WORLD);
    MPI_Allreduce(&local_n, &local_n_max,  1, MPI_UNSIGNED_LONG_LONG, MPI_MAX, MPI_COMM_WORLD);
    MPI_Allreduce(&local_n, &local_n_mean, 1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, MPI_COMM_WORLD);

    if (myproc == 0)
    {
        fprintf(stderr, "local_n= [min: %llu max: %llu ; mean: %llu ]  global_n= %llu nglob= %llu  remove_n_glob= %llu \n",
                local_n_min, local_n_max, local_n_mean/nproc, global_n, nglob, nvirt_glob);
    }

    MPI_Barrier(MPI_COMM_WORLD);
    assert(nglob == global_n - nvirt_glob);
    global_n = nglob;
    virtual_n = 0;

    sort_local_data();

    // build local tree
    //
    global_domain_size = global_domain.hsize() * 2.0;
    local_tree.clear();
    local_tree.set_domain(
        boundary(
            global_domain.centre() - global_domain.hsize()*1.5,
            global_domain.centre() + global_domain.hsize()*1.5));

    std::vector<Octree::Particle> tree_ptcl(local_n);
    for (int i = 0; i < (int)local_n; i++)
    {
        assert(!ptcl_local[i].is_remove());
        tree_ptcl[i] = Octree::Particle(ptcl_local[i].orig_pos, i);
    }
    local_tree.insert(&tree_ptcl[0], local_n, 0, local_n);

    local_tree.get_leaves();
    local_tree.root.inner_boundary();

    if (NGB)
    {
        if (myproc == 0)
            fprintf(stderr, "---buidling mesh---\n");
        const double t10 = mytimer::get_wtime();
        clear_mesh(false);

        const double t15 = mytimer::get_wtime();
        build_mesh_global();
        double dt_mesh = mytimer::get_wtime() - t15;

        double volume_loc = 0.0;
        {
            std::vector<TREAL> v(local_n);
            for (int i = 0; i < (int)local_n; i++)
            {
                v[i] = cell_local[i].Volume;
                ptcl_local[i].volume_new = v[i];
                ptcl_local[i].local_id = i;
            }
            std::sort(v.begin(), v.end());  // sort volumes from low to high, to avoid roundoff errors
            for (int i = 0; i < (int)local_n; i++)
                volume_loc += v[i];
        }
        extract_ngb_from_mesh();
        clear_mesh(true);
        double dt = mytimer::get_wtime() - t10;

        double volume_glob = 0.0;
        double dt_max = 0.0;
        double dt_mesh_max = 0.0;
        MPI_Allreduce(&volume_loc, &volume_glob,  1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
        MPI_Allreduce(&dt, &dt_max,  1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
        MPI_Allreduce(&dt_mesh, &dt_mesh_max,  1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
        const double volume_exact = global_domain_size.x*global_domain_size.y*global_domain_size.z;
        if (myproc == 0)
        {
            fprintf(stderr, " distribute::build_mesh:[ %g (all %g )  sec ::  %g cells/s/proc/thread ]\n",
                    dt_mesh_max, dt_max,
                    global_n/nproc/dt_mesh_max);
            fprintf(stderr, "   computed_volume= %g  exact_volume= %g diff= %g [ %g ] \n",
                    volume_glob, volume_exact,
                    volume_glob - volume_exact,	(volume_glob - volume_exact)/volume_exact);
        }
    }
}
	void system::set_geometry(const bool init) 
	{
		const double dt_max = 1.0/512;
		scheduler = Scheduler(dt_max);

		int np;
		float lx, ly, lz;
		FILE *fin = NULL;
		if (myproc == 0)
		{
			float wp;
			fin = fopen(fin_data, "r");
			int ival;
			size_t nread;

			nread = fread(&ival, sizeof(int), 1, fin);		assert(ival == 2*sizeof(int));
			nread = fread(&np, sizeof(int), 1, fin);
			nread = fread(&wp, sizeof(float), 1, fin);
			nread = fread(&ival, sizeof(int), 1, fin);		assert(ival == 2*sizeof(int));
			
			nread = fread(&ival, sizeof(int), 1, fin);		assert(ival == 3*sizeof(float));
			nread = fread(&lx, sizeof(float), 1, fin);
			nread = fread(&ly, sizeof(float), 1, fin);
			nread = fread(&lz, sizeof(float), 1, fin);
			nread = fread(&ival, sizeof(int), 1, fin);		assert(ival == 3*sizeof(float));

			fprintf(stderr, " np= %d  wp= %g \n",np, wp);
			fprintf(stderr, " lx= %g  ly= %g  lz= %g \n", lx, ly, lz);
		}

		MPI_Bcast(&lx,  1, MPI_FLOAT, 0, MPI_COMM_WORLD);
		MPI_Bcast(&ly,  1, MPI_FLOAT, 0, MPI_COMM_WORLD);
		MPI_Bcast(&lz,  1, MPI_FLOAT, 0, MPI_COMM_WORLD);

		t_end   = 0.2;

		n_restart = 2;
		dt_restart = dt_max;

		dt_dump = 0.01;

		di_log = 100;

		global_n = local_n = 0;

//		eulerian = true;

		const vec3 rmin(0.0);
		const vec3 rmax(lx, ly, lz);
		global_domain = boundary(rmin, rmax);
		global_domain_size = global_domain.hsize() * 2.0;

		const vec3 Len3 = global_domain.hsize() * 2.0;
		pfloat<0>::set_scale(Len3.x);
		pfloat<1>::set_scale(Len3.y);
		pfloat<2>::set_scale(Len3.z);

		if (myproc == 0) 
		{

			ptcl.resize(np);

			const int nx = (int)std::pow(np, 1.0/3.0);
			const dvec3 dr = dvec3(Len3.x/nx, Len3.y/nx, Len3.z/nx);
			const real rmax = dr.abs() * 1.0;

			fprintf(stderr, "dr= %g %g %g \n", dr.x, dr.y, dr.z);

			local_n  = ptcl.size();
			global_n = local_n;

			{
				std::vector<float> x(local_n), y(local_n), z(local_n);
				size_t nread;
				int ival;

				nread = fread(&ival, sizeof(int), 1, fin); assert(ival == local_n*(int)sizeof(float));
				nread = fread(&x[0], sizeof(float), local_n, fin);
				assert((int)nread == local_n);
				nread = fread(&ival, sizeof(int), 1, fin); assert(ival == local_n*(int)sizeof(float));
				
				nread = fread(&ival, sizeof(int), 1, fin); assert(ival == local_n*(int)sizeof(float));
				nread = fread(&y[0], sizeof(float), local_n, fin);
				assert((int)nread == local_n);
				nread = fread(&ival, sizeof(int), 1, fin); assert(ival == local_n*(int)sizeof(float));
				
				nread = fread(&ival, sizeof(int), 1, fin); assert(ival == local_n*(int)sizeof(float));
				nread = fread(&z[0], sizeof(float), local_n, fin);
				assert((int)nread == local_n);
				nread = fread(&ival, sizeof(int), 1, fin); assert(ival == local_n*(int)sizeof(float));

				for (int i = 0; i < local_n; i++)
				{
					const dvec3 vel(0.0, 0.0, 0.0);
					ptcl[i] = Particle(x[i], y[i], z[i], vel.x, vel.y, vel.z, i);
					ptcl[i].rmax = rmax;
					ptcl[i].unset_derefine();
				}
			}

			U.resize(local_n);
			const int var_list[7] = {
				Fluid::VELX,
				Fluid::VELY,
				Fluid::VELZ,
				Fluid::DENS,
				Fluid::BX,
				Fluid::BY,
				Fluid::BZ};

			std::vector<float> data(local_n);
			for (int var = 0; var < 7; var++)
			{
				fprintf(stderr, " reading vat %d out of %d \n", var+1, 7);
				int ival;
				size_t nread;
				nread = fread(&ival, sizeof(int), 1, fin); assert(ival == local_n*(int)sizeof(float));
				nread = fread(&data[0], sizeof(float), local_n, fin);
				assert((int)nread == local_n);
				nread = fread(&ival, sizeof(int), 1, fin); assert(ival == local_n*(int)sizeof(float));
				for (int i = 0; i < local_n; i++)
					U[i][var_list[var]] = data[i];
			}
			for (int i = 0; i < local_n; i++)
			{
				assert(U[i][Fluid::DENS] > 0.0);
				U[i][Fluid::ETHM] = cs2 * U[i][Fluid::DENS];
			}


			fclose(fin);

			fprintf(stderr, "  *** proc= %d : local_n= %d  global_n= %d \n", myproc, local_n, global_n);
		} // myproc == 0

		MPI_Bcast(&global_n,  1, MPI_INT, 0, MPI_COMM_WORLD);

		fprintf(stderr, " proc= %d  distrubite \n", myproc);
		MPI_Barrier(MPI_COMM_WORLD);

		Distribute::int3 nt(1, 1, 1);
		switch(nproc) {
			case 1: break;
			case 2: nt.x = 2; nt.y = 1; nt.z = 1; break;
			case 4: nt.x = 2; nt.y = 2; nt.z = 1; break;
			case 6: nt.x = 3; nt.y = 2; nt.z = 1; break;
			case 8: nt.x = 2; nt.y = 2; nt.z = 2; break;
			case 16: nt.x = 4; nt.y = 2; nt.z = 2; break;
			case 32: nt.x = 4; nt.y = 4; nt.z = 2; break;
			case 64: nt.x = 4; nt.y = 4; nt.z = 4; break;
			case 128: nt.x = 8; nt.y = 4; nt.z = 4; break;
			case 256: nt.x = 8; nt.y = 8; nt.z = 4; break;
			case 512: nt.x = 8; nt.y = 8; nt.z = 8; break;
			default: assert(false);
		}

		const Distribute::int3 nt_glb(nt);
		const pBoundary pglobal_domain(pfloat3(0.0), pfloat3(Len3));
		distribute_glb.set(nproc, nt, pglobal_domain);

		for (int k = 0; k < 5; k++)
			distribute_data(true, false);

		const int nloc_reserve = (int)(2.0*global_n/nproc);
		fit_reserve_vec(ptcl,      nloc_reserve);
		fit_reserve_vec(ptcl_ppos, nloc_reserve);
		fit_reserve_vec(U,         nloc_reserve);
		fit_reserve_vec(dU,        nloc_reserve);
		fit_reserve_vec(Wgrad,     nloc_reserve);
		fit_reserve_vec(gradPsi,   nloc_reserve);
		fit_reserve_vec(cells,     nloc_reserve);

		MPI_Barrier(MPI_COMM_WORLD);

		fprintf(stderr, " *** proc= %d : local_n= %d  global_n= %d \n", myproc, local_n, global_n);
		fprintf(stderr, " proc= %d  building_mesh \n", myproc);

		MPI_Barrier(MPI_COMM_WORLD);



		const double t10 = mytimer::get_wtime();
		clear_mesh();
		int nattempt = build_mesh(true);
		double dt10 = mytimer::get_wtime() - t10;

		double volume_loc = 0.0;
		{
			std::vector<TREAL> v(local_n);
			for (int i = 0; i < local_n; i++)
				v[i] = cells[i].Volume;
			std::sort(v.begin(), v.end());  // sort volumes from low to high, to avoid roundoff errors
			for (int i = 0; i < local_n; i++)
				volume_loc += v[i];
		}


		double dt10max;
		MPI_Allreduce(&dt10, &dt10max, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
		double volume_glob = 0.0;	
		int    nattempt_max, nattempt_min;
		MPI_Allreduce(&volume_loc, &volume_glob,  1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
		MPI_Allreduce(&nattempt,   &nattempt_max, 1, MPI_INT,    MPI_MAX, MPI_COMM_WORLD);
		MPI_Allreduce(&nattempt,   &nattempt_min, 1, MPI_INT,    MPI_MIN, MPI_COMM_WORLD);

		const double volume_exact = global_domain_size.x*global_domain_size.y*global_domain_size.z;
		if (myproc == 0)
		{
			fprintf(stderr, "first call build_mesh:[ %g  sec ::  %g cells/s/proc/thread ]\n",
					dt10max,
					global_n/nproc/dt10max);
			fprintf(stderr, "   computed_volume= %g  exact_volume= %g diff= %g [ %g ]  nattempt= %d %d \n",
					volume_glob, volume_exact, 
					volume_glob - volume_exact,	(volume_glob - volume_exact)/volume_exact,
					nattempt_min, nattempt_max);
		}

		exchange_ptcl();

	}
	/**
	 * Finish the set-up of the MHD turbulence problem.
	 *
	 * The values already present in U are completed (PSI and scalars), copied
	 * into Wgrad and converted to conservative form with the cell volumes.
	 * Afterwards the active-particle list, particle velocities, gradients and
	 * time steps are computed and the scheduler lists are rebuilt.
	 *
	 * \param init  not used by this implementation
	 */
	void system::set_problem(const bool init) 
	{
		if (myproc == 0)
		{
			fprintf(stderr, " ********* Setting up MHD Turbulence ************* \n");
		}

		// Leave 25% head-room in the fluid arrays, then size them to local_n.
		const int reserve_n = (int)(1.25*local_n);
		U.reserve(reserve_n);
		U.resize(local_n);
		dU.reserve(reserve_n);
		dU.resize(local_n);
		Wgrad.reserve(reserve_n);
		Wgrad.resize(local_n);

		courant_no = 0.4;
		gamma_gas = 1.0;

		for (int ip = 0; ip < local_n; ++ip) 
		{
			assert(U[ip][Fluid::DENS] > 0.0);

			dU[ip] = Fluid(0.0);

			// Complete the state held in U.
			U[ip][Fluid::PSI ] = 0.0;
			for (int is = 0 ; is < Fluid::NSCALARS; ++is)
			{
				U[ip].scal(is) = 1.0;
			}

			// Copy that state into Wgrad before U is converted.
			Wgrad[ip] = 0.0;
			for (int iv = 0; iv < Fluid::NFLUID; ++iv)
			{
				Wgrad[ip].m[iv] = U[ip][iv];
			}

			U[ip] = U[ip].to_conservative(cells[ip].Volume);
			ptcl[ip].Volume = cells[ip].Volume;
		}
		isoeos_flag = true;
		entropy_scalar = -1;

		MPI_Barrier(MPI_COMM_WORLD);
		if (myproc == 0)
		{
			fprintf(stderr , " pvel ... \n");
		}
		get_active_ptcl(true);

		MPI_Barrier(MPI_COMM_WORLD);
		if (myproc == 0)
		{
			fprintf(stderr , " primitives ... \n");
		}
		exchange_primitive_and_wdot();

		MPI_Barrier(MPI_COMM_WORLD);
		compute_pvel();
		exchange_pvel();

		MPI_Barrier(MPI_COMM_WORLD);
		if (myproc == 0)
		{
			fprintf(stderr , " tgradients ... \n");
		}
		compute_tgradient();

		if (myproc == 0)
		{
			fprintf(stderr , " timestep... \n");
		}
		compute_timesteps(true);

		// Raise the first rung entry of every particle by three.
		for (int ip = 0; ip < local_n; ++ip)
		{
			ptcl[ip].rung[0] += 3;
		}

		// Rebuild the scheduler lists from those rungs.
		all_active = true;
		scheduler.flush_list();
		for (int ip = 0; ip < local_n; ++ip)
		{
			scheduler.push_particle(ip, (int)ptcl[ip].rung[0]);
		}

		MPI_Barrier(MPI_COMM_WORLD);
		// The mesh is kept only in the eulerian case.
		if (!eulerian)
		{
			clear_mesh();
		}

		if (myproc == 0) fprintf(stderr, " proc= %d: complete problem setup \n", myproc);
		MPI_Barrier(MPI_COMM_WORLD);


	}