Example No. 1
void Context::clear() {
  TimerStart("clear");
#ifdef _CTXH
  ObjectMap::Entry *entry = mVars.first();
  while (entry) {
    if (entry->second) {
      if (entry->second->refCount() <= 0) {
        std::ostringstream oss;
        oss << "*** Object in context has already been deleted";
        throw std::runtime_error(oss.str());
      }
      entry->second->decRef();
    }
    entry = mVars.next();
  }
#else
  ObjectMap::iterator it = mVars.begin();
  while (it != mVars.end()) {
    if (it->second) {
      if (it->second->refCount() <= 0) {
        std::ostringstream oss;
        oss << "*** Object \"" << it->first << "\" in context has already been deleted";
        throw std::runtime_error(oss.str());
      }
      it->second->decRef();
    }
    ++it;
  }
#endif
  mVars.clear();
  TimerEnd("clear");
}
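
The TimerStart("clear")/TimerEnd("clear") pair above is unbalanced on the error path: the throw skips TimerEnd. A minimal sketch of one way to keep the pair balanced, assuming only the TimerStart/TimerEnd(const char*) interface used in this example (the guard itself is not part of the original code):

struct ScopedTimer {
  const char *mLabel;                                     // label forwarded to the timer API
  explicit ScopedTimer(const char *label) : mLabel(label) { TimerStart(mLabel); }
  ~ScopedTimer() { TimerEnd(mLabel); }                    // fires on normal exit and on exceptions
  ScopedTimer(const ScopedTimer&) = delete;               // a guard should not be copied
  ScopedTimer& operator=(const ScopedTimer&) = delete;
};

// Hypothetical usage mirroring Context::clear():
//   void Context::clear() {
//     ScopedTimer timer("clear");
//     /* ... decRef loop, may throw ... */
//     mVars.clear();
//   }  // TimerEnd("clear") runs here even if the loop throws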
Example No. 2
int SystemDestruct(void)
{
  save_paper();             //By zjh

  SetIntSign();         /* cli: disable create message */
  TimerEnd();
  FontEnd();
  CloseCache();
  PageFinish();
  MouseDestruct();
  UnlockMouseMemory();

  WindowEnd();
  ItemFinish();
  ChineseLibDone();
  GraphFinish();
  HandleFinish();

  WriteDefaultScreenMode();
  ReturnOK();
}
Example No. 3
/*! This function is the driver routine for the calculation of hydrodynamical
 *  force and rate of change of entropy due to shock heating for all active
 *  particles.
 */
void hydro_force(void)
{
  TimerBeg(90);
  long long ntot, ntotleft;
  int i, j, k, n, ngrp, maxfill, source, ndone;
  int *nbuffer, *noffset, *nsend_local, *nsend, *numlist, *ndonelist;
  int level, sendTask, recvTask, nexport, place;
  double soundspeed_i;
  double tstart, tend, sumt, sumcomm;
  double timecomp = 0, timecommsumm = 0, timeimbalance = 0, sumimbalance;
  MPI_Status status;


#ifdef PERIODIC
  boxSize = All.BoxSize;
  boxHalf = 0.5 * All.BoxSize;
#ifdef LONG_X
  boxHalf_X = boxHalf * LONG_X;
  boxSize_X = boxSize * LONG_X;
#endif
#ifdef LONG_Y
  boxHalf_Y = boxHalf * LONG_Y;
  boxSize_Y = boxSize * LONG_Y;
#endif
#ifdef LONG_Z
  boxHalf_Z = boxHalf * LONG_Z;
  boxSize_Z = boxSize * LONG_Z;
#endif
#endif

  if(All.ComovingIntegrationOn)
    {
      /* Factors for comoving integration of hydro */
      hubble_a = All.Omega0 / (All.Time * All.Time * All.Time)
	+ (1 - All.Omega0 - All.OmegaLambda) / (All.Time * All.Time) + All.OmegaLambda;

      hubble_a = All.Hubble * sqrt(hubble_a);
      hubble_a2 = All.Time * All.Time * hubble_a;

      fac_mu = pow(All.Time, 3 * (GAMMA - 1) / 2) / All.Time;

      fac_egy = pow(All.Time, 3 * (GAMMA - 1));

      fac_vsic_fix = hubble_a * pow(All.Time, 3 * GAMMA_MINUS1);

      a3inv = 1 / (All.Time * All.Time * All.Time);
      atime = All.Time;
    }
  else
    hubble_a = hubble_a2 = atime = fac_mu = fac_vsic_fix = a3inv = fac_egy = 1.0;


  /* `NumSphUpdate' gives the number of particles on this processor that want a force update */
  for(n = 0, NumSphUpdate = 0; n < N_gas; n++)
    {
      if(P[n].Ti_endstep == All.Ti_Current)
	NumSphUpdate++;
    }

  numlist = malloc(NTask * sizeof(int) * NTask);
  MPI_Allgather(&NumSphUpdate, 1, MPI_INT, numlist, 1, MPI_INT, MPI_COMM_WORLD);
  for(i = 0, ntot = 0; i < NTask; i++)
    ntot += numlist[i];
  free(numlist);


  noffset = malloc(sizeof(int) * NTask);	/* offsets of bunches in common list */
  nbuffer = malloc(sizeof(int) * NTask);
  nsend_local = malloc(sizeof(int) * NTask);
  nsend = malloc(sizeof(int) * NTask * NTask);
  ndonelist = malloc(sizeof(int) * NTask);

  i = 0;			/* first particle for this task */
  ntotleft = ntot;		/* particles left for all tasks together */

	///////////////// GX //////////////////////
	FUN_MESSAGE(2,"hydro_force()");
	#ifdef CUDA_GX_NO_SPH_SUPPORT
		int oldcudamode=s_gx.cudamode;
		s_gx.cudamode=0;
	#endif
	double starttime,subtime=-1,cpytime=-1;
	const int Np=PrintInfoInitialize(N_gas,s_gx.cudamode,1);
	int iter=0;
	///////////////// GX //////////////////////

  while(ntotleft > 0)
    {
	///////////////// GX //////////////////////
	if (s_gx.cudamode!=0 && i!=0) ERROR("cuda mode does not support iterations in the hydro calc; try increasing the 'BufferSize' in the parameter file to circumvent this problem");
	iter++;
	///////////////// GX //////////////////////

	for(j = 0; j < NTask; j++)
	nsend_local[j] = 0;

      /* do local particles and prepare export list */
	TimerBeg(91);
	TimerBeg(93);
	starttime=GetTime();

	tstart = second();
if (s_gx.cudamode==0 || (Np!=N_gas || Np<MIN_SPH_PARTICLES_FOR_GPU_GX)) {
//if (s_gx.cudamode==0 || Np<MIN_SPH_PARTICLES_FOR_GPU_GX) {

		#ifdef CUDA_GX_CHUNCK_MANAGER_SPH
			ReLaunchChunkManager();
		#endif

		for(nexport = 0, ndone = 0; i < N_gas && nexport < All.BunchSizeHydro - NTask; i++)
		if(P[i].Ti_endstep == All.Ti_Current)
		{
			ndone++;

			for(j = 0; j < NTask; j++)
				Exportflag[j] = 0;

			hydro_evaluate(i, 0);

			TimerUpdateCounter(91,1);

			for(j = 0; j < NTask; j++)
			{
				if(Exportflag[j])
				{
					for(k = 0; k < 3; k++)
					{
						HydroDataIn[nexport].Pos[k] = P[i].Pos[k];
						HydroDataIn[nexport].Vel[k] = SphP[i].VelPred[k];
					}
					HydroDataIn[nexport].Hsml = SphP[i].Hsml;
					HydroDataIn[nexport].Mass = P[i].Mass;
					HydroDataIn[nexport].DhsmlDensityFactor = SphP[i].DhsmlDensityFactor;
					HydroDataIn[nexport].Density = SphP[i].Density;
					HydroDataIn[nexport].Pressure = SphP[i].Pressure;
					HydroDataIn[nexport].Timestep = P[i].Ti_endstep - P[i].Ti_begstep;

					/* calculation of F1 */
					soundspeed_i = sqrt(GAMMA * SphP[i].Pressure / SphP[i].Density);
					HydroDataIn[nexport].F1 = fabs(SphP[i].DivVel) /
						(fabs(SphP[i].DivVel) + SphP[i].CurlVel +
						0.0001 * soundspeed_i / SphP[i].Hsml / fac_mu);

					HydroDataIn[nexport].Index = i;
					HydroDataIn[nexport].Task = j;
					nexport++;
					nsend_local[j]++;
				}
			}
		}
		#ifdef CUDA_GX_CHUNCK_MANAGER_SPH
			ManageChuncks(1);
		#endif
	} else {
		///////////////// GX //////////////////////

		cpytime=GetTime();

		ASSERT_GX(s_gx.cudamode>0);
		if (i!=0) ERROR("cuda mode does not support iterations in the hydro calc; try increasing the 'BufferSize' in the parameter file to circumvent this problem");

		const int Np2=InitializeHydraCalculation_gx(NumPart,P,SphP,N_gas,hubble_a2, fac_mu, fac_vsic_fix
			#ifdef PERIODIC
				,boxSize,boxHalf
			#endif
		);

		if (Np2==0) WARNING("no sph particles participate in this timestep");
		ASSERT_GX( Np2==Np );

		cpytime = GetTime()-cpytime;
		subtime=GetTime();

		hydro_evaluate_range_cuda_gx(0,N_gas,s_gx,p_gx,h_gx);
		subtime=GetTime()-subtime;

		for(nexport = 0, ndone = 0; i < N_gas && nexport < All.BunchSizeHydro - NTask; i++)
		if(P[i].Ti_endstep == All.Ti_Current)
		{
			ndone++;

			for(j = 0; j < NTask; j++)
				Exportflag[j] = 0;

			ASSERT_GX( P[i].Type==0 );
			//hydro_evaluate_cuda_gx(i, 0,&s_gx,&p_gx);

			TimerUpdateCounter(91,1);

			ASSERT_GX(i<s_gx.sz_result_hydro);
			const struct result_hydro_gx r=s_gx.result_hydro[i];
			ASSERT_GX( isResultHydraDataOK(r,__FILE__,__LINE__) );

			for(k = 0; k < 3; k++) SphP[i].HydroAccel[k] = r.Acc[k];
			SphP[i].DtEntropy = r.DtEntropy;
			SphP[i].MaxSignalVel = r.MaxSignalVel;

			if (s_gx.NTask>1){
				for(j = 0; j < NTask; j++)
				{
					const char export_this=GetExportflag_gx(&s_gx,i,NTask,j);
					if(export_this)
					{
						for(k = 0; k < 3; k++)
						{
							HydroDataIn[nexport].Pos[k] = P[i].Pos[k];
							HydroDataIn[nexport].Vel[k] = SphP[i].VelPred[k];
						}
						HydroDataIn[nexport].Hsml = SphP[i].Hsml;
						HydroDataIn[nexport].Mass = P[i].Mass;
						HydroDataIn[nexport].DhsmlDensityFactor = SphP[i].DhsmlDensityFactor;
						HydroDataIn[nexport].Density = SphP[i].Density;
						HydroDataIn[nexport].Pressure = SphP[i].Pressure;
						HydroDataIn[nexport].Timestep = P[i].Ti_endstep - P[i].Ti_begstep;

						// calculation of F1
						soundspeed_i = sqrt(GAMMA * SphP[i].Pressure / SphP[i].Density);
						HydroDataIn[nexport].F1 = fabs(SphP[i].DivVel) /
							(fabs(SphP[i].DivVel) + SphP[i].CurlVel +
							0.0001 * soundspeed_i / SphP[i].Hsml / fac_mu);

						HydroDataIn[nexport].Index = i;
						HydroDataIn[nexport].Task = j;
						nexport++;
						nsend_local[j]++;
					}
				}
			}
		}
		///////////////// GX //////////////////////
	}
	TimerEnd(93);

      tend = second();
      timecomp += timediff(tstart, tend);

	///////////////// GX //////////////////////
	PrintInfoFinalize(s_gx,ndone,Np,starttime,cpytime,subtime,1,iter,-1,0,0,nexport,0,0,0);
	subtime=-1;
	///////////////// GX //////////////////////

      qsort(HydroDataIn, nexport, sizeof(struct hydrodata_in), hydro_compare_key);

      for(j = 1, noffset[0] = 0; j < NTask; j++)
	noffset[j] = noffset[j - 1] + nsend_local[j - 1];

      tstart = second();

      MPI_Allgather(nsend_local, NTask, MPI_INT, nsend, NTask, MPI_INT, MPI_COMM_WORLD);

      tend = second();
      timeimbalance += timediff(tstart, tend);

	TimerEnd(91);
	TimerBeg(92);

      /* now do the particles that need to be exported */

      for(level = 1; level < (1 << PTask); level++)
	{
	  tstart = second();
	  for(j = 0; j < NTask; j++)
	    nbuffer[j] = 0;
	  for(ngrp = level; ngrp < (1 << PTask); ngrp++)
	    {
	      maxfill = 0;
	      for(j = 0; j < NTask; j++)
		{
		  if((j ^ ngrp) < NTask)
		    if(maxfill < nbuffer[j] + nsend[(j ^ ngrp) * NTask + j])
		      maxfill = nbuffer[j] + nsend[(j ^ ngrp) * NTask + j];
		}
	      if(maxfill >= All.BunchSizeHydro)
		break;

	      sendTask = ThisTask;
	      recvTask = ThisTask ^ ngrp;

	      if(recvTask < NTask)
		{
		  if(nsend[ThisTask * NTask + recvTask] > 0 || nsend[recvTask * NTask + ThisTask] > 0)
		    {
		      /* get the particles */
		      MPI_Sendrecv(&HydroDataIn[noffset[recvTask]],
				   nsend_local[recvTask] * sizeof(struct hydrodata_in), MPI_BYTE,
				   recvTask, TAG_HYDRO_A,
				   &HydroDataGet[nbuffer[ThisTask]],
				   nsend[recvTask * NTask + ThisTask] * sizeof(struct hydrodata_in), MPI_BYTE,
				   recvTask, TAG_HYDRO_A, MPI_COMM_WORLD, &status);
		    }
		}

	      for(j = 0; j < NTask; j++)
		if((j ^ ngrp) < NTask)
		  nbuffer[j] += nsend[(j ^ ngrp) * NTask + j];
	    }
	  tend = second();
	  timecommsumm += timediff(tstart, tend);


	  /* now do the imported particles */
	  tstart = second();

	///////////////// GX //////////////////////
	// Do exported particles on the CPU/GPU
	TimerBeg(94);
	{
		AssertsOnhasGadgetDataBeenModified_gx(1,1,0);

		#if CUDA_DEBUG_GX>1
			MESSAGE("INFO: DistRMSGrav=%g",DistRMSGravdata(nbuffer[ThisTask],GravDataGet));
		#endif

		starttime=GetTime();
		const int N=nbuffer[ThisTask];

		if (N>0){
// YYY NOTE: disable GPU exportmode for now!!!
			if (1 || s_gx.cudamode==0 || N<MIN_SPH_PARTICLES_FOR_GPU_GX || Np<MIN_SPH_PARTICLES_FOR_GPU_GX) {
				for(j = 0; j < nbuffer[ThisTask]; j++)
					hydro_evaluate(j, 1);
			} else {
				cpytime=GetTime();

				InitializeHydraExportCalculation_gx(N,HydroDataGet);

				subtime=GetTime();
				hydro_evaluate_range_cuda_gx(1,N,s_gx,p_gx,h_gx);

				subtime=GetTime()-subtime;
				FinalizeHydraExportCalculation_gx(N);

				cpytime=GetTime()-cpytime-subtime;
			}

			PrintInfoFinalize(s_gx,0,N,starttime,cpytime,subtime,3,iter,level,0,0,nexport,0,0,0);
			subtime=-1;
		}
	}
	TimerEnd(94);
	///////////////// GX //////////////////////

	  tend = second();
	  timecomp += timediff(tstart, tend);


	  /* do a block to measure imbalance */
	  TimerBeg(95);
	  tstart = second();
	  MPI_Barrier(MPI_COMM_WORLD);
	  tend = second();
	  timeimbalance += timediff(tstart, tend);
	  TimerEnd(95);

	  /* get the result */
	  tstart = second();
	  for(j = 0; j < NTask; j++)
	    nbuffer[j] = 0;
	  for(ngrp = level; ngrp < (1 << PTask); ngrp++)
	    {
	      maxfill = 0;
	      for(j = 0; j < NTask; j++)
		{
		  if((j ^ ngrp) < NTask)
		    if(maxfill < nbuffer[j] + nsend[(j ^ ngrp) * NTask + j])
		      maxfill = nbuffer[j] + nsend[(j ^ ngrp) * NTask + j];
		}
	      if(maxfill >= All.BunchSizeHydro)
		break;

	      sendTask = ThisTask;
	      recvTask = ThisTask ^ ngrp;

	      if(recvTask < NTask)
		{
		  if(nsend[ThisTask * NTask + recvTask] > 0 || nsend[recvTask * NTask + ThisTask] > 0)
		    {
		      /* send the results */
		      MPI_Sendrecv(&HydroDataResult[nbuffer[ThisTask]],
				   nsend[recvTask * NTask + ThisTask] * sizeof(struct hydrodata_out),
				   MPI_BYTE, recvTask, TAG_HYDRO_B,
				   &HydroDataPartialResult[noffset[recvTask]],
				   nsend_local[recvTask] * sizeof(struct hydrodata_out),
				   MPI_BYTE, recvTask, TAG_HYDRO_B, MPI_COMM_WORLD, &status);

		      /* add the result to the particles */
		      for(j = 0; j < nsend_local[recvTask]; j++)
			{
			  source = j + noffset[recvTask];
			  place = HydroDataIn[source].Index;

			  for(k = 0; k < 3; k++)
			    SphP[place].HydroAccel[k] += HydroDataPartialResult[source].Acc[k];

			  SphP[place].DtEntropy += HydroDataPartialResult[source].DtEntropy;

			  if(SphP[place].MaxSignalVel < HydroDataPartialResult[source].MaxSignalVel)
			    SphP[place].MaxSignalVel = HydroDataPartialResult[source].MaxSignalVel;
			}
		    }
		}

	      for(j = 0; j < NTask; j++)
		if((j ^ ngrp) < NTask)
		  nbuffer[j] += nsend[(j ^ ngrp) * NTask + j];
	    }
	  tend = second();
	  timecommsumm += timediff(tstart, tend);

	  level = ngrp - 1;
	}
	TimerEnd(92);

      MPI_Allgather(&ndone, 1, MPI_INT, ndonelist, 1, MPI_INT, MPI_COMM_WORLD);
      for(j = 0; j < NTask; j++)
	ntotleft -= ndonelist[j];
    }


  free(ndonelist);
  free(nsend);
  free(nsend_local);
  free(nbuffer);
  free(noffset);

  /* do final operations on results */
  tstart = second();

  for(i = 0; i < N_gas; i++)
    if(P[i].Ti_endstep == All.Ti_Current)
      {
	SphP[i].DtEntropy *= GAMMA_MINUS1 / (hubble_a2 * pow(SphP[i].Density, GAMMA_MINUS1));
#ifdef SPH_BND_PARTICLES
	if(P[i].ID == 0)
	  {
	    SphP[i].DtEntropy = 0;
	    for(k = 0; k < 3; k++)
	      SphP[i].HydroAccel[k] = 0;
	  }
#endif
      }

  tend = second();
  timecomp += timediff(tstart, tend);

  /* collect some timing information */

  MPI_Reduce(&timecomp, &sumt, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
  MPI_Reduce(&timecommsumm, &sumcomm, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
  MPI_Reduce(&timeimbalance, &sumimbalance, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

  if(ThisTask == 0)
    {
      All.CPU_HydCompWalk += sumt / NTask;
      All.CPU_HydCommSumm += sumcomm / NTask;
      All.CPU_HydImbalance += sumimbalance / NTask;
    }

   TimerEnd(90);

	#ifdef RESULT_FILE_DUMP_GX
		static FILE* resultfile=NULL;

		if (resultfile==NULL) {
			char filename[256];
			sprintf(filename,"resultfile.%d.%d.txt",s_gx.cudamode,ThisTask);
			resultfile=fopen(filename,"w");
		}
		else{
			MESSAGE("Dumping result...");
			static int MM=0;
			int j;
			fprintf(resultfile,"Dumping result...N_gas=%d\n",N_gas);
			for(j=0;j<N_gas;++j)
			if(P[j].Ti_endstep == All.Ti_Current){
				static int NN=0;
				const int target=j;
				fprintf(resultfile,"m=0, NN=%6d, t=%6d, e=%.4g, v=%.4g, acc={%.2g,%.2g,%.2g}\n",NN++,target,SphP[target].DtEntropy,SphP[target].MaxSignalVel,SphP[target].HydroAccel[0],SphP[target].HydroAccel[1],SphP[target].HydroAccel[2]);
				fflush(resultfile);
			}
			if (++MM>2) exit(-42);
		}
	#endif

	#ifdef CUDA_GX_NO_SPH_SUPPORT
		s_gx.cudamode=oldcudamode;
	#endif

	//MESSAGE("%6.2f, %6.2f, %6.2f, %6.2f, %6.2f  -  %5.1f, %5.1f, %5.1f, %5.1f %c sph timers d 90,93,94,95,net",TimerGet(90),TimerGet(93),TimerGet(94),TimerGet(95),TimerGet(90)-TimerGet(93)-TimerGet(94),100.0*TimerGet(93)/TimerGet(90),100.0*TimerGet(94)/TimerGet(90),100.0*TimerGet(95)/TimerGet(90),100.0*(TimerGet(90)-TimerGet(93)-TimerGet(94))/TimerGet(90),'%');
	//MESSAGE("%6.2f, %6.2f, %6.2f, %6.2f, %6.2f  -  %5.1f, %5.1f, %5.1f, %5.1f %c sph timers a 90,93,94,95,net",TimerGetAccumulated(90),TimerGetAccumulated(93),TimerGetAccumulated(94),TimerGetAccumulated(95),TimerGetAccumulated(90)-TimerGetAccumulated(93)-TimerGetAccumulated(94),100.0*TimerGetAccumulated(93)/TimerGetAccumulated(90),100.0*TimerGetAccumulated(94)/TimerGetAccumulated(90),100.0*TimerGetAccumulated(95)/TimerGetAccumulated(90),100.0*(TimerGetAccumulated(90)-TimerGetAccumulated(93)-TimerGetAccumulated(94))/TimerGetAccumulated(90),'%');
}
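
Both the export loop and the result-return loop in hydro_force pair tasks with the bitwise pattern recvTask = ThisTask ^ ngrp. A self-contained sketch (standalone, not part of GADGET) of the schedule this produces, where each pair of tasks meets exactly once per sweep over ngrp:

#include <stdio.h>

/* Print the pairwise exchange schedule implied by recvTask = j ^ ngrp.
 * PTask is the smallest exponent with (1 << PTask) >= NTask, as in the
 * loops above; partners outside 0..NTask-1 are skipped. */
static void print_exchange_schedule(int NTask, int PTask)
{
  int ngrp, j;
  for(ngrp = 1; ngrp < (1 << PTask); ngrp++)
    for(j = 0; j < NTask; j++)
      if((j ^ ngrp) < NTask && j < (j ^ ngrp))   /* list each unordered pair once */
        printf("ngrp=%2d: task %d <-> task %d\n", ngrp, j, j ^ ngrp);
}

int main(void)
{
  print_exchange_schedule(6, 3);   /* e.g. 6 MPI tasks embedded in a 3-bit hypercube */
  return 0;
}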
Example No. 4
Object* Context::getVar(const Symbol &name, bool inherit) const {
  TimerStart("getVar");
#ifdef _CTXH
  Object *o = 0;
  if (mVars.getValue(name, o)) {
    if (o) {
      o->incRef();
    }
    TimerEnd("getVar");
    return o;
#else
  ObjectMap::const_iterator it = mVars.find(name);
  if (it != mVars.end()) {
    Object *o = it->second;
    if (o) {
      o->incRef();
    }
    TimerEnd("getVar");
    return o;
#endif
  } else {
    if (mParent && inherit) {
      Object *o = mParent->getVar(name);
      TimerEnd("getVar");
      return o;
    } else {
      TimerEnd("getVar");
      return 0;
    }
  }
}

Callable* Context::getCallable(const Symbol &name, bool inherit) const {
  TimerStart("getCallable");
#ifdef _CTXH
  Object *o = 0;
  if (mVars.getValue(name, o) && o && o->isCallable()) {
    o->incRef();
    TimerEnd("getCallable");
    return (Callable*) o;
#else
  ObjectMap::const_iterator it = mVars.find(name);
  if (it != mVars.end() && it->second && it->second->isCallable()) {
    Object *o = it->second;
    o->incRef();
    TimerEnd("getCallable");
    return (Callable*) o;
#endif
  } else {
    if (mParent && inherit) {
      Callable *c = mParent->getCallable(name);
      TimerEnd("getCallable");
      return c;
    } else {
      TimerEnd("getCallable");
      return 0;
    }
  }
}

void Context::toStream(std::ostream &os, const std::string &indent) const {
  TimerStart("toStream");
#ifdef _CTXH
  ObjectMap::KeyValueVector kv;
  size_t n = mVars.getPairs(kv);
  for (size_t i=0; i<n; ++i) {
    os << indent << "\"" << kv[i].first << "\" = ";
    kv[i].second->toStream(os);
    os << std::endl;
  }
#else
  ObjectMap::const_iterator it = mVars.begin();
  while (it != mVars.end()) {
    os << indent << "\"" << it->first << "\" = ";
    it->second->toStream(os);
    os << std::endl;
    ++it;
  }
#endif
  if (mParent != 0) {
    os << indent << "From parent context:" << std::endl;
    mParent->toStream(os, indent+"  ");
  }
  TimerEnd("toStream");
}
Example No. 5
bool Context::hasVar(const Symbol &name, bool inherit) const {
  TimerStart("hasVar");
  // record the result so TimerEnd("hasVar") is reached on every path
  bool found;
#ifdef _CTXH
  if (mVars.hasKey(name)) {
#else
  if (mVars.find(name) != mVars.end()) {
#endif
    found = true;
  } else if (mParent && inherit) {
    found = mParent->hasVar(name, true);
  } else {
    found = false;
  }
  TimerEnd("hasVar");
  return found;
}

void Context::setVar(const Symbol &name, Object *v, bool inherit) {
  TimerStart("setVar");
#ifdef _CTXH
  ObjectMap::Entry *e = mVars.find(name);
  if (e) {
    if (e->second != v) {
      if (e->second) {
        e->second->decRef();
      }
      e->second = v;
    } else {
      TimerEnd("setVar");
      return;
    }
  } else {
    if (mParent && inherit && mParent->hasVar(name, true)) {
      mParent->setVar(name, v, true);
      TimerEnd("setVar");
      return;
    }
    mVars.insert(name, v);
  }
#else
  ObjectMap::iterator it = mVars.find(name);
  if (it != mVars.end()) {
    if (it->second != v) {
      if (it->second) {
        it->second->decRef();
      }
      it->second = v;
    } else {
      TimerEnd("setVar");
      return;
    }
  } else {
    if (mParent && inherit && mParent->hasVar(name, true)) {
      mParent->setVar(name, v, true);
      TimerEnd("setVar");
      return;
    }
    mVars[name] = v;
  }
#endif
  /*
  if (mParent && inherit && mParent->hasVar(name, true)) {
    mParent->setVar(name, v, true);
    TimerEnd("setVar");
    return;
  }
#ifdef _CTXH
  ObjectMap::Entry *e = mVars.find(name);
  if (e) {
    if (e->second != v) {
      if (e->second) {
        e->second->decRef();
      }
      e->second = v;
    } else {
      TimerEnd("setVar");
      return;
    }
  } else {
    mVars.insert(name, v);
  }
#else
  ObjectMap::iterator it = mVars.find(name);
  if (it != mVars.end()) {
    if (it->second != v) {
      if (it->second) {
        it->second->decRef();
      }
      it->second = v;
    } else {
      TimerEnd("setVar");
      return;
    }
  } else {
    mVars[name] = v;
  }
#endif
  */
  if (v) {
    v->incRef();
  }
  TimerEnd("setVar");
}
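
setVar() above keeps the reference counts balanced by releasing the old value only when a different object is being stored. A minimal sketch of that replace-and-rebalance step as a hypothetical helper (replaceRef is not part of the original source; it assumes only the Object::incRef/decRef interface already used above):

static void replaceRef(Object *&slot, Object *v) {
  if (slot == v) {
    return;                 // same object: counts already balance, nothing to do
  }
  if (v) {
    v->incRef();            // take the new reference first...
  }
  if (slot) {
    slot->decRef();         // ...then drop the old one
  }
  slot = v;
}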
Example No. 6
void GLWindow_Mainloop(void)
{
    USE_HIGH_PERFORMANCE_TIMER = EmulatorConfig.highperformancetimer;

    TimerInit();

    SCREEN_TEXTURE = calloc(256*224,4); //max possible size
    int scr_texture_loaded = 0;
    GLuint scr_texture;

    //do { GBA_RunFor(1); } while(GBA_MemoryReadFast16(CPU.R[R_PC]) != 0xDF05); //swi 0x05
    //CPU.R[R_PC] = 0x00000000;
    //do { GBA_RunFor(1); } while(CPU.R[R_PC] != 0x080002B0); GLWindow_GBACreateDissasembler();

    while(1)
    {
        if(GLWindow_HandleEvents()) break;

        if(GLWindow_Active && (PAUSED == 0))
        {
            if(RUNNING == RUN_GBA)
            {
                GLWindow_GBADisassemblerStartAddressSetDefault();

                GLWindow_GBAHandleInput();
                GBA_CheckKeypadInterrupt();

                GBA_RunFor(280896); //clocksperframe = 280896

                if(Keys_Down[VK_SPACE])
                {
                    GBA_SetFrameskip(10);
                    GBA_SoundResetBufferPointers();
                }
                else GBA_SetFrameskip(FRAMESKIP);

                if(GBA_HasToSkipFrame()==0)
                {
                    GBA_ConvertScreenBufferTo32RGB(SCREEN_TEXTURE);

                    glClear(GL_COLOR_BUFFER_BIT); //Clear screen

                    if(scr_texture_loaded) glDeleteTextures(1,&scr_texture);
                    scr_texture_loaded = 1;

                    glGenTextures(1,&scr_texture);
                    glBindTexture(GL_TEXTURE_2D,scr_texture);
                    if(EmulatorConfig.oglfilter)
                    {
                        glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MIN_FILTER,GL_LINEAR);
                        glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MAG_FILTER,GL_LINEAR);
                    }
                    else
                    {
                        glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MIN_FILTER,GL_NEAREST);
                        glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MAG_FILTER,GL_NEAREST);
                    }
                    glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_WRAP_S,GL_CLAMP);
                    glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_WRAP_T,GL_CLAMP);

                    glTexImage2D(GL_TEXTURE_2D,0,4,240,160, 0,GL_RGBA,GL_UNSIGNED_BYTE,SCREEN_TEXTURE);

                    glBindTexture(GL_TEXTURE_2D,scr_texture);
                    glColor3f(1.0,1.0,1.0);
                    glBegin( GL_QUADS );
                        glTexCoord2f(0,0); // Top-left vertex
                        glVertex3f(0,0,0);

                        glTexCoord2f(1,0); // Bottom-left vertex
                        glVertex3f(240,0,0);

                        glTexCoord2f(1,1); // Bottom-right vertex
                        glVertex3f(240,160,0);

                        glTexCoord2f(0,1); // Top-right vertex
                        glVertex3f(0,160,0);
                    glEnd();

                    GLWindow_SwapBuffers();
                }

                GBA_UpdateFrameskip();

                //GLWindow_MemViewerUpdate();
                //GLWindow_IOViewerUpdate();
                //GLWindow_DisassemblerUpdate();

                TimerWait(Keys_Down[VK_SPACE] == 0);
            }
            else
            {
                glClear(GL_COLOR_BUFFER_BIT); //Clear screen
                GLWindow_SwapBuffers();
                Sleep(100);
            }
        }
        else
        {
            Sleep(1); // Allow the CPU to rest a bit :P
        }
/*
if(RUNNING == RUN_GBA)
{
        if(Keys_Down['Q']) { int k = 10; while(k--) GLWindow_GBADisassemblerStep(); GLWindow_GBADisassemblerUpdate(); }
        if(Keys_Down['W']) { int k = 100; while(k--) GLWindow_GBADisassemblerStep(); GLWindow_GBADisassemblerUpdate(); }
        if(Keys_Down['R']) { int k = 1000; while(k--) GLWindow_GBADisassemblerStep(); GLWindow_GBADisassemblerUpdate(); }
}
*/
    }

    GLWindow_UnloadRom(1);

    if(scr_texture_loaded) glDeleteTextures(1,&scr_texture);

    TimerEnd();
}
Example No. 7
int main() {
//	char c[]="Hello World";
//	Queue* q = InitQueue();
	int i=0;
//	for (i=0;i<5;i++) {QueuePush(q,&(c[i]));}
//	for (i=0;i<5;i++) printf("%c",*(char*)QueuePop(q));
//	for (i=5;i<11;i++) {QueuePush(q,&(c[i]));}
//	for (i=0;i<5;i++) printf("%c",*(char*)QueuePop(q));
//
//	printf("%d\n",q->size);
//	DeleteQueue(q);

	IMG_Init(0);
	SDL_Surface* sf = IMG_Load("1.jpg");
	IMG_Quit();

	
	SDL_Init(SDL_INIT_EVERYTHING);
	SDL_Surface* screen = SDL_SetVideoMode( FRAME_WIDTH*2, FRAME_HEIGHT, 24, SDL_SWSURFACE );
	SDL_Rect r = {0,0,FRAME_WIDTH,FRAME_HEIGHT};
	SDL_Event event;
	char buf[256];
	Pixmap* px = PixmapFromSdlSurface(sf);
#if USE_BITFIELD
	printf("Using bitfield optimization ...\n");
	Pixmap* px2 = PixmapThresholding(px,0.2);
#else
	Pixmap* px2 = PixmapThresholdingSimple(px,0.2);
	SavePixmap(px2,"1.bmp");
#endif
	Pixmap* dup = PixmapCopy(px2);
	Timer* timer = TimerStart();
	for (i=0;i<1;i++) {
#if USE_BITFIELD
		PixmapErosion(px2,1);
#else
		PixmapErosionSimple(px2,1);
#endif
	}
	SavePixmap(px2,"2.bmp");
	printf("time taken: %fs\n",TimerEnd(timer));
//	SDL_BlitSurface(sf,&r,screen,NULL);
	SDL_Surface* res = SdlSurfaceFromPixmap(dup);
	SDL_BlitSurface(res,&r,screen,NULL);
	r.x=FRAME_WIDTH;
	res = SdlSurfaceFromPixmap(px2);
	SDL_BlitSurface(res,NULL,screen,&r);
	SDL_Flip(screen);

	while (1) {
		if (SDL_PollEvent(&event)) {
			if (event.type==SDL_QUIT) break;
		}
	}

	printf("Memory used: %s\n",GetMemoryRepr(buf,MemoryInfo()));
	DeletePixmap(px);
	DeletePixmap(px2);
//	SDL_FreeSurface(res);
//	SDL_FreeSurface(sf);
//	SDL_FreeSurface(screen);
	SDL_Quit();
	printf("Memory used: %s\n",GetMemoryRepr(buf,MemoryInfo()));


}
Example No. 8
/*! This function computes the gravitational forces for all active
 *  particles.  If needed, a new tree is constructed, otherwise the
 *  dynamically updated tree is used.  Particles are only exported to other
 *  processors when really needed, thereby allowing a good use of the
 *  communication buffer.
 */
void gravity_tree(void)
{
	int tim=20; // GX mod, timer to profile calls

	TimerBeg(29);
	TimerBeg(tim);

	long long ntot;
	int numnodes, nexportsum = 0;
	int i, j, iter = 0;
	int *numnodeslist, maxnumnodes, nexport, *numlist, *nrecv, *ndonelist;
	double tstart, tend, timetree = 0, timecommsumm = 0, timeimbalance = 0, sumimbalance;
	double ewaldcount;
	double costtotal, ewaldtot, *costtreelist, *ewaldlist;
	double maxt, sumt, *timetreelist, *timecommlist;
	double fac, plb, plb_max, sumcomm;

	#ifndef NOGRAVITY
		int *noffset, *nbuffer, *nsend, *nsend_local;
		long long ntotleft;
		int ndone,maxfill, ngrp;
		int k, place;
		int level, sendTask, recvTask;
		double ax, ay, az;
		MPI_Status status;
	#endif

	///////////////// GX //////////////////////
	int totdone=0;
	#if CUDA_DEBUG_GX>0
		int not_timestepped_gx=0;
		int exporthash_gx=0;
		int count_exported_gx=0;
	#endif
	///////////////// GX //////////////////////

	/* set new softening lengths */
	if(All.ComovingIntegrationOn)
		set_softenings();

	/* construct tree if needed */
	tstart = second();
	if(TreeReconstructFlag)
	{
		if(ThisTask == 0)
		printf("Tree construction.\n");

		force_treebuild(NumPart);

		TreeReconstructFlag = 0;

		if(ThisTask == 0)
		printf("Tree construction done.\n");
	}
	tend = second();
	All.CPU_TreeConstruction += timediff(tstart, tend);

	costtotal = ewaldcount = 0;

	/* Note: 'NumForceUpdate' has already been determined in find_next_sync_point_and_drift() */
	numlist = malloc(NTask * sizeof(int) * NTask);

	MPI_Allgather(&NumForceUpdate, 1, MPI_INT, numlist, 1, MPI_INT, MPI_COMM_WORLD);

	for(i = 0, ntot = 0; i < NTask; i++)
		ntot += numlist[i];
	free(numlist);

	#ifndef NOGRAVITY
	if(ThisTask == 0)
		printf("Begin tree force.\n");

	#ifdef SELECTIVE_NO_GRAVITY
		for(i = 0; i < NumPart; i++)
			if(((1 << P[i].Type) & (SELECTIVE_NO_GRAVITY)))
				P[i].Ti_endstep = -P[i].Ti_endstep - 1;
	#endif

	noffset = malloc(sizeof(int) * NTask);	/* offsets of bunches in common list */
	nbuffer = malloc(sizeof(int) * NTask);
	nsend_local = malloc(sizeof(int) * NTask);
	nsend = malloc(sizeof(int) * NTask * NTask);
	ndonelist = malloc(sizeof(int) * NTask);

	i = 0;           /* begin with this index */
	ntotleft = ntot; /* particles left for all tasks together */

	TimerEnd(tim++);

	///////////////// GX //////////////////////
	// if (s_gx.cudamode>0 && All.MaxPart>1400000) TimersSleep(10); // GPU card runs hot on large sims, this is around N_p=1404928
	// if (s_gx.cudamode>0) TimersSleep(10);
	TimerBeg(tim);

	double starttime,subtime=-1,cpytime=-1;
	int Np=-1;
	int buffered=0;

	if(s_gx.cudamode>0)
	{
		FUN_MESSAGE(2,"gravity_tree()");

		TimerBeg(50);
		cpytime=GetTime();

		Np=InitializeProlog_gx(NumPart);

		TimerEnd(50);
		cpytime=GetTime()-cpytime;
	}
	///////////////// GX //////////////////////

	while(ntotleft > 0)
	{
		TimerBeg(31);
		starttime=GetTime();

		iter++;

		for(j = 0; j < NTask; j++)
			nsend_local[j] = 0;

		/* do local particles and prepare export list */
		tstart = second();

		if (s_gx.cudamode==0 || Np<MIN_FORCE_PARTICLES_FOR_GPU_GX) {
			ASSERT_GX( !buffered );

			ReLaunchChunkManager();

			for(nexport = 0, ndone = 0; i < NumPart && nexport < All.BunchSizeForce - NTask; i++) {
				if(P[i].Ti_endstep == All.Ti_Current)
				{
					ndone++;

					for(j = 0; j < NTask; j++)
						Exportflag[j] = 0;

					TimerUpdateCounter(31,1);
					#ifndef PMGRID
						costtotal += force_treeevaluate(i, 0, &ewaldcount);
					#else
						costtotal += force_treeevaluate_shortrange(i, 0 );
					#endif

					#if CUDA_DEBUG_GX>0
						int flagexported_gx=0;
					#endif
					for(j = 0; j < NTask; j++)
					{
						if(Exportflag[j])
						{
							ASSERT_GX( NTask>1 );
							#if CUDA_DEBUG_GX>0
								flagexported_gx=1;
								exporthash_gx += (i-j)*(j+ThisTask+1);
							#endif

							for(k = 0; k < 3; k++)
							GravDataGet[nexport].u.Pos[k] = P[i].Pos[k];
							#ifdef UNEQUALSOFTENINGS
								GravDataGet[nexport].Type = P[i].Type;
								#ifdef ADAPTIVE_GRAVSOFT_FORGAS
									if(P[i].Type == 0)
									GravDataGet[nexport].Soft = SphP[i].Hsml;
								#endif
							#endif
							GravDataGet[nexport].w.OldAcc = P[i].OldAcc;
							GravDataIndexTable[nexport].Task = j;
							GravDataIndexTable[nexport].Index = i;
							GravDataIndexTable[nexport].SortIndex = nexport;
							nexport++;
							nexportsum++;
							nsend_local[j]++;
						}
					}
					#if CUDA_DEBUG_GX>0
						if (flagexported_gx) ++count_exported_gx;
					#endif
				}
				#if CUDA_DEBUG_GX>0
					else ++not_timestepped_gx;
				#endif
			}
			ManageChuncks(0);
		} else {
			///////////////// GX //////////////////////
			// cudamode>0
			///////////////// GX //////////////////////
			#ifndef PMGRID
				// WARNING: attempting to run in tree-only mode, examine results carefully
				// ERROR: cannot run in non-PMGRID mode
			#endif

			if (iter==1){
				const double tx=GetTime();
				TimerBeg(51);

				ASSERT_GX(NumPart>=i);
				ASSERT_GX(!buffered);

				if (iter!=1) ERROR("cuda mode does not support iterations in the gravtree calc; try increasing the 'BufferSize' in the parameter file to circumvent this problem");

				const int Np2=InitializeCalculation_gx(NumPart,P,0);
				ASSERT_GX( Np2==Np );
				if (Np2==0) WARNING("no particles participate in this timestep");

				TimerEnd(51);

				cpytime += GetTime() - tx;
				subtime=GetTime();
				TimerBeg(52);

				force_treeevaluate_shortrange_range_gx(0, Np);
				buffered=1;

				TimerUpdateCounter(31,NumPart-i);
				TimerEnd(52);

				subtime = GetTime() - subtime;
			} else {
				cpytime=-1;
				subtime=-1;
				ASSERT_GX(buffered);
			}

			for(nexport = 0, ndone = 0; i < NumPart &&  nexport < All.BunchSizeForce - NTask; i++) {
				if(P[i].Ti_endstep == All.Ti_Current)
				{
					ndone++;

					ASSERT_GX( i<NumPart );
					ASSERT_GX( buffered );

					const struct result_gx r=GetTarget(totdone++,i); // s_gx.result[target];

					P[i].GravAccel[0] = r.acc_x;
					P[i].GravAccel[1] = r.acc_y;
					P[i].GravAccel[2] = r.acc_z;
					P[i].GravCost = r.ninteractions;
					costtotal += r.ninteractions;

					if (s_gx.NTask>1) {
						#if CUDA_DEBUG_GX>0
							int flagexported_gx=0;
						#endif
						for(j = 0; j < NTask; j++) {
							if (GetExportflag_gx(&s_gx,i,NTask,j)){
								ASSERT_GX( NTask>1 );
								#if CUDA_DEBUG_GX>0
									flagexported_gx=1;
									exporthash_gx += (i-j)*(j+ThisTask+1);
								#endif

								for(k = 0; k < 3; k++) GravDataGet[nexport].u.Pos[k] = P[i].Pos[k];
								#ifdef UNEQUALSOFTENINGS
								GravDataGet[nexport].Type = P[i].Type;
									#ifdef ADAPTIVE_GRAVSOFT_FORGAS
										if(P[i].Type == 0) GravDataGet[nexport].Soft = SphP[i].Hsml;
									#endif
								#endif
								GravDataGet[nexport].w.OldAcc = P[i].OldAcc;
								GravDataIndexTable[nexport].Task = j;
								GravDataIndexTable[nexport].Index = i;
								GravDataIndexTable[nexport].SortIndex = nexport;
								nexport++;
								nexportsum++;
								nsend_local[j]++;
							}
						}
						#if CUDA_DEBUG_GX>0
							if (flagexported_gx) ++count_exported_gx;
						#endif
					}
				}
				#if CUDA_DEBUG_GX>0
					else ++not_timestepped_gx;
				#endif
			}
			AssertsOnhasGadgetDataBeenModified_gx(0,1,0);
		}
		TimerEnd(31);

		///////////////// GX //////////////////////
		if (iter==1 || !buffered){
				PrintInfoFinalize(s_gx,ndone,Np,starttime,cpytime,subtime,0,iter,-1
				#if CUDA_DEBUG_GX>0
					,not_timestepped_gx,count_exported_gx,nexport,nexportsum,exporthash_gx,costtotal
				#else
					,0,0,0,0,0,0
				#endif
				);
			subtime=-1;
		}

		TimerBeg(39);
		///////////////// GX //////////////////////

		tend = second();
		timetree += timediff(tstart, tend);

		qsort(GravDataIndexTable, nexport, sizeof(struct gravdata_index), grav_tree_compare_key);

		for(j = 0; j < nexport; j++)
			GravDataIn[j] = GravDataGet[GravDataIndexTable[j].SortIndex];

		for(j = 1, noffset[0] = 0; j < NTask; j++)
			noffset[j] = noffset[j - 1] + nsend_local[j - 1];

		tstart = second();

		MPI_Allgather(nsend_local, NTask, MPI_INT, nsend, NTask, MPI_INT, MPI_COMM_WORLD);

		tend = second();
		timeimbalance += timediff(tstart, tend);

		/* now do the particles that need to be exported */

		for(level = 1; level < (1 << PTask); level++)
		{
			tstart = second();
			for(j = 0; j < NTask; j++)
				nbuffer[j] = 0;

			for(ngrp = level; ngrp < (1 << PTask); ngrp++)
			{
				maxfill = 0;
				for(j = 0; j < NTask; j++)
				{
					if((j ^ ngrp) < NTask)
						if(maxfill < nbuffer[j] + nsend[(j ^ ngrp) * NTask + j])
							maxfill = nbuffer[j] + nsend[(j ^ ngrp) * NTask + j];
				}
				if(maxfill >= All.BunchSizeForce)
				break;

				sendTask = ThisTask;
				recvTask = ThisTask ^ ngrp;

				if(recvTask < NTask)
				{
					if(nsend[ThisTask * NTask + recvTask] > 0 || nsend[recvTask * NTask + ThisTask] > 0)
					{
						/* get the particles */
						MPI_Sendrecv(&GravDataIn[noffset[recvTask]],
						nsend_local[recvTask] * sizeof(struct gravdata_in), MPI_BYTE,
						recvTask, TAG_GRAV_A,
						&GravDataGet[nbuffer[ThisTask]],
						nsend[recvTask * NTask + ThisTask] * sizeof(struct gravdata_in), MPI_BYTE,
						recvTask, TAG_GRAV_A, MPI_COMM_WORLD, &status);
					}
				}

				for(j = 0; j < NTask; j++)
					if((j ^ ngrp) < NTask)
						nbuffer[j] += nsend[(j ^ ngrp) * NTask + j];
			}
			tend = second();
			timecommsumm += timediff(tstart, tend);

			TimerBeg(30);
			TimerUpdateCounter(30,nbuffer[ThisTask]);

			tstart = second();
			///////////////// GX //////////////////////
			// Do exported particles on the CPU/GPU
			{
				AssertsOnhasGadgetDataBeenModified_gx(1,1,0);

				#if CUDA_DEBUG_GX>1
					MESSAGE("INFO: DistRMSGrav=%g",DistRMSGravdata(nbuffer[ThisTask],GravDataGet));
				#endif

				starttime=GetTime();
				const int N=nbuffer[ThisTask];

				if (N>0){
					if (s_gx.cudamode==0 || N<MIN_FORCE_PARTICLES_FOR_GPU_GX || Np<MIN_FORCE_PARTICLES_FOR_GPU_GX) {
						ReLaunchChunkManager();
						for(j = 0; j<N ; j++)
						{
							#ifndef PMGRID
								costtotal += force_treeevaluate(j, 1, &ewaldcount);
							#else
								costtotal += force_treeevaluate_shortrange(j, 1);
							#endif
						}
						ManageChuncks(0);
					} else {
						ASSERT_GX( buffered );

						cpytime=GetTime();
						InitializeExportCalculation_gx(N,P[0].Type);
						ASSERT_GX( N==s_gx.Np );

						subtime=GetTime();
						force_treeevaluate_shortrange_range_gx(1, N);
						subtime=GetTime()-subtime;

						costtotal += FinalizeExportCalculation_gx(N);
						cpytime=GetTime()-cpytime-subtime;

						ASSERT_GX( N==s_gx.Np );
					}

					PrintInfoFinalize(s_gx,0,N,starttime,cpytime,subtime,2,iter,level,0,0,nexport,0,0,0);
					subtime=-1;
				} else {
					ReLaunchChunkManager();
					ManageChuncks(0);
				}
			}
			///////////////// GX //////////////////////
			if (nbuffer[ThisTask]>0) TimerUpdateCounter(30,-1);
			TimerEnd(30);
			tend = second();
			timetree += timediff(tstart, tend);

			TimerBeg(33);
			tstart = second();

			MPI_Barrier(MPI_COMM_WORLD);
			tend = second();
			timeimbalance += timediff(tstart, tend);
			TimerEnd(33);

			/* get the result */
			tstart = second();
			for(j = 0; j < NTask; j++)
				nbuffer[j] = 0;
			for(ngrp = level; ngrp < (1 << PTask); ngrp++)
			{
				maxfill = 0;
				for(j = 0; j < NTask; j++)
				{
					if((j ^ ngrp) < NTask)
						if(maxfill < nbuffer[j] + nsend[(j ^ ngrp) * NTask + j])
						maxfill = nbuffer[j] + nsend[(j ^ ngrp) * NTask + j];
				}
				if(maxfill >= All.BunchSizeForce)
					break;

				sendTask = ThisTask;
				recvTask = ThisTask ^ ngrp;
				if(recvTask < NTask)
				{
					if(nsend[ThisTask * NTask + recvTask] > 0 || nsend[recvTask * NTask + ThisTask] > 0)
					{
						/* send the results */
						MPI_Sendrecv(&GravDataResult[nbuffer[ThisTask]],
						nsend[recvTask * NTask + ThisTask] * sizeof(struct gravdata_in),
						MPI_BYTE, recvTask, TAG_GRAV_B,
						&GravDataOut[noffset[recvTask]],
						nsend_local[recvTask] * sizeof(struct gravdata_in),
						MPI_BYTE, recvTask, TAG_GRAV_B, MPI_COMM_WORLD, &status);

						/* add the result to the particles */
						for(j = 0; j < nsend_local[recvTask]; j++)
						{
							place = GravDataIndexTable[noffset[recvTask] + j].Index;
// comment out in order to disable export forces for debugging
							for(k = 0; k < 3; k++)
								P[place].GravAccel[k] += GravDataOut[j + noffset[recvTask]].u.Acc[k];

							P[place].GravCost += GravDataOut[j + noffset[recvTask]].w.Ninteractions;
						}
					}
				}

				for(j = 0; j < NTask; j++)
					if((j ^ ngrp) < NTask)
						nbuffer[j] += nsend[(j ^ ngrp) * NTask + j];

			}
			tend = second();
			timecommsumm += timediff(tstart, tend);

			level = ngrp - 1;
		}

		MPI_Allgather(&ndone, 1, MPI_INT, ndonelist, 1, MPI_INT, MPI_COMM_WORLD);

		for(j = 0; j < NTask; j++)
			ntotleft -= ndonelist[j];

		TimerEnd(39);
	}

	TimerEnd(tim++);
	TimerBeg(tim);

	free(ndonelist);
	free(nsend);
	free(nsend_local);
	free(nbuffer);
	free(noffset);

	/* now add things for comoving integration */

	#ifndef PERIODIC
		#ifndef PMGRID
			if(All.ComovingIntegrationOn)
			{
				fac = 0.5 * All.Hubble * All.Hubble * All.Omega0 / All.G;

				for(i = 0; i < NumPart; i++)
					if(P[i].Ti_endstep == All.Ti_Current)
						for(j = 0; j < 3; j++)
							P[i].GravAccel[j] += fac * P[i].Pos[j];
			}
		#endif
	#endif

	for(i = 0; i < NumPart; i++)
		if(P[i].Ti_endstep == All.Ti_Current)
		{
			#ifdef PMGRID
				ax = P[i].GravAccel[0] + P[i].GravPM[0] / All.G;
				ay = P[i].GravAccel[1] + P[i].GravPM[1] / All.G;
				az = P[i].GravAccel[2] + P[i].GravPM[2] / All.G;
			#else
				ax = P[i].GravAccel[0];
				ay = P[i].GravAccel[1];
				az = P[i].GravAccel[2];
			#endif
			P[i].OldAcc = sqrt(ax * ax + ay * ay + az * az);
		}

	if(All.TypeOfOpeningCriterion == 1)
		All.ErrTolTheta = 0;	/* This will switch to the relative opening criterion for the following force computations */

	/*  multiply by G */
	for(i = 0; i < NumPart; i++)
		if(P[i].Ti_endstep == All.Ti_Current)
		for(j = 0; j < 3; j++)
			P[i].GravAccel[j] *= All.G;


	/* Finally, the following factor allows a computation of a cosmological simulation
		with vacuum energy in physical coordinates */
	#ifndef PERIODIC
		#ifndef PMGRID
		if(All.ComovingIntegrationOn == 0)
		{
			fac = All.OmegaLambda * All.Hubble * All.Hubble;

			for(i = 0; i < NumPart; i++)
				if(P[i].Ti_endstep == All.Ti_Current)
				for(j = 0; j < 3; j++)
					P[i].GravAccel[j] += fac * P[i].Pos[j];
		}
		#endif
	#endif

	#ifdef SELECTIVE_NO_GRAVITY
		for(i = 0; i < NumPart; i++)
			if(P[i].Ti_endstep < 0)
				P[i].Ti_endstep = -P[i].Ti_endstep - 1;
	#endif

	if(ThisTask == 0)
		printf("tree is done.\n");

	#else /* gravity is switched off */

	for(i = 0; i < NumPart; i++)
		if(P[i].Ti_endstep == All.Ti_Current)
		for(j = 0; j < 3; j++)
			P[i].GravAccel[j] = 0;

	#endif

	/* Now the force computation is finished */

	/*  gather some diagnostic information */

	timetreelist = malloc(sizeof(double) * NTask);
	timecommlist = malloc(sizeof(double) * NTask);
	costtreelist = malloc(sizeof(double) * NTask);
	numnodeslist = malloc(sizeof(int) * NTask);
	ewaldlist = malloc(sizeof(double) * NTask);
	nrecv = malloc(sizeof(int) * NTask);

	numnodes = Numnodestree;

	MPI_Gather(&costtotal, 1, MPI_DOUBLE, costtreelist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
	MPI_Gather(&numnodes, 1, MPI_INT, numnodeslist, 1, MPI_INT, 0, MPI_COMM_WORLD);
	MPI_Gather(&timetree, 1, MPI_DOUBLE, timetreelist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
	MPI_Gather(&timecommsumm, 1, MPI_DOUBLE, timecommlist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
	MPI_Gather(&NumPart, 1, MPI_INT, nrecv, 1, MPI_INT, 0, MPI_COMM_WORLD);
	MPI_Gather(&ewaldcount, 1, MPI_DOUBLE, ewaldlist, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);
	MPI_Reduce(&nexportsum, &nexport, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
	MPI_Reduce(&timeimbalance, &sumimbalance, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

	if(ThisTask == 0)
	{
		All.TotNumOfForces += ntot;

		fprintf(FdTimings, "Step= %d  t= %g  dt= %g \n", All.NumCurrentTiStep, All.Time, All.TimeStep);
		fprintf(FdTimings, "Nf= %d%09d  total-Nf= %d%09d  ex-frac= %g  iter= %d\n",
			(int) (ntot / 1000000000), (int) (ntot % 1000000000),
			(int) (All.TotNumOfForces / 1000000000), (int) (All.TotNumOfForces % 1000000000),
			nexport / ((double) ntot), iter);
		/* note: on Linux, the 8-byte integer could be printed with the format identifier "%qd", but doesn't work on AIX */

		fac = NTask / ((double) All.TotNumPart);

		for(i = 0, maxt = timetreelist[0], sumt = 0, plb_max = 0,
		maxnumnodes = 0, costtotal = 0, sumcomm = 0, ewaldtot = 0; i < NTask; i++)
		{
			costtotal += costtreelist[i];

			sumcomm += timecommlist[i];

			if(maxt < timetreelist[i])
				maxt = timetreelist[i];
			sumt += timetreelist[i];

			plb = nrecv[i] * fac;

			if(plb > plb_max)
				plb_max = plb;

			if(numnodeslist[i] > maxnumnodes)
				maxnumnodes = numnodeslist[i];

			ewaldtot += ewaldlist[i];
		}
		fprintf(FdTimings, "work-load balance: %g  max=%g avg=%g PE0=%g\n",
			maxt / (sumt / NTask), maxt, sumt / NTask, timetreelist[0]);
		fprintf(FdTimings, "particle-load balance: %g\n", plb_max);
		fprintf(FdTimings, "max. nodes: %d, filled: %g\n", maxnumnodes,
			maxnumnodes / (All.TreeAllocFactor * All.MaxPart));
		fprintf(FdTimings, "part/sec=%g | %g  ia/part=%g (%g)\n", ntot / (sumt + 1.0e-20),
			ntot / (maxt * NTask), ((double) (costtotal)) / ntot, ((double) ewaldtot) / ntot);
		fprintf(FdTimings, "\n");

		fflush(FdTimings);

		All.CPU_TreeWalk += sumt / NTask;
		All.CPU_Imbalance += sumimbalance / NTask;
		All.CPU_CommSum += sumcomm / NTask;
	}

	free(nrecv);
	free(ewaldlist);
	free(numnodeslist);
	free(costtreelist);
	free(timecommlist);
	free(timetreelist);

	ASSERT_GX( tim==22 );
	TimerEnd(tim++);
	TimerEnd(29);

	//MESSAGE("%6.2f, %6.2f, %6.2f, %6.2f, %6.2f  -  %5.1f, %5.1f, %5.1f, %5.1f %c force timers d 29,31,30,33,net",TimerGet(29),TimerGet(31),TimerGet(30),TimerGet(33),TimerGet(29)-TimerGet(31)-TimerGet(30),100.0*TimerGet(31)/TimerGet(29),100.0*TimerGet(30)/TimerGet(29),100.0*TimerGet(33)/TimerGet(29),100.0*(TimerGet(29)-TimerGet(31)-TimerGet(30))/TimerGet(29),'%');
	//MESSAGE("%6.2f, %6.2f, %6.2f, %6.2f, %6.2f  -  %5.1f, %5.1f, %5.1f, %5.1f %c force timers a 29,31,30,33,net",TimerGetAccumulated(29),TimerGetAccumulated(31),TimerGetAccumulated(30),TimerGetAccumulated(33),TimerGetAccumulated(29)-TimerGetAccumulated(31)-TimerGetAccumulated(30),100.0*TimerGetAccumulated(31)/TimerGetAccumulated(29),100.0*TimerGetAccumulated(30)/TimerGetAccumulated(29),100.0*TimerGetAccumulated(33)/TimerGetAccumulated(29),100.0*(TimerGetAccumulated(29)-TimerGetAccumulated(31)-TimerGetAccumulated(30))/TimerGetAccumulated(29),'%');
}
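
The FdTimings output above prints the 64-bit force counters with (int)(ntot / 1000000000) and (int)(ntot % 1000000000) because, as the in-code note says, a portable 64-bit format flag was not available on every platform. A self-contained sketch of that split (standalone, not part of GADGET):

#include <stdio.h>

int main(void)
{
  long long ntot = 1234567890123LL;        /* hypothetical total force count */
  /* Split into "billions" and a zero-padded remainder so plain %d works. */
  printf("total-Nf= %d%09d\n",
         (int) (ntot / 1000000000),
         (int) (ntot % 1000000000));       /* prints: total-Nf= 1234567890123 */
  return 0;
}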