Exemplo n.º 1
0
Arquivo: hydra.c Projeto: Ingwar/amuse
/*! This function is the driver routine for the calculation of hydrodynamical
 *  force and rate of change of entropy due to shock heating for all active
 *  particles .
 */
void hydro_force(void)
{
  long long ntot, ntotleft;
  int i, j, k, n, ngrp, maxfill, source, ndone;
  int *nbuffer, *noffset, *nsend_local, *nsend, *numlist, *ndonelist;
  int level, sendTask, recvTask, nexport, place;
  double soundspeed_i;
  double tstart, tend, sumt, sumcomm;
  double timecomp = 0, timecommsumm = 0, timeimbalance = 0, sumimbalance;
#ifndef NOMPI
  MPI_Status status;
#endif

#ifdef PERIODIC
  boxSize = All.BoxSize;
  boxHalf = 0.5 * All.BoxSize;
#ifdef LONG_X
  boxHalf_X = boxHalf * LONG_X;
  boxSize_X = boxSize * LONG_X;
#endif
#ifdef LONG_Y
  boxHalf_Y = boxHalf * LONG_Y;
  boxSize_Y = boxSize * LONG_Y;
#endif
#ifdef LONG_Z
  boxHalf_Z = boxHalf * LONG_Z;
  boxSize_Z = boxSize * LONG_Z;
#endif
#endif

  if(All.ComovingIntegrationOn)
    {
      /* Factors for comoving integration of hydro */
      hubble_a = All.Omega0 / (All.Time * All.Time * All.Time)
	+ (1 - All.Omega0 - All.OmegaLambda) / (All.Time * All.Time) + All.OmegaLambda;

      hubble_a = All.Hubble * sqrt(hubble_a);
      hubble_a2 = All.Time * All.Time * hubble_a;

      fac_mu = pow(All.Time, 3 * (GAMMA - 1) / 2) / All.Time;

      fac_egy = pow(All.Time, 3 * (GAMMA - 1));

      fac_vsic_fix = hubble_a * pow(All.Time, 3 * GAMMA_MINUS1);

      a3inv = 1 / (All.Time * All.Time * All.Time);
      atime = All.Time;
    }
  else
    hubble_a = hubble_a2 = atime = fac_mu = fac_vsic_fix = a3inv = fac_egy = 1.0;


  /* `NumSphUpdate' gives the number of particles on this processor that want a force update */
  for(n = 0, NumSphUpdate = 0; n < N_gas; n++)
    {
      if(P[n].Ti_endstep == All.Ti_Current)
	NumSphUpdate++;
    }

  numlist = malloc(NTask * sizeof(int) * NTask);
#ifndef NOMPI
  MPI_Allgather(&NumSphUpdate, 1, MPI_INT, numlist, 1, MPI_INT, GADGET_WORLD);
#else
    numlist[0] = NumSphUpdate;
#endif
  for(i = 0, ntot = 0; i < NTask; i++)
    ntot += numlist[i];
  free(numlist);


  noffset = malloc(sizeof(int) * NTask);	/* offsets of bunches in common list */
  nbuffer = malloc(sizeof(int) * NTask);
  nsend_local = malloc(sizeof(int) * NTask);
  nsend = malloc(sizeof(int) * NTask * NTask);
  ndonelist = malloc(sizeof(int) * NTask);


  i = 0;			/* first particle for this task */
  ntotleft = ntot;		/* particles left for all tasks together */

  while(ntotleft > 0)
    {
      for(j = 0; j < NTask; j++)
	nsend_local[j] = 0;

      /* do local particles and prepare export list */
      tstart = second();
      for(nexport = 0, ndone = 0; i < N_gas && nexport < All.BunchSizeHydro - NTask; i++)
	if(P[i].Ti_endstep == All.Ti_Current)
	  {
	    ndone++;

	    for(j = 0; j < NTask; j++)
	      Exportflag[j] = 0;

	    hydro_evaluate(i, 0);

	    for(j = 0; j < NTask; j++)
	      {
		if(Exportflag[j])
		  {
		    for(k = 0; k < 3; k++)
		      {
			HydroDataIn[nexport].Pos[k] = P[i].Pos[k];
			HydroDataIn[nexport].Vel[k] = SphP[i].VelPred[k];
		      }
		    HydroDataIn[nexport].Hsml = SphP[i].Hsml;
		    HydroDataIn[nexport].Mass = P[i].Mass;
		    HydroDataIn[nexport].DhsmlDensityFactor = SphP[i].DhsmlDensityFactor;
		    HydroDataIn[nexport].Density = SphP[i].Density;
		    HydroDataIn[nexport].Pressure = SphP[i].Pressure;
		    HydroDataIn[nexport].Timestep = P[i].Ti_endstep - P[i].Ti_begstep;

		    /* calculation of F1 */
		    soundspeed_i = sqrt(GAMMA * SphP[i].Pressure / SphP[i].Density);
		    HydroDataIn[nexport].F1 = fabs(SphP[i].DivVel) /
		      (fabs(SphP[i].DivVel) + SphP[i].CurlVel +
		       0.0001 * soundspeed_i / SphP[i].Hsml / fac_mu);

		    HydroDataIn[nexport].Index = i;
		    HydroDataIn[nexport].Task = j;
#ifdef MORRIS97VISC
                    HydroDataIn[nexport].Alpha = SphP[i].Alpha;
#endif
		    nexport++;
		    nsend_local[j]++;
		  }
	      }
	  }
      tend = second();
      timecomp += timediff(tstart, tend);

      qsort(HydroDataIn, nexport, sizeof(struct hydrodata_in), hydro_compare_key);

      for(j = 1, noffset[0] = 0; j < NTask; j++)
	noffset[j] = noffset[j - 1] + nsend_local[j - 1];

      tstart = second();
#ifndef NOMPI
      MPI_Allgather(nsend_local, NTask, MPI_INT, nsend, NTask, MPI_INT, GADGET_WORLD);
#else
    nsend[0] = nsend_local[0];
#endif
      tend = second();
      timeimbalance += timediff(tstart, tend);



      /* now do the particles that need to be exported */

      for(level = 1; level < (1 << PTask); level++)
	{
	  tstart = second();
	  for(j = 0; j < NTask; j++)
	    nbuffer[j] = 0;
	  for(ngrp = level; ngrp < (1 << PTask); ngrp++)
	    {
	      maxfill = 0;
	      for(j = 0; j < NTask; j++)
		{
		  if((j ^ ngrp) < NTask)
		    if(maxfill < nbuffer[j] + nsend[(j ^ ngrp) * NTask + j])
		      maxfill = nbuffer[j] + nsend[(j ^ ngrp) * NTask + j];
		}
	      if(maxfill >= All.BunchSizeHydro)
		break;

	      sendTask = ThisTask;
	      recvTask = ThisTask ^ ngrp;

	      if(recvTask < NTask)
		{
		  if(nsend[ThisTask * NTask + recvTask] > 0 || nsend[recvTask * NTask + ThisTask] > 0)
		    {
#ifndef NOMPI
		      /* get the particles */
		      MPI_Sendrecv(&HydroDataIn[noffset[recvTask]],
				   nsend_local[recvTask] * sizeof(struct hydrodata_in), MPI_BYTE,
				   recvTask, TAG_HYDRO_A,
				   &HydroDataGet[nbuffer[ThisTask]],
				   nsend[recvTask * NTask + ThisTask] * sizeof(struct hydrodata_in), MPI_BYTE,
				   recvTask, TAG_HYDRO_A, GADGET_WORLD, &status);
#else
    fprintf(stderr, "NOT SUPPORTED");
    exit(1);
#endif
		    }
		}

	      for(j = 0; j < NTask; j++)
		if((j ^ ngrp) < NTask)
		  nbuffer[j] += nsend[(j ^ ngrp) * NTask + j];
	    }
	  tend = second();
	  timecommsumm += timediff(tstart, tend);

	  /* now do the imported particles */
	  tstart = second();
	  for(j = 0; j < nbuffer[ThisTask]; j++)
	    hydro_evaluate(j, 1);
	  tend = second();
	  timecomp += timediff(tstart, tend);

	  /* do a block to measure imbalance */
	  tstart = second();
#ifndef NOMPI
      MPI_Barrier(GADGET_WORLD);
#endif
	  tend = second();
	  timeimbalance += timediff(tstart, tend);

	  /* get the result */
	  tstart = second();
	  for(j = 0; j < NTask; j++)
	    nbuffer[j] = 0;
	  for(ngrp = level; ngrp < (1 << PTask); ngrp++)
	    {
	      maxfill = 0;
	      for(j = 0; j < NTask; j++)
		{
		  if((j ^ ngrp) < NTask)
		    if(maxfill < nbuffer[j] + nsend[(j ^ ngrp) * NTask + j])
		      maxfill = nbuffer[j] + nsend[(j ^ ngrp) * NTask + j];
		}
	      if(maxfill >= All.BunchSizeHydro)
		break;

	      sendTask = ThisTask;
	      recvTask = ThisTask ^ ngrp;

	      if(recvTask < NTask)
		{
		  if(nsend[ThisTask * NTask + recvTask] > 0 || nsend[recvTask * NTask + ThisTask] > 0)
		    {
#ifndef NOMPI
		      /* send the results */
		      MPI_Sendrecv(&HydroDataResult[nbuffer[ThisTask]],
				   nsend[recvTask * NTask + ThisTask] * sizeof(struct hydrodata_out),
				   MPI_BYTE, recvTask, TAG_HYDRO_B,
				   &HydroDataPartialResult[noffset[recvTask]],
				   nsend_local[recvTask] * sizeof(struct hydrodata_out),
				   MPI_BYTE, recvTask, TAG_HYDRO_B, GADGET_WORLD, &status);
#else
    fprintf(stderr, "NOT SUPPORTED");
    exit(1);
#endif
		      /* add the result to the particles */
		      for(j = 0; j < nsend_local[recvTask]; j++)
			{
			  source = j + noffset[recvTask];
			  place = HydroDataIn[source].Index;

			  for(k = 0; k < 3; k++)
			    SphP[place].HydroAccel[k] += HydroDataPartialResult[source].Acc[k];

			  SphP[place].DtEntropy += HydroDataPartialResult[source].DtEntropy;

			  if(SphP[place].MaxSignalVel < HydroDataPartialResult[source].MaxSignalVel)
			    SphP[place].MaxSignalVel = HydroDataPartialResult[source].MaxSignalVel;
			}
		    }
		}

	      for(j = 0; j < NTask; j++)
		if((j ^ ngrp) < NTask)
		  nbuffer[j] += nsend[(j ^ ngrp) * NTask + j];
	    }
	  tend = second();
	  timecommsumm += timediff(tstart, tend);

	  level = ngrp - 1;
	}

#ifndef NOMPI
      MPI_Allgather(&ndone, 1, MPI_INT, ndonelist, 1, MPI_INT, GADGET_WORLD);
#else
    ndonelist[0] = ndone;
#endif
      for(j = 0; j < NTask; j++)
	ntotleft -= ndonelist[j];
    }

  free(ndonelist);
  free(nsend);
  free(nsend_local);
  free(nbuffer);
  free(noffset);



  /* do final operations on results */
  tstart = second();

  for(i = 0; i < N_gas; i++)
    if(P[i].Ti_endstep == All.Ti_Current)
      {
	SphP[i].DtEntropy *= GAMMA_MINUS1 / (hubble_a2 * pow(SphP[i].Density, GAMMA_MINUS1));
#ifdef SPH_BND_PARTICLES
	if(P[i].ID == 0)
	  {
	    SphP[i].DtEntropy = 0;
	    for(k = 0; k < 3; k++)
	      SphP[i].HydroAccel[k] = 0;
	  }
#endif
      }

  tend = second();
  timecomp += timediff(tstart, tend);

  /* collect some timing information */

#ifndef NOMPI
  MPI_Reduce(&timecomp, &sumt, 1, MPI_DOUBLE, MPI_SUM, 0, GADGET_WORLD);
  MPI_Reduce(&timecommsumm, &sumcomm, 1, MPI_DOUBLE, MPI_SUM, 0, GADGET_WORLD);
  MPI_Reduce(&timeimbalance, &sumimbalance, 1, MPI_DOUBLE, MPI_SUM, 0, GADGET_WORLD);
#else
    sumt = timecomp;
    sumcomm = timecommsumm;
    sumimbalance = timeimbalance;
#endif
  if(ThisTask == 0)
    {
      All.CPU_HydCompWalk += sumt / NTask;
      All.CPU_HydCommSumm += sumcomm / NTask;
      All.CPU_HydImbalance += sumimbalance / NTask;
    }
}
Exemplo n.º 2
0
/*! This function is the driver routine for the calculation of hydrodynamical
 *  force and rate of change of entropy due to shock heating for all active
 *  particles .
 */
void hydro_force(void)
{
  TimerBeg(90);
  long long ntot, ntotleft;
  int i, j, k, n, ngrp, maxfill, source, ndone;
  int *nbuffer, *noffset, *nsend_local, *nsend, *numlist, *ndonelist;
  int level, sendTask, recvTask, nexport, place;
  double soundspeed_i;
  double tstart, tend, sumt, sumcomm;
  double timecomp = 0, timecommsumm = 0, timeimbalance = 0, sumimbalance;
  MPI_Status status;


#ifdef PERIODIC
  boxSize = All.BoxSize;
  boxHalf = 0.5 * All.BoxSize;
#ifdef LONG_X
  boxHalf_X = boxHalf * LONG_X;
  boxSize_X = boxSize * LONG_X;
#endif
#ifdef LONG_Y
  boxHalf_Y = boxHalf * LONG_Y;
  boxSize_Y = boxSize * LONG_Y;
#endif
#ifdef LONG_Z
  boxHalf_Z = boxHalf * LONG_Z;
  boxSize_Z = boxSize * LONG_Z;
#endif
#endif

  if(All.ComovingIntegrationOn)
    {
      /* Factors for comoving integration of hydro */
      hubble_a = All.Omega0 / (All.Time * All.Time * All.Time)
	+ (1 - All.Omega0 - All.OmegaLambda) / (All.Time * All.Time) + All.OmegaLambda;

      hubble_a = All.Hubble * sqrt(hubble_a);
      hubble_a2 = All.Time * All.Time * hubble_a;

      fac_mu = pow(All.Time, 3 * (GAMMA - 1) / 2) / All.Time;

      fac_egy = pow(All.Time, 3 * (GAMMA - 1));

      fac_vsic_fix = hubble_a * pow(All.Time, 3 * GAMMA_MINUS1);

      a3inv = 1 / (All.Time * All.Time * All.Time);
      atime = All.Time;
    }
  else
    hubble_a = hubble_a2 = atime = fac_mu = fac_vsic_fix = a3inv = fac_egy = 1.0;


  /* `NumSphUpdate' gives the number of particles on this processor that want a force update */
  for(n = 0, NumSphUpdate = 0; n < N_gas; n++)
    {
      if(P[n].Ti_endstep == All.Ti_Current)
	NumSphUpdate++;
    }

  numlist = malloc(NTask * sizeof(int) * NTask);
  MPI_Allgather(&NumSphUpdate, 1, MPI_INT, numlist, 1, MPI_INT, MPI_COMM_WORLD);
  for(i = 0, ntot = 0; i < NTask; i++)
    ntot += numlist[i];
  free(numlist);


  noffset = malloc(sizeof(int) * NTask);	/* offsets of bunches in common list */
  nbuffer = malloc(sizeof(int) * NTask);
  nsend_local = malloc(sizeof(int) * NTask);
  nsend = malloc(sizeof(int) * NTask * NTask);
  ndonelist = malloc(sizeof(int) * NTask);

  i = 0;			/* first particle for this task */
  ntotleft = ntot;		/* particles left for all tasks together */

	///////////////// GX //////////////////////
	FUN_MESSAGE(2,"hydro_force()");
	#ifdef CUDA_GX_NO_SPH_SUPPORT
		int oldcudamode=s_gx.cudamode;
		s_gx.cudamode=0;
	#endif
	double starttime,subtime=-1,cpytime=-1;
	const int Np=PrintInfoInitialize(N_gas,s_gx.cudamode,1);
	int iter=0;
	///////////////// GX //////////////////////

  while(ntotleft > 0)
    {
	///////////////// GX //////////////////////
	if (s_gx.cudamode!=0 && i!=0) ERROR("cuda mode does not support iterations in hydro calc,  try to increasing the 'BufferSize' in the parameter file to surcomevent this problem");
	iter++;
	///////////////// GX //////////////////////

	for(j = 0; j < NTask; j++)
	nsend_local[j] = 0;

      /* do local particles and prepare export list */
	TimerBeg(91);
	TimerBeg(93);
	starttime=GetTime();

	tstart = second();
if (s_gx.cudamode==0 || (Np!=N_gas || Np<MIN_SPH_PARTICLES_FOR_GPU_GX)) {
//if (s_gx.cudamode==0 || Np<MIN_SPH_PARTICLES_FOR_GPU_GX) {

		#ifdef CUDA_GX_CHUNCK_MANAGER_SPH
			ReLaunchChunkManager();
		#endif

		for(nexport = 0, ndone = 0; i < N_gas && nexport < All.BunchSizeHydro - NTask; i++)
		if(P[i].Ti_endstep == All.Ti_Current)
		{
			ndone++;

			for(j = 0; j < NTask; j++)
				Exportflag[j] = 0;

			hydro_evaluate(i, 0);

			TimerUpdateCounter(91,1);

			for(j = 0; j < NTask; j++)
			{
				if(Exportflag[j])
				{
					for(k = 0; k < 3; k++)
					{
						HydroDataIn[nexport].Pos[k] = P[i].Pos[k];
						HydroDataIn[nexport].Vel[k] = SphP[i].VelPred[k];
					}
					HydroDataIn[nexport].Hsml = SphP[i].Hsml;
					HydroDataIn[nexport].Mass = P[i].Mass;
					HydroDataIn[nexport].DhsmlDensityFactor = SphP[i].DhsmlDensityFactor;
					HydroDataIn[nexport].Density = SphP[i].Density;
					HydroDataIn[nexport].Pressure = SphP[i].Pressure;
					HydroDataIn[nexport].Timestep = P[i].Ti_endstep - P[i].Ti_begstep;

					/* calculation of F1 */
					soundspeed_i = sqrt(GAMMA * SphP[i].Pressure / SphP[i].Density);
					HydroDataIn[nexport].F1 = fabs(SphP[i].DivVel) /
						(fabs(SphP[i].DivVel) + SphP[i].CurlVel +
						0.0001 * soundspeed_i / SphP[i].Hsml / fac_mu);

					HydroDataIn[nexport].Index = i;
					HydroDataIn[nexport].Task = j;
					nexport++;
					nsend_local[j]++;
				}
			}
		}
		#ifdef CUDA_GX_CHUNCK_MANAGER_SPH
			ManageChuncks(1);
		#endif
	} else {
		///////////////// GX //////////////////////

		cpytime=GetTime();

		ASSERT_GX(s_gx.cudamode>0);
		if (i!=0) ERROR("cuda mode does not support iterations in hydro calc, try to increasing the 'BufferSize' in the parameter file to surcomevent this problem");

		const int Np2=InitializeHydraCalculation_gx(NumPart,P,SphP,N_gas,hubble_a2, fac_mu, fac_vsic_fix
			#ifdef PERIODIC
				,boxSize,boxHalf
			#endif
		);

		if (Np2==0) WARNING("no sph particles participate in this timestep");
		ASSERT_GX( Np2==Np );

		cpytime = GetTime()-cpytime;
		subtime=GetTime();

		hydro_evaluate_range_cuda_gx(0,N_gas,s_gx,p_gx,h_gx);
		subtime=GetTime()-subtime;

		for(nexport = 0, ndone = 0; i < N_gas && nexport < All.BunchSizeHydro - NTask; i++)
		if(P[i].Ti_endstep == All.Ti_Current)
		{
			ndone++;

			for(j = 0; j < NTask; j++)
				Exportflag[j] = 0;

			ASSERT_GX( P[i].Type==0 );
			//hydro_evaluate_cuda_gx(i, 0,&s_gx,&p_gx);

			TimerUpdateCounter(91,1);

			ASSERT_GX(i<s_gx.sz_result_hydro);
			const struct result_hydro_gx r=s_gx.result_hydro[i];
			ASSERT_GX( isResultHydraDataOK(r,__FILE__,__LINE__) );

			for(k = 0; k < 3; k++) SphP[i].HydroAccel[k] = r.Acc[k];
			SphP[i].DtEntropy = r.DtEntropy;
			SphP[i].MaxSignalVel = r.MaxSignalVel;

			if (s_gx.NTask>1){
				for(j = 0; j < NTask; j++)
				{
					const char export_this=GetExportflag_gx(&s_gx,i,NTask,j);
					if(export_this)
					{
						for(k = 0; k < 3; k++)
						{
							HydroDataIn[nexport].Pos[k] = P[i].Pos[k];
							HydroDataIn[nexport].Vel[k] = SphP[i].VelPred[k];
						}
						HydroDataIn[nexport].Hsml = SphP[i].Hsml;
						HydroDataIn[nexport].Mass = P[i].Mass;
						HydroDataIn[nexport].DhsmlDensityFactor = SphP[i].DhsmlDensityFactor;
						HydroDataIn[nexport].Density = SphP[i].Density;
						HydroDataIn[nexport].Pressure = SphP[i].Pressure;
						HydroDataIn[nexport].Timestep = P[i].Ti_endstep - P[i].Ti_begstep;

						// calculation of F1
						soundspeed_i = sqrt(GAMMA * SphP[i].Pressure / SphP[i].Density);
						HydroDataIn[nexport].F1 = fabs(SphP[i].DivVel) /
							(fabs(SphP[i].DivVel) + SphP[i].CurlVel +
							0.0001 * soundspeed_i / SphP[i].Hsml / fac_mu);

						HydroDataIn[nexport].Index = i;
						HydroDataIn[nexport].Task = j;
						nexport++;
						nsend_local[j]++;
					}
				}
			}
		}
		///////////////// GX //////////////////////
	}
	TimerEnd(93);

      tend = second();
      timecomp += timediff(tstart, tend);

	///////////////// GX //////////////////////
	PrintInfoFinalize(s_gx,ndone,Np,starttime,cpytime,subtime,1,iter,-1,0,0,nexport,0,0,0);
	subtime=-1;
	///////////////// GX //////////////////////

      qsort(HydroDataIn, nexport, sizeof(struct hydrodata_in), hydro_compare_key);

      for(j = 1, noffset[0] = 0; j < NTask; j++)
	noffset[j] = noffset[j - 1] + nsend_local[j - 1];

      tstart = second();

      MPI_Allgather(nsend_local, NTask, MPI_INT, nsend, NTask, MPI_INT, MPI_COMM_WORLD);

      tend = second();
      timeimbalance += timediff(tstart, tend);

	TimerEnd(91);
	TimerBeg(92);

      /* now do the particles that need to be exported */

      for(level = 1; level < (1 << PTask); level++)
	{
	  tstart = second();
	  for(j = 0; j < NTask; j++)
	    nbuffer[j] = 0;
	  for(ngrp = level; ngrp < (1 << PTask); ngrp++)
	    {
	      maxfill = 0;
	      for(j = 0; j < NTask; j++)
		{
		  if((j ^ ngrp) < NTask)
		    if(maxfill < nbuffer[j] + nsend[(j ^ ngrp) * NTask + j])
		      maxfill = nbuffer[j] + nsend[(j ^ ngrp) * NTask + j];
		}
	      if(maxfill >= All.BunchSizeHydro)
		break;

	      sendTask = ThisTask;
	      recvTask = ThisTask ^ ngrp;

	      if(recvTask < NTask)
		{
		  if(nsend[ThisTask * NTask + recvTask] > 0 || nsend[recvTask * NTask + ThisTask] > 0)
		    {
		      /* get the particles */
		      MPI_Sendrecv(&HydroDataIn[noffset[recvTask]],
				   nsend_local[recvTask] * sizeof(struct hydrodata_in), MPI_BYTE,
				   recvTask, TAG_HYDRO_A,
				   &HydroDataGet[nbuffer[ThisTask]],
				   nsend[recvTask * NTask + ThisTask] * sizeof(struct hydrodata_in), MPI_BYTE,
				   recvTask, TAG_HYDRO_A, MPI_COMM_WORLD, &status);
		    }
		}

	      for(j = 0; j < NTask; j++)
		if((j ^ ngrp) < NTask)
		  nbuffer[j] += nsend[(j ^ ngrp) * NTask + j];
	    }
	  tend = second();
	  timecommsumm += timediff(tstart, tend);


	  /* now do the imported particles */
	  tstart = second();

	///////////////// GX //////////////////////
	// Do exported particles on the CPU/GPU
	TimerBeg(94);
	{
		AssertsOnhasGadgetDataBeenModified_gx(1,1,0);

		#if CUDA_DEBUG_GX>1
			MESSAGE("INFO: DistRMSGrav=%g",DistRMSGravdata(nbuffer[ThisTask],GravDataGet));
		#endif

		starttime=GetTime();
		const int N=nbuffer[ThisTask];

		if (N>0){
// YYY NOTE: disable GPU exportmode for now!!!
			if (1 || s_gx.cudamode==0 || N<MIN_SPH_PARTICLES_FOR_GPU_GX || Np<MIN_SPH_PARTICLES_FOR_GPU_GX) {
				for(j = 0; j < nbuffer[ThisTask]; j++)
					hydro_evaluate(j, 1);
			} else {
				cpytime=GetTime();

				InitializeHydraExportCalculation_gx(N,HydroDataGet);

				subtime=GetTime();
				hydro_evaluate_range_cuda_gx(1,N,s_gx,p_gx,h_gx);

				subtime=GetTime()-subtime;
				FinalizeHydraExportCalculation_gx(N);

				cpytime=GetTime()-cpytime-subtime;
			}

			PrintInfoFinalize(s_gx,0,N,starttime,cpytime,subtime,3,iter,level,0,0,nexport,0,0,0);
			subtime=-1;
		}
	}
	TimerEnd(94);
	///////////////// GX //////////////////////

	  tend = second();
	  timecomp += timediff(tstart, tend);


	  /* do a block to measure imbalance */
	  TimerBeg(95);
	  tstart = second();
	  MPI_Barrier(MPI_COMM_WORLD);
	  tend = second();
	  timeimbalance += timediff(tstart, tend);
	  TimerEnd(95);

	  /* get the result */
	  tstart = second();
	  for(j = 0; j < NTask; j++)
	    nbuffer[j] = 0;
	  for(ngrp = level; ngrp < (1 << PTask); ngrp++)
	    {
	      maxfill = 0;
	      for(j = 0; j < NTask; j++)
		{
		  if((j ^ ngrp) < NTask)
		    if(maxfill < nbuffer[j] + nsend[(j ^ ngrp) * NTask + j])
		      maxfill = nbuffer[j] + nsend[(j ^ ngrp) * NTask + j];
		}
	      if(maxfill >= All.BunchSizeHydro)
		break;

	      sendTask = ThisTask;
	      recvTask = ThisTask ^ ngrp;

	      if(recvTask < NTask)
		{
		  if(nsend[ThisTask * NTask + recvTask] > 0 || nsend[recvTask * NTask + ThisTask] > 0)
		    {
		      /* send the results */
		      MPI_Sendrecv(&HydroDataResult[nbuffer[ThisTask]],
				   nsend[recvTask * NTask + ThisTask] * sizeof(struct hydrodata_out),
				   MPI_BYTE, recvTask, TAG_HYDRO_B,
				   &HydroDataPartialResult[noffset[recvTask]],
				   nsend_local[recvTask] * sizeof(struct hydrodata_out),
				   MPI_BYTE, recvTask, TAG_HYDRO_B, MPI_COMM_WORLD, &status);

		      /* add the result to the particles */
		      for(j = 0; j < nsend_local[recvTask]; j++)
			{
			  source = j + noffset[recvTask];
			  place = HydroDataIn[source].Index;

			  for(k = 0; k < 3; k++)
			    SphP[place].HydroAccel[k] += HydroDataPartialResult[source].Acc[k];

			  SphP[place].DtEntropy += HydroDataPartialResult[source].DtEntropy;

			  if(SphP[place].MaxSignalVel < HydroDataPartialResult[source].MaxSignalVel)
			    SphP[place].MaxSignalVel = HydroDataPartialResult[source].MaxSignalVel;
			}
		    }
		}

	      for(j = 0; j < NTask; j++)
		if((j ^ ngrp) < NTask)
		  nbuffer[j] += nsend[(j ^ ngrp) * NTask + j];
	    }
	  tend = second();
	  timecommsumm += timediff(tstart, tend);

	  level = ngrp - 1;
	}
	TimerEnd(92);

      MPI_Allgather(&ndone, 1, MPI_INT, ndonelist, 1, MPI_INT, MPI_COMM_WORLD);
      for(j = 0; j < NTask; j++)
	ntotleft -= ndonelist[j];
    }


  free(ndonelist);
  free(nsend);
  free(nsend_local);
  free(nbuffer);
  free(noffset);

  /* do final operations on results */
  tstart = second();

  for(i = 0; i < N_gas; i++)
    if(P[i].Ti_endstep == All.Ti_Current)
      {
	SphP[i].DtEntropy *= GAMMA_MINUS1 / (hubble_a2 * pow(SphP[i].Density, GAMMA_MINUS1));
#ifdef SPH_BND_PARTICLES
	if(P[i].ID == 0)
	  {
	    SphP[i].DtEntropy = 0;
	    for(k = 0; k < 3; k++)
	      SphP[i].HydroAccel[k] = 0;
	  }
#endif
      }

  tend = second();
  timecomp += timediff(tstart, tend);

  /* collect some timing information */

  MPI_Reduce(&timecomp, &sumt, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
  MPI_Reduce(&timecommsumm, &sumcomm, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
  MPI_Reduce(&timeimbalance, &sumimbalance, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

  if(ThisTask == 0)
    {
      All.CPU_HydCompWalk += sumt / NTask;
      All.CPU_HydCommSumm += sumcomm / NTask;
      All.CPU_HydImbalance += sumimbalance / NTask;
    }

   TimerEnd(90);

	#ifdef RESULT_FILE_DUMP_GX
		static FILE* resultfile=NULL;

		if (resultfile==NULL) {
			char filename[256];
			sprintf(filename,"resultfile.%d.%d.txt",s_gx.cudamode,ThisTask);
			resultfile=fopen(filename,"w");
		}
		else{
			MESSAGE("Dumping result...");
			static int MM=0;
			int j;
			fprintf(resultfile,"Dumping result...N_gas=%d\n",N_gas);
			for(j=0;j<N_gas;++j)
			if(P[j].Ti_endstep == All.Ti_Current){
				static int NN=0;
				const int target=j;
				fprintf(resultfile,"m=0, NN=%6d, t=%6d, e=%.4g, v=%.4g, acc={%.2g,%.2g,%.2g}\n",NN++,target,SphP[target].DtEntropy,SphP[target].MaxSignalVel,SphP[target].HydroAccel[0],SphP[target].HydroAccel[1],SphP[target].HydroAccel[2]);
				fflush(resultfile);
			}
			if (++MM>2) exit(-42);
		}
	#endif

	#ifdef CUDA_GX_NO_SPH_SUPPORT
		s_gx.cudamode=oldcudamode;
	#endif

	//MESSAGE("%6.2f, %6.2f, %6.2f, %6.2f, %6.2f  -  %5.1f, %5.1f, %5.1f, %5.1f %c sph timers d 90,93,94,95,net",TimerGet(90),TimerGet(93),TimerGet(94),TimerGet(95),TimerGet(90)-TimerGet(93)-TimerGet(94),100.0*TimerGet(93)/TimerGet(90),100.0*TimerGet(94)/TimerGet(90),100.0*TimerGet(95)/TimerGet(90),100.0*(TimerGet(90)-TimerGet(93)-TimerGet(94))/TimerGet(90),'%');
	//MESSAGE("%6.2f, %6.2f, %6.2f, %6.2f, %6.2f  -  %5.1f, %5.1f, %5.1f, %5.1f %c sph timers a 90,93,94,95,net",TimerGetAccumulated(90),TimerGetAccumulated(93),TimerGetAccumulated(94),TimerGetAccumulated(95),TimerGetAccumulated(90)-TimerGetAccumulated(93)-TimerGetAccumulated(94),100.0*TimerGetAccumulated(93)/TimerGetAccumulated(90),100.0*TimerGetAccumulated(94)/TimerGetAccumulated(90),100.0*TimerGetAccumulated(95)/TimerGetAccumulated(90),100.0*(TimerGetAccumulated(90)-TimerGetAccumulated(93)-TimerGetAccumulated(94))/TimerGetAccumulated(90),'%');
}