Example #1
/* copy files to a partner node */
static int scr_reddesc_apply_partner(
  scr_filemap* map,
  const scr_reddesc* c,
  int id)
{
  int rc = SCR_SUCCESS;

  /* get pointer to partner state structure */
  scr_reddesc_partner* state = (scr_reddesc_partner*) c->copy_state;

  /* get a list of our files */
  int numfiles = 0;
  char** files = NULL;
  scr_filemap_list_files(map, id, scr_my_rank_world, &numfiles, &files);

  /* first, determine how many files we'll be sending and receiving
   * with our partners */
  MPI_Status status;
  int send_num = numfiles;
  int recv_num = 0;
  MPI_Sendrecv(
    &send_num, 1, MPI_INT, state->rhs_rank, 0,
    &recv_num, 1, MPI_INT, state->lhs_rank, 0,
    c->comm, &status
  );

  /* record how many files our partner will send */
  scr_filemap_set_expected_files(map, id, state->lhs_rank_world, recv_num);

  /* remember which node our partner is on (needed for scavenge) */
  scr_hash* flushdesc = scr_hash_new();
  scr_filemap_get_flushdesc(map, id, state->lhs_rank_world, flushdesc);
  scr_hash_util_set_int(flushdesc, SCR_SCAVENGE_KEY_PRESERVE,  scr_preserve_directories);
  scr_hash_util_set_int(flushdesc, SCR_SCAVENGE_KEY_CONTAINER, scr_use_containers);
  scr_hash_util_set_str(flushdesc, SCR_SCAVENGE_KEY_PARTNER,   state->lhs_hostname);
  scr_filemap_set_flushdesc(map, id, state->lhs_rank_world, flushdesc);
  scr_hash_delete(&flushdesc);

  /* record partner's redundancy descriptor hash */
  scr_hash* lhs_desc_hash = scr_hash_new();
  scr_hash* my_desc_hash  = scr_hash_new();
  scr_reddesc_store_to_hash(c, my_desc_hash);
  scr_hash_sendrecv(my_desc_hash, state->rhs_rank, lhs_desc_hash, state->lhs_rank, c->comm);
  scr_filemap_set_desc(map, id, state->lhs_rank_world, lhs_desc_hash);
  scr_hash_delete(&my_desc_hash);
  scr_hash_delete(&lhs_desc_hash);

  /* store this info in our filemap before we receive any files */
  scr_filemap_write(scr_map_file, map);

  /* define directory to receive partner file in */
  char* dir = scr_cache_dir_get(c, id);

  /* for each potential file, step through a call to swap */
  while (send_num > 0 || recv_num > 0) {
    /* assume we won't send or receive in this step */
    int send_rank = MPI_PROC_NULL;
    int recv_rank = MPI_PROC_NULL;
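    /* sends and receives addressed to MPI_PROC_NULL complete immediately
     * without transferring any data, so both partners can iterate the same
     * number of times even when their file counts differ */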

    /* if we have a file left to send,
     * get the filename and destination rank */
    char* file = NULL;
    if (send_num > 0) {
      int i = numfiles - send_num;
      file = files[i];
      send_rank = state->rhs_rank;
      send_num--;
    }

    /* if we have a file left to receive, get the rank */
    if (recv_num > 0) {
      recv_rank = state->lhs_rank;
      recv_num--;
    }

    /* exchange file names with partners */
    char file_partner[SCR_MAX_FILENAME];
    scr_swap_file_names(file, send_rank, file_partner, sizeof(file_partner), recv_rank, dir, c->comm);

    /* if we'll receive a file, record the name of our partner's
     * file in the filemap */
    if (recv_rank != MPI_PROC_NULL) {
      scr_filemap_add_file(map, id, state->lhs_rank_world, file_partner);
      scr_filemap_write(scr_map_file, map);
    }

    /* get meta data of file we're sending */
    scr_meta* send_meta = scr_meta_new();
    scr_filemap_get_meta(map, id, scr_my_rank_world, file, send_meta);

    /* exchange files with partners */
    scr_meta* recv_meta = scr_meta_new();
    if (scr_swap_files(COPY_FILES, file, send_meta, send_rank, file_partner, recv_meta, recv_rank, c->comm) != SCR_SUCCESS) {
      rc = SCR_FAILURE;
    }
    scr_filemap_set_meta(map, id, state->lhs_rank_world, file_partner, recv_meta);

    /* free meta data for these files */
    scr_meta_delete(&recv_meta);
    scr_meta_delete(&send_meta);
  }

  /* free cache directory string */
  scr_free(&dir);

  /* write out the updated filemap */
  scr_filemap_write(scr_map_file, map);

  /* free our list of files */
  scr_free(&files);

  return rc;
}
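
The heart of the partner scheme above is the symmetric handshake with the left and right neighbour ranks before any file data moves. Below is a minimal, self-contained sketch of that pattern; it is not taken from SCR, and the ring neighbours and per-rank file counts are invented for illustration.

#include <mpi.h>
#include <stdio.h>

int main(int argc, char** argv)
{
  MPI_Init(&argc, &argv);

  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  /* arrange the ranks in a ring: files go to the right-hand partner
   * and arrive from the left-hand partner */
  int rhs_rank = (rank + 1) % size;
  int lhs_rank = (rank - 1 + size) % size;

  /* pretend each rank owns a different number of files */
  int send_num = rank + 1;
  int recv_num = 0;

  /* tell the right partner how many files to expect,
   * and learn the same from the left partner */
  MPI_Sendrecv(&send_num, 1, MPI_INT, rhs_rank, 0,
               &recv_num, 1, MPI_INT, lhs_rank, 0,
               MPI_COMM_WORLD, MPI_STATUS_IGNORE);

  printf("rank %d expects %d files from rank %d\n", rank, recv_num, lhs_rank);

  MPI_Finalize();
  return 0;
}
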
Example #2
/* ********************************************************************* */
int AL_Exchange(void *vbuf, int sz_ptr)
/*!
 * Fill the ghost boundaries
 *
 * \param [in] vbuf   pointer to buffer
 * \param [in] sz_ptr integer pointer to the distributed array descriptor
 *********************************************************************** */
{
  char *buf;
  register int nd;
  int myrank, nproc;
  int ndim, gp, nleft, nright, tag1, tag2;
  int sendb, recvb;
  MPI_Datatype itype;
  MPI_Comm comm;
  MPI_Status status;
  SZ *s;

  buf = (char *) vbuf;

  /* DIAGNOSTICS
    Check that sz_ptr points to an allocated SZ
  */
  if( stack_ptr[sz_ptr] == AL_STACK_FREE){
    printf("AL_Exchange: wrong SZ pointer\n");
  }

  s = sz_stack[sz_ptr];

  myrank = s->rank;
  nproc = s->size;
  comm = s->comm;
  ndim = s->ndim;

  for(nd=0;nd<ndim;nd++){
    gp = s->bg[nd];
    /* If gp=0, do nothing */
    if( gp > 0 ){
      nleft = s->left[nd];
      nright = s->right[nd];
      itype = s->type_rl[nd];
      tag1 = s->tag1[nd];
      sendb = s->sendb1[nd];
      recvb = s->recvb1[nd];

      MPI_Sendrecv(&buf[sendb], 1, itype, nleft, tag1,
		   &buf[recvb], 1, itype, nright,tag1,
		   comm, &status);

      nleft = s->left[nd];
      nright = s->right[nd];
      itype = s->type_lr[nd];
      tag2 = s->tag2[nd];

      sendb = s->sendb2[nd];
      recvb = s->recvb2[nd];

      MPI_Sendrecv(&buf[sendb], 1, itype, nright, tag2,
		   &buf[recvb], 1, itype, nleft,tag2,
		   comm, &status);
    }
  }

  /* DIAGNOSTICS */
#ifdef DEBUG
  if(myrank==0) printf("AL_Exchange: filled ghost regions\n");
#endif

  return (int) AL_SUCCESS;
}
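
AL_Exchange fills the ghost zones of each dimension with a pair of MPI_Sendrecv calls, one per direction. The following 1-D sketch shows the same idea without the AL descriptor machinery; the array size, the non-periodic boundary handling via MPI_PROC_NULL, and the tags are illustrative assumptions.

#include <mpi.h>

#define NX 8   /* interior cells per rank (illustrative) */

int main(int argc, char** argv)
{
  int rank, size, i;
  int nleft, nright;
  double u[NX + 2];              /* one ghost cell on each side */
  MPI_Status status;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  /* non-periodic decomposition: the outer boundaries talk to MPI_PROC_NULL,
   * which turns the corresponding transfers into no-ops */
  nleft  = (rank > 0)        ? rank - 1 : MPI_PROC_NULL;
  nright = (rank < size - 1) ? rank + 1 : MPI_PROC_NULL;

  for (i = 1; i <= NX; i++)
    u[i] = rank;                 /* fill the interior with something recognisable */

  /* send the rightmost interior cell to the right, receive the left ghost cell */
  MPI_Sendrecv(&u[NX], 1, MPI_DOUBLE, nright, 0,
               &u[0],  1, MPI_DOUBLE, nleft,  0,
               MPI_COMM_WORLD, &status);

  /* send the leftmost interior cell to the left, receive the right ghost cell */
  MPI_Sendrecv(&u[1],      1, MPI_DOUBLE, nleft,  1,
               &u[NX + 1], 1, MPI_DOUBLE, nright, 1,
               MPI_COMM_WORLD, &status);

  MPI_Finalize();
  return 0;
}
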
Example #3
/* This function computes the gravitational potential for ALL the particles.
 * It expects that the particles are predicted to the current time.
 */
void compute_potential(void)
{
  int i;

#ifndef NOGRAVITY
  long long ntot, ntotleft;
  int j, k, level, sendTask, recvTask;
  int ndone;
  int maxfill, ngrp, place, nexport;
  int *nsend, *noffset, *nsend_local, *nbuffer, *ndonelist, *numlist;
  double fac;
  double t0, t1, tstart, tend;
  MPI_Status status;
  double r2;

  t0 = second();

  if(All.ComovingIntegrationOn)
    set_softenings();

  if(ThisTask == 0)
    {
      printf("Start computation of potential for all particles...\n");
      fflush(stdout);
    }

#ifdef ISOTHERM
  for(i = 0; i < NumPart; i++)
    {
      for(k = 0, r2 = 0; k < 3; k++)
	r2 += P[i].Pos[k] * P[i].Pos[k];

      P[i].Potential = -2 * ISOTHERM * ISOTHERM * (1 + log(ISOTHERM / sqrt(r2)));
    }
  return;
#endif


  tstart = second();
  if(TreeReconstructFlag)
    {
      if(ThisTask == 0)
	printf("Tree construction.\n");
#if defined(SFR) || defined(BLACK_HOLES)
      rearrange_particle_sequence();
#endif
      force_treebuild();

      TreeReconstructFlag = 0;

      if(ThisTask == 0)
	printf("Tree construction done.\n");
    }
  tend = second();
  All.CPU_TreeConstruction += timediff(tstart, tend);

  numlist = malloc(NTask * sizeof(int) * NTask);
  MPI_Allgather(&NumPart, 1, MPI_INT, numlist, 1, MPI_INT, MPI_COMM_WORLD);
  for(i = 0, ntot = 0; i < NTask; i++)
    ntot += numlist[i];
  free(numlist);

  noffset = malloc(sizeof(int) * NTask);	/* offsets of bunches in common list */
  nbuffer = malloc(sizeof(int) * NTask);
  nsend_local = malloc(sizeof(int) * NTask);
  nsend = malloc(sizeof(int) * NTask * NTask);
  ndonelist = malloc(sizeof(int) * NTask);

  i = 0;			/* begin with this index */
  ntotleft = ntot;		/* particles left for all tasks together */

  while(ntotleft > 0)
    {
      for(j = 0; j < NTask; j++)
	nsend_local[j] = 0;

      /* do local particles and prepare export list */
      for(nexport = 0, ndone = 0; i < NumPart && nexport < All.BunchSizeForce - NTask; i++)
	{
	  ndone++;

	  for(j = 0; j < NTask; j++)
	    Exportflag[j] = 0;

#ifndef PMGRID
	  force_treeevaluate_potential(i, 0);
#else
	  force_treeevaluate_potential_shortrange(i, 0);
#endif

	  for(j = 0; j < NTask; j++)
	    {
	      if(Exportflag[j])
		{
		  for(k = 0; k < 3; k++)
		    GravDataGet[nexport].u.Pos[k] = P[i].Pos[k];
#ifdef UNEQUALSOFTENINGS
		  GravDataGet[nexport].v.Type = P[i].Type;
#endif
		  GravDataGet[nexport].w.OldAcc = P[i].OldAcc;

		  GravDataIndexTable[nexport].Task = j;
		  GravDataIndexTable[nexport].Index = i;
		  GravDataIndexTable[nexport].SortIndex = nexport;

		  nexport++;
		  nsend_local[j]++;
		}
	    }
	}

      qsort(GravDataIndexTable, nexport, sizeof(struct gravdata_index), grav_tree_compare_key);

      for(j = 0; j < nexport; j++)
	GravDataIn[j] = GravDataGet[GravDataIndexTable[j].SortIndex];

      for(j = 1, noffset[0] = 0; j < NTask; j++)
	noffset[j] = noffset[j - 1] + nsend_local[j - 1];

      MPI_Allgather(nsend_local, NTask, MPI_INT, nsend, NTask, MPI_INT, MPI_COMM_WORLD);

      /* now do the particles that need to be exported */

      for(level = 1; level < (1 << PTask); level++)
	{
	  for(j = 0; j < NTask; j++)
	    nbuffer[j] = 0;
	  for(ngrp = level; ngrp < (1 << PTask); ngrp++)
	    {
	      maxfill = 0;
	      for(j = 0; j < NTask; j++)
		{
		  if((j ^ ngrp) < NTask)
		    if(maxfill < nbuffer[j] + nsend[(j ^ ngrp) * NTask + j])
		      maxfill = nbuffer[j] + nsend[(j ^ ngrp) * NTask + j];
		}
	      if(maxfill >= All.BunchSizeForce)
		break;

	      sendTask = ThisTask;
	      recvTask = ThisTask ^ ngrp;

	      if(recvTask < NTask)
		{
		  if(nsend[ThisTask * NTask + recvTask] > 0 || nsend[recvTask * NTask + ThisTask] > 0)
		    {
		      /* get the particles */
		      MPI_Sendrecv(&GravDataIn[noffset[recvTask]],
				   nsend_local[recvTask] * sizeof(struct gravdata_in), MPI_BYTE,
				   recvTask, TAG_POTENTIAL_A,
				   &GravDataGet[nbuffer[ThisTask]],
				   nsend[recvTask * NTask + ThisTask] * sizeof(struct gravdata_in), MPI_BYTE,
				   recvTask, TAG_POTENTIAL_A, MPI_COMM_WORLD, &status);
		    }
		}

	      for(j = 0; j < NTask; j++)
		if((j ^ ngrp) < NTask)
		  nbuffer[j] += nsend[(j ^ ngrp) * NTask + j];
	    }

	  for(j = 0; j < nbuffer[ThisTask]; j++)
	    {
#ifndef PMGRID
	      force_treeevaluate_potential(j, 1);
#else
	      force_treeevaluate_potential_shortrange(j, 1);
#endif
	    }


	  /* get the result */
	  for(j = 0; j < NTask; j++)
	    nbuffer[j] = 0;
	  for(ngrp = level; ngrp < (1 << PTask); ngrp++)
	    {
	      maxfill = 0;
	      for(j = 0; j < NTask; j++)
		{
		  if((j ^ ngrp) < NTask)
		    if(maxfill < nbuffer[j] + nsend[(j ^ ngrp) * NTask + j])
		      maxfill = nbuffer[j] + nsend[(j ^ ngrp) * NTask + j];
		}
	      if(maxfill >= All.BunchSizeForce)
		break;

	      sendTask = ThisTask;
	      recvTask = ThisTask ^ ngrp;

	      if(recvTask < NTask)
		{
		  if(nsend[ThisTask * NTask + recvTask] > 0 || nsend[recvTask * NTask + ThisTask] > 0)
		    {
		      /* send the results */
		      MPI_Sendrecv(&GravDataResult[nbuffer[ThisTask]],
				   nsend[recvTask * NTask + ThisTask] * sizeof(struct gravdata_in),
				   MPI_BYTE, recvTask, TAG_POTENTIAL_B,
				   &GravDataOut[noffset[recvTask]],
				   nsend_local[recvTask] * sizeof(struct gravdata_in),
				   MPI_BYTE, recvTask, TAG_POTENTIAL_B, MPI_COMM_WORLD, &status);

		      /* add the result to the particles */
		      for(j = 0; j < nsend_local[recvTask]; j++)
			{
			  place = GravDataIndexTable[noffset[recvTask] + j].Index;

			  P[place].Potential += GravDataOut[j + noffset[recvTask]].v.Potential;
			}
		    }
		}

	      for(j = 0; j < NTask; j++)
		if((j ^ ngrp) < NTask)
		  nbuffer[j] += nsend[(j ^ ngrp) * NTask + j];
	    }

	  level = ngrp - 1;
	}

      MPI_Allgather(&ndone, 1, MPI_INT, ndonelist, 1, MPI_INT, MPI_COMM_WORLD);
      for(j = 0; j < NTask; j++)
	ntotleft -= ndonelist[j];
    }

  free(ndonelist);
  free(nsend);
  free(nsend_local);
  free(nbuffer);
  free(noffset);


  /* add correction to exclude self-potential */

  for(i = 0; i < NumPart; i++)
    {
      /* remove self-potential */
      P[i].Potential += P[i].Mass / All.SofteningTable[P[i].Type];

      if(All.ComovingIntegrationOn)
	if(All.PeriodicBoundariesOn)
	  P[i].Potential -= 2.8372975 * pow(P[i].Mass, 2.0 / 3) *
	    pow(All.Omega0 * 3 * All.Hubble * All.Hubble / (8 * M_PI * All.G), 1.0 / 3);
    }


  /* multiply with the gravitational constant */

  for(i = 0; i < NumPart; i++)
    P[i].Potential *= All.G;


#ifdef PMGRID

#ifdef PERIODIC
  pmpotential_periodic();
#ifdef PLACEHIGHRESREGION
  pmpotential_nonperiodic(1);
#endif
#else
  pmpotential_nonperiodic(0);
#ifdef PLACEHIGHRESREGION
  pmpotential_nonperiodic(1);
#endif
#endif

#endif



  if(All.ComovingIntegrationOn)
    {
#ifndef PERIODIC
      fac = -0.5 * All.Omega0 * All.Hubble * All.Hubble;

      for(i = 0; i < NumPart; i++)
	{
	  for(k = 0, r2 = 0; k < 3; k++)
	    r2 += P[i].Pos[k] * P[i].Pos[k];

	  P[i].Potential += fac * r2;
	}
#endif
    }
  else
    {
      fac = -0.5 * All.OmegaLambda * All.Hubble * All.Hubble;
      if(fac != 0)
	{
	  for(i = 0; i < NumPart; i++)
	    {
	      for(k = 0, r2 = 0; k < 3; k++)
		r2 += P[i].Pos[k] * P[i].Pos[k];

	      P[i].Potential += fac * r2;
	    }
	}
    }


  if(ThisTask == 0)
    {
      printf("potential done.\n");
      fflush(stdout);
    }

  t1 = second();

  All.CPU_Potential += timediff(t0, t1);

#else
  for(i = 0; i < NumPart; i++)
    P[i].Potential = 0;
#endif
}
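
The export phase above pairs tasks via recvTask = ThisTask ^ ngrp, so every pair of tasks meets exactly once as ngrp runs over the hypercube. The sketch below isolates just that pairing; the single-int payload and the tag choice are purely illustrative.

#include <mpi.h>

int main(int argc, char** argv)
{
  int rank, size, ptask, ngrp, partner;
  int send_val, recv_val;
  MPI_Status status;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  /* smallest power of two covering all tasks, as PTask does above */
  ptask = 0;
  while ((1 << ptask) < size)
    ptask++;

  for (ngrp = 1; ngrp < (1 << ptask); ngrp++)
    {
      partner = rank ^ ngrp;
      if (partner >= size)
        continue;               /* no such task when size is not a power of two */

      send_val = rank;          /* illustrative payload */
      recv_val = -1;

      /* both members of the pair issue the matching Sendrecv */
      MPI_Sendrecv(&send_val, 1, MPI_INT, partner, ngrp,
                   &recv_val, 1, MPI_INT, partner, ngrp,
                   MPI_COMM_WORLD, &status);
    }

  MPI_Finalize();
  return 0;
}
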
Example #4
/*! Calculates the long-range periodic force given the particle positions
 *  using the PM method.  The force is Gaussian filtered with Asmth, given in
 *  mesh-cell units. We carry out a CIC charge assignment, and compute the
 *  potential by Fourier transform methods. The potential is finite differenced
 *  using a 4-point finite differencing formula, and the forces are
 *  interpolated tri-linearly to the particle positions. The CIC kernel is
 *  deconvolved. Note that the particle distribution is not in the slab
 *  decomposition that is used for the FFT. Instead, overlapping patches
 *  between local domains and FFT slabs are communicated as needed.
 */
void pmforce_periodic(void)
{
  double k2, kx, ky, kz, smth;
  double dx, dy, dz;
  double fx, fy, fz, ff;
  double asmth2, fac, acc_dim;
  int i, j, slab, level, sendTask, recvTask;
  int x, y, z, xl, yl, zl, xr, yr, zr, xll, yll, zll, xrr, yrr, zrr, ip, dim;
  int slab_x, slab_y, slab_z;
  int slab_xx, slab_yy, slab_zz;
  int meshmin[3], meshmax[3], sendmin, sendmax, recvmin, recvmax;
  int rep, ncont, cont_sendmin[2], cont_sendmax[2], cont_recvmin[2], cont_recvmax[2];
  int dimx, dimy, dimz, recv_dimx, recv_dimy, recv_dimz;
  MPI_Status status;


  if(ThisTask == 0)
    {
      printf("Starting periodic PM calculation.\n");
      fflush(stdout);
    }

  #ifdef FFTW3
  if(fftw_plan_exists)
  {
	  /* macro defined in callgrind.h */
	  // CALLGRIND_START_INSTRUMENTATION;
  }
  #else
  // CALLGRIND_START_INSTRUMENTATION;
  #endif

  force_treefree();


  asmth2 = (2 * M_PI) * All.Asmth[0] / All.BoxSize;
  asmth2 *= asmth2;

  fac = All.G / (M_PI * All.BoxSize);	/* to get potential */
  fac *= 1 / (2 * All.BoxSize / PMGRID);	/* for finite differencing */

  /* first, establish the extension of the local patch in the PMGRID  */

  for(j = 0; j < 3; j++)
    {
      meshmin[j] = PMGRID;
      meshmax[j] = 0;
    }

  for(i = 0; i < NumPart; i++)
    {
      for(j = 0; j < 3; j++)
	{
	  slab = to_slab_fac * P[i].Pos[j];
	  if(slab >= PMGRID)
	    slab = PMGRID - 1;

	  if(slab < meshmin[j])
	    meshmin[j] = slab;

	  if(slab > meshmax[j])
	    meshmax[j] = slab;
	}
    }

  MPI_Allgather(meshmin, 3, MPI_INT, meshmin_list, 3, MPI_INT, MPI_COMM_WORLD);
  MPI_Allgather(meshmax, 3, MPI_INT, meshmax_list, 3, MPI_INT, MPI_COMM_WORLD);

  dimx = meshmax[0] - meshmin[0] + 2;
  dimy = meshmax[1] - meshmin[1] + 2;
  dimz = meshmax[2] - meshmin[2] + 2;

  pm_init_periodic_allocate((dimx + 4) * (dimy + 4) * (dimz + 4));

  #ifdef FFTW3
  if(!fftw_plan_exists)
  {
  	/* Create plan for in-place r2c DFT */
  	fft_forward_plan = fftw_mpi_plan_dft_r2c_3d(PMGRID, PMGRID, PMGRID, rhogrid, fft_of_rhogrid,
  												MPI_COMM_WORLD, FFTW_PATIENT | FFTW_MPI_TRANSPOSED_OUT);
  	fft_inverse_plan = fftw_mpi_plan_dft_c2r_3d(PMGRID, PMGRID, PMGRID, fft_of_rhogrid, rhogrid,
  												MPI_COMM_WORLD, FFTW_PATIENT | FFTW_MPI_TRANSPOSED_IN);
  	fftw_plan_exists = true;	// use C99 bool type
  	
  	if(ThisTask == 0)
  		printf("Created new FFTW3 plan.\n");
  } else {
  	/* do nothing, the plan has already been created by previous call to this function */
  }
  #endif

	/* For FFTW3, there is a different convention for fftsize for real-to-complex transforms, i.e.
		fftsize is the size of the complex data (number of complex values), NOT the size of the real data!
		We attempt to take care of this by defining fftsize to be fftsize_real when using FFTW3.  */

  for(i = 0; i < dimx * dimy * dimz; i++)
    workspace[i] = 0;
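
  /* Cloud-In-Cell (CIC) assignment: each particle deposits its mass onto the
   * eight mesh cells surrounding it, with weights (1-dx)(1-dy)(1-dz), ...,
   * dx*dy*dz given by its fractional position within the cell */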

  for(i = 0; i < NumPart; i++)
    {
      slab_x = to_slab_fac * P[i].Pos[0];
      if(slab_x >= PMGRID)
	slab_x = PMGRID - 1;
      dx = to_slab_fac * P[i].Pos[0] - slab_x;
      slab_x -= meshmin[0];
      slab_xx = slab_x + 1;

      slab_y = to_slab_fac * P[i].Pos[1];
      if(slab_y >= PMGRID)
	slab_y = PMGRID - 1;
      dy = to_slab_fac * P[i].Pos[1] - slab_y;
      slab_y -= meshmin[1];
      slab_yy = slab_y + 1;

      slab_z = to_slab_fac * P[i].Pos[2];
      if(slab_z >= PMGRID)
	slab_z = PMGRID - 1;
      dz = to_slab_fac * P[i].Pos[2] - slab_z;
      slab_z -= meshmin[2];
      slab_zz = slab_z + 1;

      workspace[(slab_x * dimy + slab_y) * dimz + slab_z] += P[i].Mass * (1.0 - dx) * (1.0 - dy) * (1.0 - dz);
      workspace[(slab_x * dimy + slab_yy) * dimz + slab_z] += P[i].Mass * (1.0 - dx) * dy * (1.0 - dz);
      workspace[(slab_x * dimy + slab_y) * dimz + slab_zz] += P[i].Mass * (1.0 - dx) * (1.0 - dy) * dz;
      workspace[(slab_x * dimy + slab_yy) * dimz + slab_zz] += P[i].Mass * (1.0 - dx) * dy * dz;

      workspace[(slab_xx * dimy + slab_y) * dimz + slab_z] += P[i].Mass * (dx) * (1.0 - dy) * (1.0 - dz);
      workspace[(slab_xx * dimy + slab_yy) * dimz + slab_z] += P[i].Mass * (dx) * dy * (1.0 - dz);
      workspace[(slab_xx * dimy + slab_y) * dimz + slab_zz] += P[i].Mass * (dx) * (1.0 - dy) * dz;
      workspace[(slab_xx * dimy + slab_yy) * dimz + slab_zz] += P[i].Mass * (dx) * dy * dz;
    }


  for(i = 0; i < fftsize; i++)	/* clear local density field */
    rhogrid[i] = 0;

  for(level = 0; level < (1 << PTask); level++)	/* note: for level=0, target is the same task */
    {
      sendTask = ThisTask;
      recvTask = ThisTask ^ level;
      if(recvTask < NTask)
	{
	  /* check how much we have to send */
	  sendmin = 2 * PMGRID;
	  sendmax = -1;
	  for(slab_x = meshmin[0]; slab_x < meshmax[0] + 2; slab_x++)
	    if(slab_to_task[slab_x % PMGRID] == recvTask)
	      {
		if(slab_x < sendmin)
		  sendmin = slab_x;
		if(slab_x > sendmax)
		  sendmax = slab_x;
	      }
	  if(sendmax == -1)
	    sendmin = 0;

	  /* check how much we have to receive */
	  recvmin = 2 * PMGRID;
	  recvmax = -1;
	  for(slab_x = meshmin_list[3 * recvTask]; slab_x < meshmax_list[3 * recvTask] + 2; slab_x++)
	    if(slab_to_task[slab_x % PMGRID] == sendTask)
	      {
		if(slab_x < recvmin)
		  recvmin = slab_x;
		if(slab_x > recvmax)
		  recvmax = slab_x;
	      }
	  if(recvmax == -1)
	    recvmin = 0;


	  if((recvmax - recvmin) >= 0 || (sendmax - sendmin) >= 0)	/* ok, we have a contribution to the slab */
	    {
	      recv_dimx = meshmax_list[3 * recvTask + 0] - meshmin_list[3 * recvTask + 0] + 2;
	      recv_dimy = meshmax_list[3 * recvTask + 1] - meshmin_list[3 * recvTask + 1] + 2;
	      recv_dimz = meshmax_list[3 * recvTask + 2] - meshmin_list[3 * recvTask + 2] + 2;

	      if(level > 0)
		{
		  MPI_Sendrecv(workspace + (sendmin - meshmin[0]) * dimy * dimz,
			       (sendmax - sendmin + 1) * dimy * dimz * sizeof(fftw_real), MPI_BYTE, recvTask,
			       TAG_PERIODIC_A, forcegrid,
			       (recvmax - recvmin + 1) * recv_dimy * recv_dimz * sizeof(fftw_real), MPI_BYTE,
			       recvTask, TAG_PERIODIC_A, MPI_COMM_WORLD, &status);
		}
	      else
		{
		  memcpy(forcegrid, workspace + (sendmin - meshmin[0]) * dimy * dimz,
			 (sendmax - sendmin + 1) * dimy * dimz * sizeof(fftw_real));
		}

	      for(slab_x = recvmin; slab_x <= recvmax; slab_x++)
		{
		  slab_xx = (slab_x % PMGRID) - first_slab_of_task[ThisTask];

		  if(slab_xx >= 0 && slab_xx < slabs_per_task[ThisTask])
		    {
		      for(slab_y = meshmin_list[3 * recvTask + 1];
			  slab_y <= meshmax_list[3 * recvTask + 1] + 1; slab_y++)
			{
			  slab_yy = slab_y;
			  if(slab_yy >= PMGRID)
			    slab_yy -= PMGRID;

			  for(slab_z = meshmin_list[3 * recvTask + 2];
			      slab_z <= meshmax_list[3 * recvTask + 2] + 1; slab_z++)
			    {
			      slab_zz = slab_z;
			      if(slab_zz >= PMGRID)
				slab_zz -= PMGRID;

			      rhogrid[PMGRID * PMGRID2 * slab_xx + PMGRID2 * slab_yy + slab_zz] +=
				forcegrid[((slab_x - recvmin) * recv_dimy +
					   (slab_y - meshmin_list[3 * recvTask + 1])) * recv_dimz +
					  (slab_z - meshmin_list[3 * recvTask + 2])];
			    }
			}
		    }
		}
	    }
	}
    }

  #ifdef DEBUG_FFT
  
  double norm_density = 0.;
  for(i = 0; i < fftsize; i++)
  {
  	norm_density += rhogrid[i]*rhogrid[i];
  }
  
  /* Print diagnostics of the density field before the forward FFT */
  if (ThisTask == 0) {
 	 /* Print the norm of the fft */
 	 printf("L2-norm of density: %f\n", norm_density);
 	 printf("First five values of density: %f, %f, %f, %f, %f\n", rhogrid[0],rhogrid[1],rhogrid[2],rhogrid[3], rhogrid[4]);
 	 
  }
  #endif

  /* Do the FFT of the density field */
  #ifdef FFTW3
  fftw_execute_dft_r2c(fft_forward_plan, rhogrid, fft_of_rhogrid);
  #else
  rfftwnd_mpi(fft_forward_plan, 1, rhogrid, workspace, FFTW_TRANSPOSED_ORDER);
  #endif

  #ifdef DEBUG_FFT
  
  double norm_complex = 0.;
  for(i = 0; i < fftsize; i++)
  {
  	norm_complex += rhogrid[i]*rhogrid[i];
  }
  
  /* Print diagnostics of the transformed (complex) density field */
  if (ThisTask == 0) {
 	 /* Print the norm of the fft */
 	 printf("L2-norm of complex rhogrid: %f\n", norm_complex);
 	 printf("First two values of complex fft: %f + i*%f, %f + i*%f\n", rhogrid[0],rhogrid[1],rhogrid[2],rhogrid[3]);
 	 
  }
  #endif

  /* multiply with Green's function for the potential */
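  /* in Fourier space the potential follows from the density as
   *   phi(k) ~ -exp(-k^2 * Asmth^2) / k^2 * rho(k)
   * (the remaining constants are carried by fac and the mesh units);
   * the factor ff^4 = 1/sinc^4 deconvolves the CIC kernel twice, once for
   * the charge assignment and once for the force interpolation */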

  for(y = slabstart_y; y < slabstart_y + nslab_y; y++)
    for(x = 0; x < PMGRID; x++)
      for(z = 0; z < PMGRID / 2 + 1; z++)
	{
	  if(x > PMGRID / 2)
	    kx = x - PMGRID;
	  else
	    kx = x;
	  if(y > PMGRID / 2)
	    ky = y - PMGRID;
	  else
	    ky = y;
	  if(z > PMGRID / 2)
	    kz = z - PMGRID;
	  else
	    kz = z;

	  k2 = kx * kx + ky * ky + kz * kz;

	  if(k2 > 0)
	    {
	      smth = -exp(-k2 * asmth2) / k2;

	      /* do deconvolution */

	      fx = fy = fz = 1;
	      if(kx != 0)
		{
		  fx = (M_PI * kx) / PMGRID;
		  fx = sin(fx) / fx;
		}
	      if(ky != 0)
		{
		  fy = (M_PI * ky) / PMGRID;
		  fy = sin(fy) / fy;
		}
	      if(kz != 0)
		{
		  fz = (M_PI * kz) / PMGRID;
		  fz = sin(fz) / fz;
		}
	      ff = 1 / (fx * fy * fz);
	      smth *= ff * ff * ff * ff;

	      /* end deconvolution */

	      ip = PMGRID * (PMGRID / 2 + 1) * (y - slabstart_y) + (PMGRID / 2 + 1) * x + z;
	      c_re(fft_of_rhogrid[ip]) *= smth;
	      c_im(fft_of_rhogrid[ip]) *= smth;
	    }
	}

  if(slabstart_y == 0)
    c_re(fft_of_rhogrid[0]) = c_im(fft_of_rhogrid[0]) = 0.0;

  /* Do the FFT to get the potential */
  #ifdef FFTW3
  fftw_execute_dft_c2r(fft_inverse_plan, fft_of_rhogrid, rhogrid);
  
  /* Now normalize the output 
  for(i = 0; i < fftsize; i++)
	rhogrid[i] = rhogrid[i] / (PMGRID*PMGRID*PMGRID); */
  #else
  rfftwnd_mpi(fft_inverse_plan, 1, rhogrid, workspace, FFTW_TRANSPOSED_ORDER);
  #endif
  
  #ifdef DEBUG_FFT
  
  double norm = 0.;
  for(i = 0; i < fftsize; i++)
  {
  	norm += rhogrid[i]*rhogrid[i];
  }
  
  /* Write out rhogrid to a 'fft-snapshot' file */
  if (ThisTask == 0) {
 	 FILE *fp;
 	 
 	 /* Print the norm of the fft */
 	 printf("L2-norm of rhogrid: %f\n", norm);
 	 printf("First five values of fft: %f, %f, %f, %f, %f\n\n", rhogrid[0],rhogrid[1],rhogrid[2],rhogrid[3],rhogrid[4]);
 	 
 	 /* fp = fopen("rhogrid.0", "wb");  add suffix to indicate which node this is */
 	 /* fwrite(rhogrid, sizeof(rhogrid[0]), fftsize, fp);*/
 	 /* fclose(fp); */
  }
  #endif

  /* Now rhogrid holds the potential */
  /* construct the potential for the local patch */


  dimx = meshmax[0] - meshmin[0] + 6;
  dimy = meshmax[1] - meshmin[1] + 6;
  dimz = meshmax[2] - meshmin[2] + 6;

  for(level = 0; level < (1 << PTask); level++)	/* note: for level=0, target is the same task */
    {
      sendTask = ThisTask;
      recvTask = ThisTask ^ level;

      if(recvTask < NTask)
	{

	  /* check how much we have to send */
	  sendmin = 2 * PMGRID;
	  sendmax = -PMGRID;
	  for(slab_x = meshmin_list[3 * recvTask] - 2; slab_x < meshmax_list[3 * recvTask] + 4; slab_x++)
	    if(slab_to_task[(slab_x + PMGRID) % PMGRID] == sendTask)
	      {
		if(slab_x < sendmin)
		  sendmin = slab_x;
		if(slab_x > sendmax)
		  sendmax = slab_x;
	      }
	  if(sendmax == -PMGRID)
	    sendmin = sendmax + 1;


	  /* check how much we have to receive */
	  recvmin = 2 * PMGRID;
	  recvmax = -PMGRID;
	  for(slab_x = meshmin[0] - 2; slab_x < meshmax[0] + 4; slab_x++)
	    if(slab_to_task[(slab_x + PMGRID) % PMGRID] == recvTask)
	      {
		if(slab_x < recvmin)
		  recvmin = slab_x;
		if(slab_x > recvmax)
		  recvmax = slab_x;
	      }
	  if(recvmax == -PMGRID)
	    recvmin = recvmax + 1;

	  if((recvmax - recvmin) >= 0 || (sendmax - sendmin) >= 0)	/* ok, we have a contribution to the slab */
	    {
	      recv_dimx = meshmax_list[3 * recvTask + 0] - meshmin_list[3 * recvTask + 0] + 6;
	      recv_dimy = meshmax_list[3 * recvTask + 1] - meshmin_list[3 * recvTask + 1] + 6;
	      recv_dimz = meshmax_list[3 * recvTask + 2] - meshmin_list[3 * recvTask + 2] + 6;

	      ncont = 1;
	      cont_sendmin[0] = sendmin;
	      cont_sendmax[0] = sendmax;
	      cont_sendmin[1] = sendmax + 1;
	      cont_sendmax[1] = sendmax;

	      cont_recvmin[0] = recvmin;
	      cont_recvmax[0] = recvmax;
	      cont_recvmin[1] = recvmax + 1;
	      cont_recvmax[1] = recvmax;

	      for(slab_x = sendmin; slab_x <= sendmax; slab_x++)
		{
		  if(slab_to_task[(slab_x + PMGRID) % PMGRID] != ThisTask)
		    {
		      /* non-contiguous */
		      cont_sendmax[0] = slab_x - 1;
		      while(slab_to_task[(slab_x + PMGRID) % PMGRID] != ThisTask)
			slab_x++;
		      cont_sendmin[1] = slab_x;
		      ncont++;
		    }
		}

	      for(slab_x = recvmin; slab_x <= recvmax; slab_x++)
		{
		  if(slab_to_task[(slab_x + PMGRID) % PMGRID] != recvTask)
		    {
		      /* non-contiguous */
		      cont_recvmax[0] = slab_x - 1;
		      while(slab_to_task[(slab_x + PMGRID) % PMGRID] != recvTask)
			slab_x++;
		      cont_recvmin[1] = slab_x;
		      if(ncont == 1)
			ncont++;
		    }
		}


	      for(rep = 0; rep < ncont; rep++)
		{
		  sendmin = cont_sendmin[rep];
		  sendmax = cont_sendmax[rep];
		  recvmin = cont_recvmin[rep];
		  recvmax = cont_recvmax[rep];

		  /* prepare what we want to send */
		  if(sendmax - sendmin >= 0)
		    {
		      for(slab_x = sendmin; slab_x <= sendmax; slab_x++)
			{
			  slab_xx = ((slab_x + PMGRID) % PMGRID) - first_slab_of_task[ThisTask];

			  for(slab_y = meshmin_list[3 * recvTask + 1] - 2;
			      slab_y < meshmax_list[3 * recvTask + 1] + 4; slab_y++)
			    {
			      slab_yy = (slab_y + PMGRID) % PMGRID;

			      for(slab_z = meshmin_list[3 * recvTask + 2] - 2;
				  slab_z < meshmax_list[3 * recvTask + 2] + 4; slab_z++)
				{
				  slab_zz = (slab_z + PMGRID) % PMGRID;

				  forcegrid[((slab_x - sendmin) * recv_dimy +
					     (slab_y - (meshmin_list[3 * recvTask + 1] - 2))) * recv_dimz +
					    slab_z - (meshmin_list[3 * recvTask + 2] - 2)] =
				    rhogrid[PMGRID * PMGRID2 * slab_xx + PMGRID2 * slab_yy + slab_zz];
				}
			    }
			}
		    }

		  if(level > 0)
		    {
		      MPI_Sendrecv(forcegrid,
				   (sendmax - sendmin + 1) * recv_dimy * recv_dimz * sizeof(fftw_real),
				   MPI_BYTE, recvTask, TAG_PERIODIC_B,
				   workspace + (recvmin - (meshmin[0] - 2)) * dimy * dimz,
				   (recvmax - recvmin + 1) * dimy * dimz * sizeof(fftw_real), MPI_BYTE,
				   recvTask, TAG_PERIODIC_B, MPI_COMM_WORLD, &status);
		    }
		  else
		    {
		      memcpy(workspace + (recvmin - (meshmin[0] - 2)) * dimy * dimz,
			     forcegrid, (recvmax - recvmin + 1) * dimy * dimz * sizeof(fftw_real));
		    }
		}
	    }
	}
    }


  dimx = meshmax[0] - meshmin[0] + 2;
  dimy = meshmax[1] - meshmin[1] + 2;
  dimz = meshmax[2] - meshmin[2] + 2;

  recv_dimx = meshmax[0] - meshmin[0] + 6;
  recv_dimy = meshmax[1] - meshmin[1] + 6;
  recv_dimz = meshmax[2] - meshmin[2] + 6;


  for(dim = 0; dim < 3; dim++)	/* Calculate each component of the force. */
    {
      /* get the force component by finite differencing the potential */
      /* note: "workspace" now contains the potential for the local patch, plus a sufficiently large buffer region */
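      /* 4-point centred difference: with h = BoxSize/PMGRID and the 1/(2h)
       * factor already folded into fac,
       *   -dphi/dx ~ [ 8*(phi(x-h) - phi(x+h)) - (phi(x-2h) - phi(x+2h)) ] / (12h),
       * written below as fac * ( (4/3)*(phi_l - phi_r) - (1/6)*(phi_ll - phi_rr) ) */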

      for(x = 0; x < meshmax[0] - meshmin[0] + 2; x++)
	for(y = 0; y < meshmax[1] - meshmin[1] + 2; y++)
	  for(z = 0; z < meshmax[2] - meshmin[2] + 2; z++)
	    {
	      xrr = xll = xr = xl = x;
	      yrr = yll = yr = yl = y;
	      zrr = zll = zr = zl = z;

	      switch (dim)
		{
		case 0:
		  xr = x + 1;
		  xrr = x + 2;
		  xl = x - 1;
		  xll = x - 2;
		  break;
		case 1:
		  yr = y + 1;
		  yl = y - 1;
		  yrr = y + 2;
		  yll = y - 2;
		  break;
		case 2:
		  zr = z + 1;
		  zl = z - 1;
		  zrr = z + 2;
		  zll = z - 2;
		  break;
		}

	      forcegrid[(x * dimy + y) * dimz + z]
		=
		fac * ((4.0 / 3) *
		       (workspace[((xl + 2) * recv_dimy + (yl + 2)) * recv_dimz + (zl + 2)]
			- workspace[((xr + 2) * recv_dimy + (yr + 2)) * recv_dimz + (zr + 2)]) -
		       (1.0 / 6) *
		       (workspace[((xll + 2) * recv_dimy + (yll + 2)) * recv_dimz + (zll + 2)] -
			workspace[((xrr + 2) * recv_dimy + (yrr + 2)) * recv_dimz + (zrr + 2)]));
	    }

      /* read out the forces */

      for(i = 0; i < NumPart; i++)
	{
	  slab_x = to_slab_fac * P[i].Pos[0];
	  if(slab_x >= PMGRID)
	    slab_x = PMGRID - 1;
	  dx = to_slab_fac * P[i].Pos[0] - slab_x;
	  slab_x -= meshmin[0];
	  slab_xx = slab_x + 1;

	  slab_y = to_slab_fac * P[i].Pos[1];
	  if(slab_y >= PMGRID)
	    slab_y = PMGRID - 1;
	  dy = to_slab_fac * P[i].Pos[1] - slab_y;
	  slab_y -= meshmin[1];
	  slab_yy = slab_y + 1;

	  slab_z = to_slab_fac * P[i].Pos[2];
	  if(slab_z >= PMGRID)
	    slab_z = PMGRID - 1;
	  dz = to_slab_fac * P[i].Pos[2] - slab_z;
	  slab_z -= meshmin[2];
	  slab_zz = slab_z + 1;

	  acc_dim =
	    forcegrid[(slab_x * dimy + slab_y) * dimz + slab_z] * (1.0 - dx) * (1.0 - dy) * (1.0 - dz);
	  acc_dim += forcegrid[(slab_x * dimy + slab_yy) * dimz + slab_z] * (1.0 - dx) * dy * (1.0 - dz);
	  acc_dim += forcegrid[(slab_x * dimy + slab_y) * dimz + slab_zz] * (1.0 - dx) * (1.0 - dy) * dz;
	  acc_dim += forcegrid[(slab_x * dimy + slab_yy) * dimz + slab_zz] * (1.0 - dx) * dy * dz;

	  acc_dim += forcegrid[(slab_xx * dimy + slab_y) * dimz + slab_z] * (dx) * (1.0 - dy) * (1.0 - dz);
	  acc_dim += forcegrid[(slab_xx * dimy + slab_yy) * dimz + slab_z] * (dx) * dy * (1.0 - dz);
	  acc_dim += forcegrid[(slab_xx * dimy + slab_y) * dimz + slab_zz] * (dx) * (1.0 - dy) * dz;
	  acc_dim += forcegrid[(slab_xx * dimy + slab_yy) * dimz + slab_zz] * (dx) * dy * dz;

	  P[i].GravPM[dim] = acc_dim;
	}
    }

  pm_init_periodic_free();
  force_treeallocate(All.TreeAllocFactor * All.MaxPart, All.MaxPart);

  All.NumForcesSinceLastDomainDecomp = 1 + All.TotNumPart * All.TreeDomainUpdateFrequency;

  if(ThisTask == 0)
    {
      printf("done PM.\n");
      fflush(stdout);
    }
  
  #ifdef FFTW3
  if(fftw_plan_exists)
  {
	  /* macro defined in callgrind.h */
	  // CALLGRIND_STOP_INSTRUMENTATION;
  }
  #else
  // CALLGRIND_STOP_INSTRUMENTATION;
  #endif
}
Example #5
//removes the lowest energy vertical seam from the image
void removeVerticalSeam() {
    double energies[3];
    double min_energy;
    int prev_x;
    int prev_y;
    
    // split up work between processes
    double *my_path_costs;
    double *my_previous_x;
    double *my_previous_y;
    double *temp_path_costs;
    double *temp_previous_x;
    double *temp_previous_y;
    int my_cols = current_width / numprocs;
    int low_cols = my_cols;
    int extra_cols = current_width % numprocs;
    int start;
    int x_offset;
    int recv_cols;

    double left_end_cost, right_end_cost, temp_end_cost;
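
    //distribute the columns in contiguous blocks: the first extra_cols ranks
    //get one extra column (my_cols = low_cols + 1), the remaining ranks get low_cols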

    if (rank < extra_cols) {
        my_cols++;
        start = rank * my_cols;
    } else {
        start = (extra_cols * (my_cols + 1)) + ((rank - extra_cols) * my_cols);
    }

    //printf("%d %d %d\n", rank, start, my_cols);
    
    my_path_costs = (double *) malloc(my_cols * current_height * sizeof(double));
    my_previous_x = (double *) malloc(my_cols * current_height * sizeof(double));
    my_previous_y = (double *) malloc(my_cols * current_height * sizeof(double));
    //find the lowest cost seam by computing the lowest cost paths to each pixel
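    //dynamic programming recurrence: for y > 0,
    //  path_costs[x][y] = image_energy[x][y]
    //                     + min(path_costs[x-1][y-1], path_costs[x][y-1], path_costs[x+1][y-1])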
    for (int y = 0; y < current_height; y++) {
        //compute the path costs for my columns     
        for (int x = start; x < start + my_cols; x++) {
            //printf("%d %d %d %d %d\n", rank, x, y, (x - start) * current_height + y, my_cols * current_height);
            if (y == 0) {
                path_costs[x * initial_height] = image_energy[x * initial_height];
                my_path_costs[(x - start) * current_height + y] = path_costs[x * initial_height];

                previous_x[x * initial_height] = -1;
                my_previous_x[(x - start) * current_height + y] = previous_x[x * initial_height];

                previous_y[x * initial_height] = -1;
                my_previous_y[(x - start) * current_height + y] = previous_y[x * initial_height];
            } else {
                //the pixel directly above
                energies[1] = path_costs[x * initial_height + y - 1];
                //pixel above to the left
                if (x != 0) {
                    energies[0] = path_costs[(x - 1) * initial_height + y - 1];
                } else {
                    energies[0] = DBL_MAX;
                }
                //pixel above to the right
                if (x != current_width - 1) {
                    energies[2] = path_costs[(x + 1) * initial_height + y - 1];
                } else {
                    energies[2] = DBL_MAX;
                }

                //find the one with the least path cost
                min_energy = energies[0];
                prev_x = x - 1;
                prev_y = y - 1;
                if (energies[1] < min_energy) {
                    min_energy = energies[1];
                    prev_x = x;
                }
                if (energies[2] < min_energy) {
                    min_energy = energies[2];
                    prev_x = x + 1;
                }

                //set the minimum path cost for this pixel
                path_costs[x * initial_height + y] = min_energy + image_energy[x * initial_height + y];
                my_path_costs[(x - start) * current_height + y] = path_costs[x * initial_height + y];

                //set the previous pixel on the minimum path's coordinates for this pixel
                previous_x[x * initial_height + y] = prev_x;
                my_previous_x[(x - start) * current_height + y] = previous_x[x * initial_height + y];

                previous_y[x * initial_height + y] = prev_y;
                my_previous_y[(x - start) * current_height + y] = previous_y[x * initial_height + y];
            }
        }

        //send path cost needed to neighboring processes
        if (numprocs > 1) {
            if (rank != numprocs - 1) {
                //send rightmost cost to following process
                right_end_cost = path_costs[(start + my_cols - 1) * initial_height + y];        
                MPI_Send(&right_end_cost, 1, MPI_DOUBLE, rank + 1, 0, MPI_COMM_WORLD);

                //receive following process's leftmost cost
                MPI_Recv(&temp_end_cost, 1, MPI_DOUBLE, rank + 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                path_costs[(start + my_cols) * initial_height + y] = temp_end_cost;
            }
            if (rank != 0) {
                //send leftmost cost to preceding process
                left_end_cost = path_costs[start * initial_height + y];
                MPI_Send(&left_end_cost, 1, MPI_DOUBLE, rank - 1, 0, MPI_COMM_WORLD);

                //receive preceding process's rightmost cost
                MPI_Recv(&temp_end_cost, 1, MPI_DOUBLE, rank - 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                path_costs[(start - 1) * initial_height + y] = temp_end_cost;
            }            
        }
    }

    //update path costs and previous for all processes
    for (int i = 0; i < numprocs; i++) {
        if (rank == i) {
            continue;
        }

        if (i < extra_cols) {
            x_offset = i * (low_cols + 1);
            recv_cols = (low_cols + 1);
        } else {
            x_offset = (extra_cols * (low_cols + 1)) + ((i - extra_cols) * low_cols);
            recv_cols = low_cols;
        }

        //printf("%d %d\n", low_cols, extra_cols);
        //printf("%d %d %d\n", rank, x_offset, recv_cols);

        temp_path_costs = (double *) malloc(recv_cols * current_height * sizeof(double));
        temp_previous_x = (double *) malloc(recv_cols * current_height * sizeof(double));
        temp_previous_y = (double *) malloc(recv_cols * current_height * sizeof(double));
        MPI_Sendrecv(my_path_costs, my_cols * current_height, MPI_DOUBLE, i, 0, 
            temp_path_costs, recv_cols * current_height, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, 
            MPI_STATUS_IGNORE);
        MPI_Sendrecv(my_previous_x, my_cols * current_height, MPI_DOUBLE, i, 1, 
            temp_previous_x, recv_cols * current_height, MPI_DOUBLE, i, 1, MPI_COMM_WORLD, 
            MPI_STATUS_IGNORE);
        MPI_Sendrecv(my_previous_y, my_cols * current_height, MPI_DOUBLE, i, 2, 
            temp_previous_y, recv_cols * current_height, MPI_DOUBLE, i, 2, MPI_COMM_WORLD, 
            MPI_STATUS_IGNORE);
        //merge the received columns back into the full-size global arrays
        for (int j = 0; j < recv_cols * current_height; j++) {
            int x = x_offset + (j % recv_cols);
            int y = j / recv_cols;
            //printf("%d %d %d %d %d\n", rank, x, y, x * initial_height + y, recv_cols * current_height);
            //printf("%d\n", initial_height * initial_width);
            path_costs[x * initial_height + y] = temp_path_costs[(x - x_offset) * current_height + y];
            previous_x[x * initial_height + y] = temp_previous_x[(x - x_offset) * current_height + y];
            previous_y[x * initial_height + y] = temp_previous_y[(x - x_offset) * current_height + y];
        }
        free(temp_path_costs);
        free(temp_previous_x);
        free(temp_previous_y);
    }
    free(my_path_costs);
    free(my_previous_x);
    free(my_previous_y);

    //printf("here\n");
    
    //find the x coordinate at which the lowest-cost seam reaches the bottom row of the current image
    int x_coord = 0;
    for (int x = 0; x < current_width; x++) {
        if (path_costs[x * initial_height + current_height - 1] < path_costs[x_coord * initial_height + current_height - 1]) {
            x_coord = x;
        }
    }

    //printf("here\n");


    //delete the seam from the bottom up
    for (int y = current_height - 1; y >= 0; y--) {
        //delete this pixel by copying over it and all those following to the right
        for (int x = x_coord; x < current_width - 1; x++) {
            image[x * initial_height + y] = image[(x + 1) * initial_height + y];
        }
        //next pixel
        //printf("%d\n", x_coord * initial_height + y);
        x_coord = previous_x[x_coord * initial_height + y];
        //printf("%d %d\n", rank, x_coord);
    }

    //decrease the current width of the image
    current_width--;
}
Example #6
void star_density(void)
{
  int j;

#ifdef EDDINGTON_TENSOR_STARS  
  int i, dummy;
  int ngrp, sendTask, recvTask, place, nexport, nimport, ndone, ndone_flag;
#endif

  /* clear Je in all gas particles */

  for(j = 0; j < N_gas; j++)
    {
      if(P[j].Type == 0)
	SphP[j].Je = 0;

#ifdef SFR
      if(P[j].Type == 0)
	{
	  SphP[j].Je += SphP[j].Sfr * All.IonizingLumPerSFR *
	    (PROTONMASS / (P[j].Mass * All.UnitMass_in_g / All.HubbleParam)) * All.UnitTime_in_s /
	    All.HubbleParam;	  
	}
#endif
    }

#ifdef EDDINGTON_TENSOR_STARS

  /* allocate buffers to arrange communication */

  Ngblist = (int *) mymalloc(NumPart * sizeof(int));

  All.BunchSize =
    (int) ((All.BufferSize * 1024 * 1024) / (sizeof(struct data_index) + sizeof(struct data_nodelist) +
					     2 * sizeof(struct stardata_in)));
  DataIndexTable = (struct data_index *) mymalloc(All.BunchSize * sizeof(struct data_index));
  DataNodeList = (struct data_nodelist *) mymalloc(All.BunchSize * sizeof(struct data_nodelist));

  i = FirstActiveParticle;	/* begin with this index */

  do
    {
      for(j = 0; j < NTask; j++)
	{
	  Send_count[j] = 0;
	  Exportflag[j] = -1;
	}

      /* do local particles and prepare export list */
      for(nexport = 0; i >= 0; i = NextActiveParticle[i])
	{
	  if(P[i].Type == 4)
	    {
	      if(star_density_evaluate(i, 0, &nexport, Send_count) < 0)
		break;
	    }
	}

#ifdef MYSORT
      mysort_dataindex(DataIndexTable, nexport, sizeof(struct data_index), data_index_compare);
#else
      qsort(DataIndexTable, nexport, sizeof(struct data_index), data_index_compare);
#endif

      MPI_Allgather(Send_count, NTask, MPI_INT, Sendcount_matrix, NTask, MPI_INT, MPI_COMM_WORLD);

      for(j = 0, nimport = 0, Recv_offset[0] = 0, Send_offset[0] = 0; j < NTask; j++)
	{
	  Recv_count[j] = Sendcount_matrix[j * NTask + ThisTask];
	  nimport += Recv_count[j];

	  if(j > 0)
	    {
	      Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1];
	      Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1];
	    }
	}

      StarDataGet = (struct stardata_in *) mymalloc(nimport * sizeof(struct stardata_in));
      StarDataIn = (struct stardata_in *) mymalloc(nexport * sizeof(struct stardata_in));

      /* prepare particle data for export */
      for(j = 0; j < nexport; j++)
	{
	  place = DataIndexTable[j].Index;

	  StarDataIn[j].Pos[0] = P[place].Pos[0];
	  StarDataIn[j].Pos[1] = P[place].Pos[1];
	  StarDataIn[j].Pos[2] = P[place].Pos[2];
	  StarDataIn[j].Hsml = PPP[place].Hsml;
	  StarDataIn[j].Density = P[place].DensAroundStar;
	  StarDataIn[j].Mass = P[place].Mass;

	  memcpy(StarDataIn[j].NodeList,
		 DataNodeList[DataIndexTable[j].IndexGet].NodeList, NODELISTLENGTH * sizeof(int));

	}

      /* exchange particle data */
      for(ngrp = 1; ngrp < (1 << PTask); ngrp++)
	{
	  sendTask = ThisTask;
	  recvTask = ThisTask ^ ngrp;

	  if(recvTask < NTask)
	    {
	      if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
		{
		  /* get the particles */
		  MPI_Sendrecv(&StarDataIn[Send_offset[recvTask]],
			       Send_count[recvTask] * sizeof(struct stardata_in), MPI_BYTE,
			       recvTask, TAG_DENS_A,
			       &StarDataGet[Recv_offset[recvTask]],
			       Recv_count[recvTask] * sizeof(struct stardata_in), MPI_BYTE,
			       recvTask, TAG_DENS_A, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
		}
	    }
	}

      myfree(StarDataIn);


      /* now do the particles that were sent to us */

      for(j = 0; j < nimport; j++)
	star_density_evaluate(j, 1, &dummy, &dummy);

      /* check whether this is the last iteration */
      if(i < 0)
	ndone_flag = 1;
      else
	ndone_flag = 0;

      MPI_Allreduce(&ndone_flag, &ndone, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

      myfree(StarDataGet);
    }
  while(ndone < NTask);

  myfree(DataNodeList);
  myfree(DataIndexTable);
  myfree(Ngblist);

#endif //for EDDINGTON_TENSOR_STARS

}
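
The communication bookkeeping above follows a common pattern: each task fills Send_count per destination, an MPI_Allgather distributes the full count matrix, and prefix sums turn the counts into buffer offsets. The reduced sketch below shows only that bookkeeping; the counts are invented, and a real code would go on to pack particle data at the computed offsets.

#include <mpi.h>
#include <stdlib.h>

int main(int argc, char** argv)
{
  int rank, ntask, j, nimport;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &ntask);

  int *send_count   = calloc(ntask, sizeof(int));
  int *recv_count   = calloc(ntask, sizeof(int));
  int *send_offset  = calloc(ntask, sizeof(int));
  int *recv_offset  = calloc(ntask, sizeof(int));
  int *count_matrix = calloc(ntask * ntask, sizeof(int));

  /* pretend we want to export (rank + j) items to every task j */
  for (j = 0; j < ntask; j++)
    send_count[j] = rank + j;

  /* every task learns the full ntask x ntask count matrix */
  MPI_Allgather(send_count, ntask, MPI_INT, count_matrix, ntask, MPI_INT, MPI_COMM_WORLD);

  for (j = 0, nimport = 0; j < ntask; j++)
    {
      recv_count[j] = count_matrix[j * ntask + rank];  /* what task j sends to us */
      nimport += recv_count[j];                        /* sizes the import buffer */

      if (j > 0)
        {
          send_offset[j] = send_offset[j - 1] + send_count[j - 1];
          recv_offset[j] = recv_offset[j - 1] + recv_count[j - 1];
        }
    }

  /* send_offset/recv_offset now index the packed send and receive buffers
   * used in a pairwise MPI_Sendrecv loop like the one above */

  free(send_count); free(recv_count);
  free(send_offset); free(recv_offset); free(count_matrix);

  MPI_Finalize();
  return 0;
}
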
Example #7
int main( int argc, char **argv )
{
    MPI_Datatype *types;
    void         **inbufs, **outbufs;
    char         **names;
    int          *counts, *bytesize, ntype;
    MPI_Comm     comms[20];
    int          ncomm = 20, rank, np, partner, tag;
    int          i, j, k, err, toterr, world_rank, errloc;
    MPI_Status   status, statuses[2];
    int          flag, index;
    char         *obuf;
    MPI_Request  requests[2];


    MPI_Init( &argc, &argv );

    AllocateForData( &types, &inbufs, &outbufs, &counts, &bytesize, 
		     &names, &ntype );
    GenerateData( types, inbufs, outbufs, counts, bytesize, names, &ntype );

    MPI_Comm_rank( MPI_COMM_WORLD, &world_rank );
    MakeComms( comms, 20, &ncomm, 0 );

	
/* Test over a wide range of datatypes and communicators */
    err = 0;
    for (i=0; i<ncomm; i++) {
	MPI_Comm_rank( comms[i], &rank );
	MPI_Comm_size( comms[i], &np );
	if (np < 2) continue;
	tag = i;
	for (j=0; j<ntype; j++) {
		if (world_rank == 0){ 
/* SI: make size of output independent of number of processes */
		  if (i<2) fprintf( stdout, "Testing type %s\n",names[j] );
		}
	    /* This test does an irsend between both partners, with 
	       a sendrecv after the irecv used to guarantee that the
	       irsend has a matching receive
	       */
	    if (rank == 0) {
		partner = np - 1;
#if 0
		MPIR_PrintDatatypePack( stdout, counts[j], types[j], 0, 0 );
#endif
		obuf = outbufs[j];
		for (k=0; k<bytesize[j]; k++) 
		    obuf[k] = 0;
	    
		MPI_Irecv(outbufs[j], counts[j], types[j], partner, tag, 
			  comms[i], &requests[0] );

		MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, ncomm+i, 
			      MPI_BOTTOM, 0, MPI_INT, partner, ncomm+i, 
			      comms[i], &status );

		MPI_Irsend( inbufs[j], counts[j], types[j], partner, tag, 
			    comms[i], &requests[1] );
	    
		do {
		    MPI_Waitany( 2, requests, &index, &status );
		} while (index != 0);

		/* There is always the possibility that the Irsend is still waiting */
		MPI_Waitall( 2, requests, statuses );
		if ((errloc = CheckData( inbufs[j], outbufs[j], bytesize[j] ))) {
		    char *p1, *p2;
		    fprintf( stderr, 
			     "Error in data with type %s (type %d on %d) at byte %d\n", 
			     names[j], j, world_rank, errloc - 1 );
		    p1 = (char *)inbufs[j];
		    p2 = (char *)outbufs[j];
		    fprintf( stderr, 
			     "Got %x expected %x\n", p1[errloc-1], p2[errloc-1] );
		    err++;
#if 0
		    MPIR_PrintDatatypeUnpack( stderr, counts[j], types[j], 
					      0, 0 );
#endif
		}
	    }
	    else if (rank == np - 1) {
		partner = 0;
		obuf = outbufs[j];
		for (k=0; k<bytesize[j]; k++) 
		    obuf[k] = 0;
	    
		MPI_Irecv(outbufs[j], counts[j], types[j], partner, tag, 
			  comms[i], &requests[0] );

		MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, ncomm+i, 
			      MPI_BOTTOM, 0, MPI_INT, partner, ncomm+i, 
			      comms[i], &status );

		/* Wait for irecv to complete */
		do {
		    MPI_Test( &requests[0], &flag, &status );
		} while (!flag);
		if ((errloc = CheckData( inbufs[j], outbufs[j], bytesize[j] ))) {
		    char *p1, *p2;
		    fprintf( stderr, 
			     "Error in data with type %s (type %d on %d) at byte %d\n", 
			     names[j], j, world_rank, errloc - 1 );
		    p1 = (char *)inbufs[j];
		    p2 = (char *)outbufs[j];
		    fprintf( stderr, 
			     "Got %x expected %x\n", p1[errloc-1], p2[errloc-1] );
		    err++;
#if 0
		    MPIR_PrintDatatypeUnpack( stderr, counts[j], types[j], 
					      0, 0 );
#endif
		}

		MPI_Irsend( inbufs[j], counts[j], types[j], partner, tag, 
			    comms[i], &requests[1] );
	    
		MPI_Waitall(1, &requests[1], &status );
	    }
	}
    }

    if (err > 0) {
	fprintf( stderr, "%d errors on %d\n", err, rank );
    }
    MPI_Allreduce( &err, &toterr, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD );
    if (world_rank == 0) {
	if (toterr == 0) {
	    printf( " No Errors\n" );
	}
	else {
	    printf (" Found %d errors\n", toterr );
	}
    }
    FreeDatatypes( types, inbufs, outbufs, counts, bytesize, names, ntype );
    FreeComms( comms, ncomm );
    MPI_Finalize();

    return err;
}
Example #8
int main (int argc, char *argv[])
{
  int my_rank, size;
  int right, left;

  int   int_send_buf, int_recv_buf, int_sum, i;
  float float_send_buf, float_recv_buf, float_sum;

  int          array_of_blocklengths[COUNT];
  MPI_Aint     array_of_displacements[COUNT], first_var_address, second_var_address;
  MPI_Datatype array_of_types[COUNT], sendtype, recvtype;

  MPI_Status  status;


  /* Get process and neighbour info. */
  MPI_Init(&argc, &argv);

  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  right = (my_rank+1)      % size;
  left  = (my_rank-1+size) % size;
/* ... this SPMD-style neighbor computation with modulo has the same meaning as: */
/* right = my_rank + 1;          */
/* if (right == size) right = 0; */
/* left = my_rank - 1;           */
/* if (left == -1) left = size-1;*/

  /* Set MPI datatypes for sending and receiving partial sums. */

  array_of_blocklengths[0] = 1;
  array_of_blocklengths[1] = 1;

  MPI_Get_address(&int_send_buf, &first_var_address);
  MPI_Get_address(&float_send_buf, &second_var_address);

  array_of_displacements[0] = (MPI_Aint) 0;
  array_of_displacements[1] = second_var_address - first_var_address;

  array_of_types[0] = MPI_INT;
  array_of_types[1] = MPI_FLOAT;

  MPI_Type_create_struct(COUNT, array_of_blocklengths, array_of_displacements, array_of_types, &sendtype);

  MPI_Type_commit(&sendtype);

  MPI_Get_address(&int_recv_buf, &first_var_address);
  MPI_Get_address(&float_recv_buf, &second_var_address);

  array_of_displacements[0] = (MPI_Aint) 0;
  array_of_displacements[1] = second_var_address - first_var_address;

  MPI_Type_create_struct(COUNT, array_of_blocklengths, array_of_displacements, array_of_types, &recvtype);

  MPI_Type_commit(&recvtype);

  /* Compute global sum. */
  int_sum = 0;
  float_sum = 0;
  int_send_buf = my_rank;
  float_send_buf = (float) my_rank;

  for( i = 0; i < size; i++) 
  {
    MPI_Sendrecv(&int_send_buf, 1, sendtype, right, to_right,
                 &int_recv_buf, 1, recvtype, left, to_right,
                 MPI_COMM_WORLD, &status);
    
    int_send_buf = int_recv_buf;
    float_send_buf = float_recv_buf;

    int_sum += int_recv_buf;
    float_sum += float_recv_buf;
  }

  printf ("PE%i:\tSum = %i\t%f\n", my_rank, int_sum, float_sum);

  MPI_Finalize();
}
Example #9
static int
kmr_alltoall_bruck(KMR *mr, void *sbuf, void *rbuf, int cnt)
{
#define DUMP_(X0,X1,X2,X3,X4) if (tracing) kmr_atoa_dump_(X0,X1,X2,X3,X4)
    MPI_Comm comm = mr->comm;
    int nprocs = mr->nprocs;
    int rank = mr->rank;
    int tag = KMR_TAG_ATOA;
    _Bool tracing = mr->trace_alltoall;
    assert((nprocs & 3) == 0);
    int nprocs4th = (nprocs / 4);
    int cc;

    int lognprocs = 0;
    while ((1 << lognprocs) < nprocs) {
	lognprocs++;
    }
    assert((1 << lognprocs) == nprocs);
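
    /* Radix-4 variant of a Bruck-style all-to-all: in each stage the nprocs
     * per-destination blocks of buf0 are regrouped into four groups in buf1;
     * one group stays local (k == 0) and the other three are exchanged with
     * the peers rank ^ (k << stage).  Since the stages advance two rank bits
     * at a time, nprocs is effectively assumed to be a power of four. */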

    char *buf0 = kmr_malloc((size_t)(cnt * nprocs));
    char *buf1 = kmr_malloc((size_t)(cnt * nprocs));
    memcpy(buf0, sbuf, (size_t)(cnt * nprocs));

    MPI_Request rqs[6];
    for (int stage = 0; stage < lognprocs; stage += 2) {
	DUMP_(mr, buf0, cnt, "step", stage);
	for (int j = 0; j < nprocs4th; j++) {
	    for (int i = 0; i < 4; i++) {
		void *s = &buf0[cnt * (i + (j * 4))];
		void *r = &buf1[cnt * (nprocs4th * i + j)];
		memcpy(r, s, (size_t)cnt);
	    }
	}
	DUMP_(mr, buf1, cnt, "pack", stage);
	for (int k = 0; k < 4; k++) {
	    int flip = (k << stage);
	    int peer = (rank ^ flip);
	    int baserank = ((rank >> stage) & 3);
	    int basepeer = ((peer >> stage) & 3);
	    if (k == 0) {
		void *s = &buf1[cnt * (baserank * nprocs4th)];
		void *r = &buf0[cnt * (baserank * nprocs4th)];
		memcpy(r, s, (size_t)(cnt * nprocs4th));
	    } else {
		void *s = &buf1[cnt * (basepeer * nprocs4th)];
		void *r = &buf0[cnt * (basepeer * nprocs4th)];
#if 0
		cc = MPI_Sendrecv(s, (cnt * nprocs4th), MPI_BYTE, peer, tag,
				  r, (cnt * nprocs4th), MPI_BYTE, peer, tag,
				  comm, MPI_STATUS_IGNORE);
		assert(cc == MPI_SUCCESS);
#else
		cc = MPI_Isend(s, (cnt * nprocs4th), MPI_BYTE, peer, tag,
			       comm, &rqs[(k - 1) * 2 + 1]);
		assert(cc == MPI_SUCCESS);
		cc = MPI_Irecv(r, (cnt * nprocs4th), MPI_BYTE, peer, tag,
			       comm, &rqs[(k - 1) * 2]);
		assert(cc == MPI_SUCCESS);
#endif
	    }
	}
	cc = MPI_Waitall(6, rqs, MPI_STATUSES_IGNORE);
	assert(cc == MPI_SUCCESS);
	DUMP_(mr, buf0, cnt, "exchange", stage);
    }
    memcpy(rbuf, buf0, (size_t)(cnt * nprocs));
    kmr_free(buf0, (size_t)(cnt * nprocs));
    kmr_free(buf1, (size_t)(cnt * nprocs));
    return MPI_SUCCESS;
}
Example #10
int main(int argc, char** argv)
{
    int iter_max = 1000;
    
    const float pi  = 2.0 * asinf(1.0f);
    const float tol = 1.0e-5f;

    int rank = 0;
    int size = 1;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    memset(A, 0, N * M * sizeof(float));
    memset(Aref, 0, N * M * sizeof(float));
    
    // set boundary conditions
    for (int j = 0; j < N; j++)
    {
        float y0     = sinf( 2.0 * pi * j / (N-1));
        A[j][0]      = y0;
        A[j][M-1]    = y0;
        Aref[j][0]   = y0;
        Aref[j][M-1] = y0;
    }
    
#if _OPENACC
    int ngpus=acc_get_num_devices(acc_device_nvidia);
    int devicenum=rank%ngpus;
    acc_set_device_num(devicenum,acc_device_nvidia);

    // Call acc_init after acc_set_device_num to avoid multiple contexts on device 0 in multi GPU systems
    acc_init(acc_device_nvidia);
#endif /*_OPENACC*/

    // Ensure correctness if N%size != 0
    int chunk_size = ceil( (1.0*N)/size );
    
    int jstart = rank * chunk_size;
    int jend   = jstart + chunk_size;
    
    // Do not process boundaries
    jstart = max( jstart, 1 );
    jend = min( jend, N - 1 );
    
    if ( rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", N, M);

    if ( rank == 0) printf("Calculate reference solution and time serial execution.\n");
    StartTimer();
    laplace2d_serial( rank, iter_max, tol );
    double runtime_serial = GetTimer();

    //Wait for all processes to ensure correct timing of the parallel version
    MPI_Barrier( MPI_COMM_WORLD );
    if ( rank == 0) printf("Parallel execution.\n");
    StartTimer();
    int iter  = 0;
    float error = 1.0f;
    
    #pragma acc data copy(A) create(Anew)
    while ( error > tol && iter < iter_max )
    {
        error = 0.f;

        #pragma acc kernels
        for (int j = jstart; j < jend; j++)
        {
            for( int i = 1; i < M-1; i++ )
            {
                Anew[j][i] = 0.25f * ( A[j][i+1] + A[j][i-1]
                                     + A[j-1][i] + A[j+1][i]);
                error = fmaxf( error, fabsf(Anew[j][i]-A[j][i]));
            }
        }
        float globalerror = 0.0f;
        MPI_Allreduce( &error, &globalerror, 1, MPI_FLOAT, MPI_MAX, MPI_COMM_WORLD );
        error = globalerror;
        
        #pragma acc kernels
        for (int j = jstart; j < jend; j++)
        {
            for( int i = 1; i < M-1; i++ )
            {
                A[j][i] = Anew[j][i];
            }
        }

        //Periodic boundary conditions
        int top    = (rank == 0) ? (size-1) : rank-1;
        int bottom = (rank == (size-1)) ? 0 : rank+1;
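
        //host_data use_device passes the device address of A to the MPI calls
        //below; with OpenACC enabled this relies on a CUDA-aware MPI that can
        //access GPU memory directly (otherwise plain host buffers would be needed)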

        #pragma acc host_data use_device( A )
        {
            //1. Send row jstart (first modified row) to top, receive lower boundary row (jend) from bottom
            MPI_Sendrecv( A[jstart], M, MPI_FLOAT, top   , 0, A[jend], M, MPI_FLOAT, bottom, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE );

            //2. Send row (jend-1) (last modified row) to bottom, receive upper boundary row (jstart-1) from top
            MPI_Sendrecv( A[(jend-1)], M, MPI_FLOAT, bottom, 0, A[(jstart-1)], M, MPI_FLOAT, top   , 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE );
        }
        
        if(rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);
        
        iter++;
    }
    MPI_Barrier( MPI_COMM_WORLD );
    double runtime = GetTimer();

    if (check_results( rank, jstart, jend, tol ) && rank == 0)
    {
        printf( "Num GPUs: %d\n", size );
        printf( "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%%\n", N,M, runtime_serial/ 1000.f, size, runtime/ 1000.f, runtime_serial/runtime, runtime_serial/(size*runtime)*100 );
    }
    MPI_Finalize();
    return 0;
}
Example #11
int main(int argc, char *argv[])
{ 
    MPI_Status     status;               /* MPI status                          */
    int            mpierr;               /* MPI function return code            */
    int            rank;                 /* Process rank within MPI_COMM_WORLD  */
    int            size;
    int            dest, src;
    int            tag0=41;              /* MPI message tag                     */
    
    int            inject;
    int            report;
    int            iterations;
    int            n_bytes; 
    unsigned char* send_buff;
    unsigned char* recv_buff;
    char*          tmp;
    
    int            i, j, count;
    
    float fraction, randval;
    struct timeval tp;
    
    if (1 < argc) {
        if (0 == strncmp(argv[1], "-h", 2) ||
            0 == strncmp(argv[1], "--h", 3)) {
            printf("Usage: mpirun --options-- ./sendrecv_blaster <options> where options are:\n"
                   "\tpattern=[self | pair | ring] where\n"
                   "\t\tself => sendrecv with self\n"
                   "\t\tpair => sendrecv with a complementary partner [0 <-> N-1, 1 <-> N-2...]\n"
                   "\t\tring [default] => sendrecv around a ring [0 recvs from N-1 and sends to 1]\n"
                   "\tsize=[value < 0 => max message size in kbytes, value > 0 => max message size in Mbytes (default=1MByte)]\n"
                   "\tinject=[value = #iterations before injecting MPI_Sendrecv to self (default: never)]\n"
                   "\treport=[value = #iterations/reporting point (default: 1000)\n"
                   "\titerations=[value = #iterations before stopping (default: 1000000)\n");
            return 0;
        }
    }
    
    mpierr = MPI_Init(&argc, &argv);
    if (mpierr != MPI_SUCCESS)
    {
        fprintf(stderr, "MPI Error %d (MPI_Init)\n",mpierr);
        fflush(stderr);
        MPI_Abort(MPI_COMM_WORLD, -1);
    }
    
    MPI_Errhandler_set(MPI_COMM_WORLD, MPI_ERRORS_RETURN);
    
    mpierr = MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    if (mpierr != MPI_SUCCESS || rank < 0)
    {
        fprintf(stderr, "MPI Error %d (MPI_Comm_rank)\n",mpierr);
        fflush(stderr);
        MPI_Abort(MPI_COMM_WORLD, -1);
    }
    
    mpierr = MPI_Comm_size(MPI_COMM_WORLD, &size);
    if (mpierr != MPI_SUCCESS || size < 0)
    {
        fprintf(stderr, "MPI Error %d (MPI_Comm_size)\n",mpierr);
        fflush(stderr);
        MPI_Abort(MPI_COMM_WORLD, -1);
    }

    /* setup defaults in lieu of args */
    n_bytes = 1024*1024;
    inject = -1;
    report = 1000;
    iterations = 1000000;
    /* do a ring */
    src = rank - 1;
    if (src < 0) {
        src = size - 1;
    }
    dest = rank + 1;
    if (dest > size-1) {
        dest = 0;
    }
    
    for (i=1; i < argc; i++) {
        fprintf(stderr, "got %s\n", argv[i]);
        if (0 == strncmp(argv[i], "pattern", strlen("pattern"))) {
            tmp = strchr(argv[i], '=');
            tmp++;
            if (0 == strcmp(tmp, "self")) {
                /* just do it with myself */
                src = rank;
                dest = rank;
            } else if (0 == strcmp(tmp, "pair")) {
                /* do it pair-wise */
                src = (size-1) - rank;
                dest = src;
            } else {
                /* do a ring */
                src = rank - 1;
                if (src < 0) {
                    src = size - 1;
                }
                dest = rank + 1;
                if (dest > size-1) {
                    dest = 0;
                }
            }
        } else if (0 == strncmp(argv[i], "size", strlen("size"))) {
            tmp = strchr(argv[i], '=');
            tmp++;
            n_bytes = atoi(tmp);
            if (n_bytes < 0) {
                n_bytes = -1 * n_bytes * 1024;
            } else {
                n_bytes = n_bytes * 1024*1024;
            }
        } else if (0 == strncmp(argv[i], "inject", strlen("inject"))) {
            tmp = strchr(argv[i], '=');
            tmp++;
            inject = atoi(tmp);
        } else if (0 == strncmp(argv[i], "report", strlen("report"))) {
            tmp = strchr(argv[i], '=');
            tmp++;
            report = atoi(tmp);
        } else if (0 == strncmp(argv[i], "iter", strlen("iter"))) {
            tmp = strchr(argv[i], '=');
            tmp++;
            iterations = atoi(tmp);
        }
    }

    send_buff = (unsigned char *) valloc(n_bytes);
    recv_buff = (unsigned char *) valloc(n_bytes);
    
    /* seed the random number generator */
    gettimeofday (&tp, NULL);
    srand (tp.tv_usec);

    for ( i=0; i<n_bytes; i++ )
    {
        send_buff[i] = i%128;
    }
    
    fprintf(stderr, "Rank %d: recving from src %d sending to dest %d with max buff size %dKbytes\n",
            rank, src, dest, n_bytes/1024);

    i=0;
    while (i < iterations)
    {
        randval = rand();
        fraction = randval/RAND_MAX;
        count = fraction * n_bytes;
        mpierr = MPI_Sendrecv(send_buff, count, MPI_CHAR, dest, tag0,
                              recv_buff, n_bytes, MPI_CHAR, src, tag0, MPI_COMM_WORLD, &status);
        if (mpierr != MPI_SUCCESS)
        {
            fprintf(stderr,"MPI Error %d (MPI_Sendrecv) [%d,%d] at iteration %d\n",mpierr,src,dest,i);
            fflush(stderr);
            MPI_Abort(MPI_COMM_WORLD, -1);
        }
        i++;
        if (0 == (i % report)) {
            fprintf(stderr, "Rank %d has completed %dk iterations\n", rank, i/1000);
        }
        if (0 < inject && 0 == (i % inject)) {
            mpierr = MPI_Sendrecv(send_buff, count, MPI_CHAR, rank, tag0,
                                  recv_buff, n_bytes, MPI_CHAR, rank, tag0, MPI_COMM_WORLD, &status);
            if (mpierr != MPI_SUCCESS)
            {
                fprintf(stderr,"MPI Error %d (MPI_Sendrecv) [%d,%d] at iteration %d\n",mpierr,rank,rank,i);
                fflush(stderr);
                MPI_Abort(MPI_COMM_WORLD, -1);
            } else {
                fprintf(stderr, "Rank %d has completed MPI_Sendrecv with myself\n", rank);
            }
        }
    }
    
    fprintf(stderr, "Rank %d completed test\n", rank);
    MPI_Finalize();
    return 0;
}
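
The blaster derives its src/dest ranks from the chosen pattern (self, pair, or ring). The following sketch isolates just that partner arithmetic in a hypothetical helper, choose_partners(), and performs a single exchange to show the wiring; it is not part of the original program.

/* partner.c - sketch of the self/pair/ring partner selection used by the
 * blaster above; the helper function itself is hypothetical. */
#include <mpi.h>
#include <stdio.h>
#include <string.h>

static void choose_partners(const char *pattern, int rank, int size,
                            int *src, int *dest)
{
    if (0 == strcmp(pattern, "self")) {
        *src = *dest = rank;                  /* sendrecv with myself */
    } else if (0 == strcmp(pattern, "pair")) {
        *src = *dest = (size - 1) - rank;     /* 0 <-> N-1, 1 <-> N-2, ... */
    } else {                                  /* default: ring */
        *src  = (rank == 0)        ? size - 1 : rank - 1;
        *dest = (rank == size - 1) ? 0        : rank + 1;
    }
}

int main(int argc, char **argv)
{
    int rank, size, src, dest, token = -1;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    choose_partners(argc > 1 ? argv[1] : "ring", rank, size, &src, &dest);

    /* one exchange is enough to show the wiring */
    MPI_Sendrecv(&rank,  1, MPI_INT, dest, 0,
                 &token, 1, MPI_INT, src,  0,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    printf("rank %d sent to %d, received %d from %d\n", rank, dest, token, src);

    MPI_Finalize();
    return 0;
}
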
Beispiel #12
0
/*! This function computes the local density for each active SPH particle,
 *  the number of neighbours in the current smoothing radius, and the
 *  divergence and curl of the velocity field.  The pressure is updated as
 *  well.  If a particle with its smoothing region is fully inside the
 *  local domain, it is not exported to the other processors. The function
 *  also detects particles that have a number of neighbours outside the
 *  allowed tolerance range. For these particles, the smoothing length is
 *  adjusted accordingly, and the density computation is executed again.
 *  Note that the smoothing length is not allowed to fall below the lower
 *  bound set by MinGasHsml.
 */
void density(void)
{
  long long ntot, ntotleft;
  int *noffset, *nbuffer, *nsend, *nsend_local, *numlist, *ndonelist;
  int i, j, n, ndone, npleft, maxfill, source, iter = 0;
  int level, ngrp, sendTask, recvTask, place, nexport;
  double dt_entr, tstart, tend, tstart_ngb = 0, tend_ngb = 0;
  double sumt, sumcomm, timengb, sumtimengb;
  double timecomp = 0, timeimbalance = 0, timecommsumm = 0, sumimbalance;
  MPI_Status status;
  double a3;
#ifdef METALS_TG
  int metal_disperse;
  long long ntotsave;
  double a, hubble_param, hubble_a, dt, exp_func, old_met, old_met_tot, new_met, new_met_tot, M_metals_local, M_metals_tot;

  if(All.ComovingIntegrationOn)
    {
      a = All.Time;

      hubble_param = All.HubbleParam;

      hubble_a = All.Omega0 / (All.Time * All.Time * All.Time) + (1 - All.Omega0 - All.OmegaLambda) / (All.Time * All.Time) + All.OmegaLambda;

      hubble_a = All.Hubble * sqrt(hubble_a);
    }
  else
    a = hubble_a = hubble_param = 1.0;
#endif

  if(All.ComovingIntegrationOn)
    {
       a3 = All.Time * All.Time * All.Time;
    }
  else
    a3 = 1.0;


#ifdef PERIODIC
  boxSize = All.BoxSize;
  boxHalf = 0.5 * All.BoxSize;
#ifdef LONG_X
  boxHalf_X = boxHalf * LONG_X;
  boxSize_X = boxSize * LONG_X;
#endif
#ifdef LONG_Y
  boxHalf_Y = boxHalf * LONG_Y;
  boxSize_Y = boxSize * LONG_Y;
#endif
#ifdef LONG_Z
  boxHalf_Z = boxHalf * LONG_Z;
  boxSize_Z = boxSize * LONG_Z;
#endif
#endif


  noffset = malloc(sizeof(int) * NTask);	/* offsets of bunches in common list */
  nbuffer = malloc(sizeof(int) * NTask);
  nsend_local = malloc(sizeof(int) * NTask);
  nsend = malloc(sizeof(int) * NTask * NTask);
  ndonelist = malloc(sizeof(int) * NTask);

  for(n = 0, NumSphUpdate = 0; n < N_gas; n++)
    {
      if(P[n].ID < 0 || SphP[n].sink > 0.5) /*SINK*/
      	continue;
      SphP[n].Left = SphP[n].Right = 0;

      if(P[n].Ti_endstep == All.Ti_Current)
	NumSphUpdate++;
    }

  numlist = malloc(NTask * sizeof(int) * NTask);
  MPI_Allgather(&NumSphUpdate, 1, MPI_INT, numlist, 1, MPI_INT, MPI_COMM_WORLD);
  for(i = 0, ntot = 0; i < NTask; i++)
    ntot += numlist[i];
  free(numlist);

#ifdef METALS_TG
  ntotsave = ntot;

  M_metals_tot = M_metals_local = 0.0;

  for(metal_disperse = 0; metal_disperse < 2; metal_disperse++) {

  ntot = ntotsave;
#endif

  /* we will repeat the whole thing for those particles where we didn't
   * find enough neighbours
   */
  do
    {
      i = 0;			/* begin with this index */
      ntotleft = ntot;		/* particles left for all tasks together */
      while(ntotleft > 0)
	{
	  for(j = 0; j < NTask; j++)
	    nsend_local[j] = 0;

	  /* do local particles and prepare export list */
	  tstart = second();
	  for(nexport = 0, ndone = 0; i < N_gas && nexport < All.BunchSizeDensity - NTask; i++) {
	    if(P[i].ID < 0 || SphP[i].sink > 0.5) /*SINK*/
	     continue;

	    if(P[i].Ti_endstep == All.Ti_Current)
	      {
		ndone++;

		for(j = 0; j < NTask; j++)
		  Exportflag[j] = 0;

#ifdef METALS_TG
		density_evaluate(i, 0, metal_disperse);
#else
		density_evaluate(i, 0);
#endif

		for(j = 0; j < NTask; j++)
		  {
		    if(Exportflag[j])
		      {
			DensDataIn[nexport].Pos[0] = P[i].Pos[0];
			DensDataIn[nexport].Pos[1] = P[i].Pos[1];
			DensDataIn[nexport].Pos[2] = P[i].Pos[2];
			DensDataIn[nexport].Vel[0] = SphP[i].VelPred[0];
			DensDataIn[nexport].Vel[1] = SphP[i].VelPred[1];
			DensDataIn[nexport].Vel[2] = SphP[i].VelPred[2];

                        DensDataIn[nexport].bfield[0] = SphP[i].bfield[0];
                        DensDataIn[nexport].bfield[1] = SphP[i].bfield[1];
                        DensDataIn[nexport].bfield[2] = SphP[i].bfield[2];
                        
			DensDataIn[nexport].Hsml = SphP[i].Hsml;
                        DensDataIn[nexport].Sci = SphP[i].Sci;
#ifdef METALS_TG
                        DensDataIn[nexport].Sigma = SphP[i].Sigma;
#endif
			DensDataIn[nexport].Index = i;
			DensDataIn[nexport].Task = j;
			nexport++;
			nsend_local[j]++;
		      }
		  }
	      }
	  }
	  tend = second();
	  timecomp += timediff(tstart, tend);

	  qsort(DensDataIn, nexport, sizeof(struct densdata_in), dens_compare_key);

	  for(j = 1, noffset[0] = 0; j < NTask; j++)
	    noffset[j] = noffset[j - 1] + nsend_local[j - 1];

	  tstart = second();

	  MPI_Allgather(nsend_local, NTask, MPI_INT, nsend, NTask, MPI_INT, MPI_COMM_WORLD);

	  tend = second();
	  timeimbalance += timediff(tstart, tend);


	  /* now do the particles that need to be exported */

	  for(level = 1; level < (1 << PTask); level++)
	    {
	      tstart = second();
	      for(j = 0; j < NTask; j++)
		nbuffer[j] = 0;
	      for(ngrp = level; ngrp < (1 << PTask); ngrp++)
		{
		  maxfill = 0;
		  for(j = 0; j < NTask; j++)
		    {
		      if((j ^ ngrp) < NTask)
			if(maxfill < nbuffer[j] + nsend[(j ^ ngrp) * NTask + j])
			  maxfill = nbuffer[j] + nsend[(j ^ ngrp) * NTask + j];
		    }
		  if(maxfill >= All.BunchSizeDensity)
		    break;

		  sendTask = ThisTask;
		  recvTask = ThisTask ^ ngrp;

		  if(recvTask < NTask)
		    {
		      if(nsend[ThisTask * NTask + recvTask] > 0 || nsend[recvTask * NTask + ThisTask] > 0)
			{
			  /* get the particles */
			  MPI_Sendrecv(&DensDataIn[noffset[recvTask]],
				       nsend_local[recvTask] * sizeof(struct densdata_in), MPI_BYTE,
				       recvTask, TAG_DENS_A,
				       &DensDataGet[nbuffer[ThisTask]],
				       nsend[recvTask * NTask + ThisTask] * sizeof(struct densdata_in),
				       MPI_BYTE, recvTask, TAG_DENS_A, MPI_COMM_WORLD, &status);
			}
		    }

		  for(j = 0; j < NTask; j++)
		    if((j ^ ngrp) < NTask)
		      nbuffer[j] += nsend[(j ^ ngrp) * NTask + j];
		}
	      tend = second();
	      timecommsumm += timediff(tstart, tend);


	      tstart = second();
	      for(j = 0; j < nbuffer[ThisTask]; j++)
#ifdef METALS_TG
		density_evaluate(j, 1, metal_disperse);
#else
		density_evaluate(j, 1);
#endif
	      tend = second();
	      timecomp += timediff(tstart, tend);

	      /* do a block to explicitly measure imbalance */
	      tstart = second();
	      MPI_Barrier(MPI_COMM_WORLD);
	      tend = second();
	      timeimbalance += timediff(tstart, tend);

	      /* get the result */
	      tstart = second();
	      for(j = 0; j < NTask; j++)
		nbuffer[j] = 0;
	      for(ngrp = level; ngrp < (1 << PTask); ngrp++)
		{
		  maxfill = 0;
		  for(j = 0; j < NTask; j++)
		    {
		      if((j ^ ngrp) < NTask)
			if(maxfill < nbuffer[j] + nsend[(j ^ ngrp) * NTask + j])
			  maxfill = nbuffer[j] + nsend[(j ^ ngrp) * NTask + j];
		    }
		  if(maxfill >= All.BunchSizeDensity)
		    break;

		  sendTask = ThisTask;
		  recvTask = ThisTask ^ ngrp;

		  if(recvTask < NTask)
		    {
		      if(nsend[ThisTask * NTask + recvTask] > 0 || nsend[recvTask * NTask + ThisTask] > 0)
			{
			  /* send the results */
			  MPI_Sendrecv(&DensDataResult[nbuffer[ThisTask]],
				       nsend[recvTask * NTask + ThisTask] * sizeof(struct densdata_out),
				       MPI_BYTE, recvTask, TAG_DENS_B,
				       &DensDataPartialResult[noffset[recvTask]],
				       nsend_local[recvTask] * sizeof(struct densdata_out),
				       MPI_BYTE, recvTask, TAG_DENS_B, MPI_COMM_WORLD, &status);

			  /* add the result to the particles */
			  for(j = 0; j < nsend_local[recvTask]; j++)
			    {
			      source = j + noffset[recvTask];
			      place = DensDataIn[source].Index;
#ifdef METALS_TG
			      if(metal_disperse == 1) {
		              SphP[place].const_A += DensDataPartialResult[source].const_A;
		              SphP[place].const_B += DensDataPartialResult[source].const_B;
			      }
			      else {
#endif
			      SphP[place].NumNgb += DensDataPartialResult[source].Ngb;
			      SphP[place].Density += DensDataPartialResult[source].Rho;
			      SphP[place].DivVel += DensDataPartialResult[source].Div;
                              SphP[place].DivB += DensDataPartialResult[source].Div_B;

			      SphP[place].DhsmlDensityFactor += DensDataPartialResult[source].DhsmlDensity;

			      SphP[place].Rot[0] += DensDataPartialResult[source].Rot[0];
			      SphP[place].Rot[1] += DensDataPartialResult[source].Rot[1];
			      SphP[place].Rot[2] += DensDataPartialResult[source].Rot[2];

                              SphP[place].GradSci[0] += DensDataPartialResult[source].GSci[0];
                              SphP[place].GradSci[1] += DensDataPartialResult[source].GSci[1];
                              SphP[place].GradSci[2] += DensDataPartialResult[source].GSci[2];

                              SphP[place].VelRel[0] += DensDataPartialResult[source].VRel[0];
                              SphP[place].VelRel[1] += DensDataPartialResult[source].VRel[1];
                              SphP[place].VelRel[2] += DensDataPartialResult[source].VRel[2];
#ifdef METALS_TG
			      SphP[place].Sigma += DensDataPartialResult[source].Sigma;
			      }
#endif
			    }
			}
		    }

		  for(j = 0; j < NTask; j++)
		    if((j ^ ngrp) < NTask)
		      nbuffer[j] += nsend[(j ^ ngrp) * NTask + j];
		}
	      tend = second();
	      timecommsumm += timediff(tstart, tend);

	      level = ngrp - 1;
	    }

	  MPI_Allgather(&ndone, 1, MPI_INT, ndonelist, 1, MPI_INT, MPI_COMM_WORLD);
	  for(j = 0; j < NTask; j++)
	    ntotleft -= ndonelist[j];
	}


      /* do final operations on results */
      tstart = second();
#ifdef METALS_TG
      if(metal_disperse == 0) {
#endif
      for(i = 0, npleft = 0; i < N_gas; i++)
	{
	  if(P[i].ID < 0 || SphP[i].sink > 0.5) /*SINK*/
	    continue;
	  if(P[i].Ti_endstep == All.Ti_Current)
	    {
	      {
		SphP[i].DhsmlDensityFactor =
		  1.0 / (1.0 + SphP[i].Hsml * SphP[i].DhsmlDensityFactor / ((double)(NUMDIMS) * SphP[i].Density));

		SphP[i].CurlVel = sqrt(SphP[i].Rot[0] * SphP[i].Rot[0] +
				       SphP[i].Rot[1] * SphP[i].Rot[1] +
				       SphP[i].Rot[2] * SphP[i].Rot[2]) / SphP[i].Density;

		SphP[i].DivVel /= SphP[i].Density;
                SphP[i].DivB /= SphP[i].Density;
                SphP[i].GradSci[0] /= SphP[i].Density;
                SphP[i].GradSci[1] /= SphP[i].Density;
                SphP[i].GradSci[2] /= SphP[i].Density;
                SphP[i].VelRel[0] /= SphP[i].Density;
                SphP[i].VelRel[1] /= SphP[i].Density;
                SphP[i].VelRel[2] /= SphP[i].Density;
#ifdef POLYTROPE
                SphP[i].Pressure = get_pressure(SphP[i].Density);
#else /* POLYTROPE */
		dt_entr = (double)((All.Ti_Current - (P[i].Ti_begstep + P[i].Ti_endstep) / 2.0)) * All.Timebase_interval;
#ifdef CHEMCOOL
		SphP[i].Pressure =
		  (SphP[i].Entropy + SphP[i].DtEntropy * dt_entr) * pow(SphP[i].Density, SphP[i].Gamma);
#else /* CHEMCOOL */
		SphP[i].Pressure =
		  (SphP[i].Entropy + SphP[i].DtEntropy * dt_entr) * pow(SphP[i].Density, GAMMA);
#endif /* CHEMCOOL */
#endif /* POLYTROPE */
	      }

	      /* now check whether we had enough neighbours */

	      if(SphP[i].NumNgb < (All.DesNumNgb - All.MaxNumNgbDeviation) ||
		 (SphP[i].NumNgb > (All.DesNumNgb + All.MaxNumNgbDeviation)
		  && SphP[i].Hsml > (1.01 * All.MinGasHsml)))
		{
		  /* need to redo this particle */
		  npleft++;

		  if(SphP[i].Left > 0 && SphP[i].Right > 0)
		    if((SphP[i].Right - SphP[i].Left) < 1.0e-3 * SphP[i].Left)
		      {
			/* this one should be ok */
			npleft--;
			P[i].Ti_endstep = -P[i].Ti_endstep - 1;	/* Mark as inactive */
			continue;
		      }

		  if(SphP[i].NumNgb < (All.DesNumNgb - All.MaxNumNgbDeviation))
		    SphP[i].Left = dmax(SphP[i].Hsml, SphP[i].Left);
		  else
		    {
		      if(SphP[i].Right != 0)
			{
			  if(SphP[i].Hsml < SphP[i].Right)
			    SphP[i].Right = SphP[i].Hsml;
			}
		      else
			SphP[i].Right = SphP[i].Hsml;
		    }

		  if(iter >= MAXITER - 10)
		    {
		      printf
			("i=%d task=%d ID=%d Hsml=%g Left=%g Right=%g Ngbs=%g Right-Left=%g\n   pos=(%g|%g|%g)\n",
			 i, ThisTask, (int) P[i].ID, SphP[i].Hsml, SphP[i].Left, SphP[i].Right,
			 (float) SphP[i].NumNgb, SphP[i].Right - SphP[i].Left, P[i].Pos[0], P[i].Pos[1],
			 P[i].Pos[2]);
		      fflush(stdout);
		    }

		  if(SphP[i].Right > 0 && SphP[i].Left > 0)
		    SphP[i].Hsml = pow(0.5 * (pow(SphP[i].Left, 3) + pow(SphP[i].Right, 3)), 1.0 / 3.0);
		  else
		    {
		      if(SphP[i].Right == 0 && SphP[i].Left == 0)
			endrun(8188);	/* can't occur */

		      if(SphP[i].Right == 0 && SphP[i].Left > 0)
			{
			  if(P[i].Type == 0 && fabs(SphP[i].NumNgb - All.DesNumNgb) < 0.5 * All.DesNumNgb)
			    {
			      SphP[i].Hsml *=
				1.0 - (SphP[i].NumNgb -
				     All.DesNumNgb) / ((double)(NUMDIMS) * SphP[i].NumNgb) * SphP[i].DhsmlDensityFactor;
			    }
			  else
			    SphP[i].Hsml *= 1.26;
			}

		      if(SphP[i].Right > 0 && SphP[i].Left == 0)
			{
			  if(P[i].Type == 0 && fabs(SphP[i].NumNgb - All.DesNumNgb) < 0.5 * All.DesNumNgb)
			    {
			      SphP[i].Hsml *=
				1.0 - (SphP[i].NumNgb -
				     All.DesNumNgb) / ((double)(NUMDIMS) * SphP[i].NumNgb) * SphP[i].DhsmlDensityFactor;
			    }
			  else
			    SphP[i].Hsml /= 1.26;
			}
		    }

		  if(SphP[i].Hsml < All.MinGasHsml)
		    SphP[i].Hsml = All.MinGasHsml;
		}
	      else
		P[i].Ti_endstep = -P[i].Ti_endstep - 1;	/* Mark as inactive */
	    }
	}
#ifdef METALS_TG
      }
      else {
 	ntot = 0;
      }
#endif
      tend = second();
      timecomp += timediff(tstart, tend);


#ifdef METALS_TG
      if(metal_disperse == 0) {
#endif
      numlist = malloc(NTask * sizeof(int) * NTask);
      MPI_Allgather(&npleft, 1, MPI_INT, numlist, 1, MPI_INT, MPI_COMM_WORLD);
      for(i = 0, ntot = 0; i < NTask; i++)
	ntot += numlist[i];
      free(numlist);

      if(ntot > 0)
	{
	  if(iter == 0)
	    tstart_ngb = second();

	  iter++;

	  if(iter > 0 && ThisTask == 0)
	    {
	      printf("ngb iteration %d: need to repeat for %d%09d particles.\n", iter,
		     (int) (ntot / 1000000000), (int) (ntot % 1000000000));
	      fflush(stdout);
	    }

	  if(iter > MAXITER)
	    {
	      printf("failed to converge in neighbour iteration in density()\n");
	      fflush(stdout);
	      endrun(1155);
	    }
	}
      else
	tend_ngb = second();
#ifdef METALS_TG
      }
#endif
    }
  while(ntot > 0);

  /* mark as active again */
  for(i = 0; i < NumPart; i++)
    {
      if(P[i].Type == 0 && P[i].ID < 0) /*ACCRETED*/
        continue;

      if(P[i].Ti_endstep < 0)
        P[i].Ti_endstep = -P[i].Ti_endstep - 1;
    }
#ifdef METALS_TG
  if(metal_disperse == 0)
    {
      for(n = 0; n < N_gas; n++)
        {
          if(P[n].ID < 0 || SphP[n].sink > 0.5) /*SINK*/
            continue;

          if(P[n].Ti_endstep == All.Ti_Current)
            {
              SphP[n].Sigma = sqrt(a*SphP[n].Sigma/SphP[n].NumNgb);
            }
        }
    }
  }

  old_met = old_met_tot = new_met = new_met_tot = 0.0;

  for(n = 0; n < N_gas; n++)
    {
      if(P[n].ID < 0 || SphP[n].sink > 0.5) /*SINK*/
        continue;

      if(P[n].Ti_endstep == All.Ti_Current)
        {
          dt = (All.Ti_Current-P[n].Ti_begstep)*All.Timebase_interval/hubble_a;

          old_met += SphP[n].Metallicity*P[n].Mass;

          exp_func = exp(-SphP[n].const_A*dt);

          SphP[n].Metallicity *= exp_func;

          SphP[n].Metallicity += SphP[n].const_B/SphP[n].const_A*(1.0-exp_func);

          new_met += SphP[n].Metallicity*P[n].Mass;
        }
    }

  MPI_Allreduce(&old_met, &old_met_tot, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

  MPI_Allreduce(&new_met, &new_met_tot, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

  for(n = 0; n < N_gas; n++)
    {
      if(P[n].ID < 0 || SphP[n].sink > 0.5)  /*SINK*/
        continue;

      if(P[n].Ti_endstep == All.Ti_Current && new_met_tot > 0.0)
        {
          SphP[n].Metallicity *= old_met_tot/new_met_tot;
        }

      M_metals_local += SphP[n].Metallicity*P[n].Mass;
    }

  MPI_Allreduce(&M_metals_local, &M_metals_tot, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

  if(ThisTask == 0)
    printf("Metal mass = %g\n", M_metals_tot*All.UnitMass_in_g/SOLAR_MASS*Z_SOLAR/hubble_param);
#endif

  free(ndonelist);
  free(nsend);
  free(nsend_local);
  free(nbuffer);
  free(noffset);


  /* collect some timing information */
  if(iter > 0)
    timengb = timediff(tstart_ngb, tend_ngb);
  else
    timengb = 0;

  MPI_Reduce(&timengb, &sumtimengb, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
  MPI_Reduce(&timecomp, &sumt, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
  MPI_Reduce(&timecommsumm, &sumcomm, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
  MPI_Reduce(&timeimbalance, &sumimbalance, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

  if(ThisTask == 0)
    {
      All.CPU_HydCompWalk += sumt / NTask;
      All.CPU_HydCommSumm += sumcomm / NTask;
      All.CPU_HydImbalance += sumimbalance / NTask;
      All.CPU_EnsureNgb += sumtimengb / NTask;
    }
}
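
density() pairs tasks for its buffer exchange with the hypercube pattern recvTask = ThisTask ^ ngrp, where PTask is the smallest power of two covering the task count. Below is a stripped-down sketch of that pairing with a single dummy int as payload instead of the Gadget particle buffers; it illustrates the communication pattern only, not the original data handling.

/* hypercube_pairs.c - sketch of the ThisTask ^ ngrp pairwise exchange used
 * in density(); the payload here is one dummy int per pair. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int ThisTask, NTask, PTask, ngrp;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &ThisTask);
    MPI_Comm_size(MPI_COMM_WORLD, &NTask);

    /* smallest power of two that covers all tasks */
    for (PTask = 0; NTask > (1 << PTask); PTask++)
        ;

    /* every ngrp in [1, 2^PTask) pairs each task with exactly one partner */
    for (ngrp = 1; ngrp < (1 << PTask); ngrp++) {
        int recvTask = ThisTask ^ ngrp;
        if (recvTask < NTask) {
            int sendval = ThisTask, recvval = -1;
            MPI_Sendrecv(&sendval, 1, MPI_INT, recvTask, 0,
                         &recvval, 1, MPI_INT, recvTask, 0,
                         MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            printf("step %2d: task %d exchanged with task %d\n",
                   ngrp, ThisTask, recvval);
        }
    }

    MPI_Finalize();
    return 0;
}
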
Beispiel #13
0
int main(int argc, char *argv[] ) {
	double time1, time2;
	
	time1 = MPI_Wtime();

	int rank, processors;
	
	int j;	// number of iterations
	int k;	// number of iterations to perform before creating a checkpoint
	int l;  // number of random samples per grid point
	int checkpoint_resume = 0;	// 1 = resume from last checkpoint

	int c;		// used to hold a character
	int i=0, row = 0, col = 0, pln = 0;	// array iterators

	char ***local_array;		   
	char **local_array_2nd;		   
	char *local_array_pointer; 

	char ***local_array_copy;		   
	char **local_array_copy_2nd;		   
	char *local_array_copy_pointer; 

	char ***temp, *temp_pointer;
	
	int file_open_error;
	int command_line_incomplete = 0;

	int grid_size[3] 	  = {0,0,0};
	int proc_size[3] 	  = {0,0,0};
	int local_size[3] 	  = {0,0,0};
	int remainder_size[3] = {0,0,0};
	int coords[3] 		  = {0,0,0};
	int start_indices[3]  = {0,0,0};
	int periods[3]        = {0,0,0};
	int mem_size[3]       = {0,0,0};
	
	MPI_Status status;
	MPI_Datatype filetype, memtype;
	MPI_File fh;
	
	MPI_Init(&argc, &argv);
	MPI_Comm_size(MPI_COMM_WORLD, &processors);	
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);	

	// Interpret the command line arguments --------------------------------
  	if (rank == 0) {  	  	
		
		if (argc < 7 || argc > 8) {
			fputs("usage: x y z j k l r\n", stderr);
			fputs("where: x,y,z = x, y and z dimensions\n", stderr);
			fputs("       j = how many times the game of life is played\n", stderr);
			fputs("       k = checkpoint every k iterations\n", stderr);
			fputs("       l = number of random samples per grid point\n", stderr);
			fputs("       r = resume from the last checkpoint\n", stderr);
			fputs(INITIAL, stderr);
			fputs(" must be present.\n", stderr);
			fputs(CHECKPOINT, stderr);
			fputs(" must be present if resuming from the last checkpoint.\n", stderr);
			exit(EXIT_FAILURE);
		}

  	}

	j = (int) strtol(argv[4], NULL, 10);
	k = (int) strtol(argv[5], NULL, 10);
	l = (int) strtol(argv[6], NULL, 10);		
	if ( argc == 8 )
		if ( argv[7][0] == 'r' )
			checkpoint_resume = 1;

	if (rank == 0)
		printf("%d iterations \ncheckpoint every %d iterations \n%d samples per grid point \ncheckpoint resume = %d\n", j,k,l,checkpoint_resume);				
	
	grid_size[0] = (int) strtol(argv[1], NULL, 10);
	grid_size[1] = (int) strtol(argv[2], NULL, 10);
	grid_size[2] = (int) strtol(argv[3], NULL, 10);
	if (rank==0) printf("grid_size: %d, %d, %d\n", grid_size[0], grid_size[1], grid_size[2]);

	MPI_Dims_create(processors, 3, proc_size);
	if (rank==0) printf("proc_size: %d, %d, %d\n", proc_size[0], proc_size[1], proc_size[2]);

	local_size[0] = grid_size[0] / proc_size[0];
	local_size[1] = grid_size[1] / proc_size[1];
	local_size[2] = grid_size[2] / proc_size[2];
	if (rank==0) printf("local_size: %d, %d, %d\n", local_size[0], local_size[1], local_size[2]);

	remainder_size[0] = grid_size[0] % proc_size[0];
	remainder_size[1] = grid_size[1] % proc_size[1];
	remainder_size[2] = grid_size[2] % proc_size[2];
	if (rank==0) printf("remainder_size: %d, %d, %d\n", remainder_size[0], remainder_size[1], remainder_size[2]);
	if (remainder_size[0] != 0 || remainder_size[1] != 0 || remainder_size[2] != 0) {
		fputs("remainder size != 0, check your dimensions", stderr);
		MPI_Finalize();
		exit(EXIT_FAILURE);
	}

	MPI_Comm comm;
	MPI_Cart_create(MPI_COMM_WORLD, 3, proc_size, periods, 0, &comm);
	MPI_Comm_rank(comm, &rank);
	MPI_Cart_coords(comm, rank, 3, coords);

	start_indices[0] = coords[0] * local_size[0];
	start_indices[1] = coords[1] * local_size[1];
	start_indices[2] = coords[2] * local_size[2];

/*	printf("A coords R%d: (%d, %d, %d)  (%d, %d, %d)\n", rank, coords[0], coords[1], coords[2], start_indices[0], start_indices[1], start_indices[2]);*/
	fflush(stdout);
	
	// create the file type ---------------------------------------------------
	MPI_Type_create_subarray(3, grid_size, local_size, start_indices, MPI_ORDER_C, MPI_CHAR, &filetype); 
	MPI_Type_commit(&filetype);
	
	// create a local memory type with ghost rows -----------------------------
	mem_size[0] = local_size[0] + 2; 
	mem_size[1] = local_size[1] + 2; 
	mem_size[2] = local_size[2] + 2; 
	start_indices[0] = start_indices[1] = start_indices[2] = 1;
	
	MPI_Type_create_subarray(3, mem_size, local_size, start_indices, MPI_ORDER_C, MPI_CHAR, &memtype);
	MPI_Type_commit(&memtype);
	
	// find my neighbors ------------------------------------------------------

	int nxminus, nxplus, nyminus, nyplus, nzminus, nzplus, tag = 333, *neighbors;

	// Neighbors Array:  row-  col-  col+  row+  plane-  plane+

	neighbors = (int *) malloc(6 * sizeof(int));
	for(i=0; i<6; i++)
		neighbors[i] = rank;

	MPI_Cart_shift(comm, 0, 1, &nxminus, &nxplus);
	MPI_Cart_shift(comm, 1, 1, &nyminus, &nyplus);
	MPI_Cart_shift(comm, 2, 1, &nzminus, &nzplus);

//	printf(" %d sending south to %d receiving from %d \n",rank,nxplus,nxminus);
//	fflush(stdout);
	MPI_Sendrecv(&rank, 1, MPI_INT, nxplus, tag, 
		&(neighbors[0]), 1, MPI_INT, nxminus, tag, comm, &status);

//	printf(" %d sending North to %d receiving from %d \n",rank,nxminus,nxplus);
//	fflush(stdout);
	MPI_Sendrecv(&rank, 1, MPI_INT, nxminus, tag, 
		&(neighbors[3]), 1, MPI_INT, nxplus, tag, comm, &status);

//	printf(" %d sending East to %d receiving from %d \n",rank,nyplus,nyminus);
//	fflush(stdout);
	MPI_Sendrecv(&rank, 1, MPI_INT, nyplus, tag, 
		&neighbors[1], 1, MPI_INT, nyminus, tag, comm, &status);

//	printf(" %d sending West to %d receiving from %d \n",rank,nyminus,nyplus);
//	fflush(stdout);
	MPI_Sendrecv(&rank, 1, MPI_INT, nyminus, tag, 
		&neighbors[2], 1, MPI_INT, nyplus, tag, comm, &status);

//	printf(" %d sending backwards to %d receiving from %d \n",rank,nzplus,nzminus);
//	fflush(stdout);
	MPI_Sendrecv(&rank, 1, MPI_INT, nzplus, tag, 
		&(neighbors[4]), 1, MPI_INT, nzminus, tag, comm, &status);

//	printf(" %d sending forward to %d receiving from %d \n",rank,nzminus,nzplus);
//	fflush(stdout);
	MPI_Sendrecv(&rank, 1, MPI_INT, nzminus, tag, 
		&(neighbors[5]), 1, MPI_INT, nzplus, tag, comm, &status);

/*	printf("neighboors R%d : (row-) %d (col-) %d (col+) %d (row+) %d (plane-) %d (plane+) %d\n",rank,neighbors[0],neighbors[1],neighbors[2],neighbors[3],neighbors[4],neighbors[5]);*/
	fflush(stdout);	

	//init_sprng(1,time(0),SPRNG_DEFAULT);
	srand((unsigned int)time(NULL));
		
	// Open the initial condition (checkpoint or not) ----------------------

	if ( checkpoint_resume ) {
		file_open_error = 
		MPI_File_open(MPI_COMM_WORLD, CHECKPOINT, MPI_MODE_CREATE | MPI_MODE_RDWR, MPI_INFO_NULL, &fh);
		MPI_File_set_view(fh,0, MPI_CHAR, filetype, "native", MPI_INFO_NULL);
	}
	else {
		file_open_error = 
		MPI_File_open(MPI_COMM_WORLD, INITIAL, MPI_MODE_CREATE | MPI_MODE_RDWR, MPI_INFO_NULL, &fh);
		MPI_File_set_view(fh,0, MPI_CHAR, filetype, "native", MPI_INFO_NULL);
	}
	if (file_open_error != MPI_SUCCESS) {
		if (checkpoint_resume)
			fputs(CHECKPOINT, stderr);
		else
			fputs(INITIAL, stderr);
		fputs(" could not be opened.\n", stderr);
		exit(EXIT_FAILURE);
	}

	// Allocate and Populate the local array ----------------------------------
	
	local_array_copy_pointer = (char *)   malloc(mem_size[0] * mem_size[1] * mem_size[2] * sizeof(char));
	local_array_copy_2nd     = (char **)  malloc(mem_size[0] * mem_size[1] * sizeof(char*));
	local_array_copy         = (char ***) malloc(mem_size[0] * sizeof(char*));
	for(i = 0; i < mem_size[0] * mem_size[1]; i++)
		local_array_copy_2nd[i] = &local_array_copy_pointer[i * mem_size[2]];
	for(i = 0; i < mem_size[0]; i++)
		local_array_copy[i] = &local_array_copy_2nd[i * mem_size[1]];

	local_array_pointer = (char *)   malloc(mem_size[0] * mem_size[1] * mem_size[2] * sizeof(char));
	local_array_2nd  	= (char **)  malloc(mem_size[0] * mem_size[1] * sizeof(char*));
	local_array			= (char ***) malloc(mem_size[0] * sizeof(char*));
	for(i = 0; i < mem_size[0] * mem_size[1]; i++)
		local_array_2nd[i] = &local_array_pointer[i * mem_size[2]];
	for(i = 0; i < mem_size[0]; i++)
		local_array[i] = &local_array_2nd[i * mem_size[1]];
	
	// if (rank==0) printf("Malloc complete\n");
	
	for(row=0; row<mem_size[0]; row++) {
		for(col=0; col<mem_size[1]; col++) {
			for(pln=0; pln<mem_size[2]; pln++) {
				local_array[row][col][pln] = local_array_copy[row][col][pln] = '0';
			}
		}
	}
	
	// if (rank==0) printf("Setup complete\n");

	MPI_File_read_all(fh, local_array_pointer, 1, memtype, &status);

	if (rank==0) printf("File Read\n");
	
//	if (rank==0) {
//	for(row=0; row<mem_size[0]; row++) {
//		for(col=0; col<mem_size[1]; col++) {
//			for(pln=0; pln<mem_size[2]; pln++) {
//				printf("%c", local_array[row][col][pln]);
//			}
//			printf("\n");
//		}
//		printf("-----------------------\n");
//	}
//	}
	
	MPI_File_close(&fh);
		
	// Construct the plane data types
	
	MPI_Datatype yzplane;
	MPI_Type_vector(local_size[1], local_size[2], local_size[2]+2, MPI_CHAR, &yzplane);
	MPI_Type_commit(&yzplane);

	MPI_Datatype xzplane;
	MPI_Type_vector(local_size[0], local_size[2], ((local_size[2]+2)*local_size[1])+((local_size[2]+2)*2), MPI_CHAR, &xzplane);
	MPI_Type_commit(&xzplane);

	// this type also copies the corner x columns, since blocks cannot be skipped intermittently;
	// as we are not using the corner data, that is acceptable
	MPI_Datatype xyplane; 
	MPI_Type_vector((local_size[0]*local_size[1])+((local_size[0]*2)-2), 1, local_size[2]+2, MPI_CHAR, &xyplane);
	MPI_Type_commit(&xyplane);
					
	MPI_Barrier(comm);
	
	// start the iteration loop
	
	int iterations;
	int kCounter = k;
	for (iterations = 0; iterations < j; iterations++) {

		// send updated planes
		// Neighbors Array:  
		// 0     1     2     3     4       5
		// row-  col-  col+  row+  plane-  plane+
		// Note: corners are not handled
		
		// send top yzplane
		if (rank != neighbors[0]) MPI_Send(&local_array[1][1][1], 1, yzplane, neighbors[0], 0, comm);
		// recv bottom yzplane
		if (rank != neighbors[3]) MPI_Recv(&local_array[local_size[0]+1][1][1], 1, yzplane, neighbors[3], 0, comm, &status);

		// send bottom yzplane
		if (rank != neighbors[3]) MPI_Send(&local_array[local_size[0]][1][1], 1, yzplane, neighbors[3], 0, comm);
		// recv top yzplane
		if (rank != neighbors[0]) MPI_Recv(&local_array[0][1][1], 1, yzplane, neighbors[0], 0, comm, &status);

		// send left xzplane
		if (rank != neighbors[1]) MPI_Send(&local_array[1][1][1], 1, xzplane, neighbors[1], 0, comm);
		// recv right xzplane
		if (rank != neighbors[2]) MPI_Recv(&local_array[1][local_size[1]+1][1], 1, xzplane, neighbors[2], 0, comm, &status);

		// send right xzplane
		if (rank != neighbors[2]) MPI_Send(&local_array[1][local_size[1]][1], 1, xzplane, neighbors[2], 0, comm);
		// recv left xzplane
		if (rank != neighbors[1]) MPI_Recv(&local_array[1][0][1], 1, xzplane, neighbors[1], 0, comm, &status);

		// send front xyplane
		if (rank != neighbors[4]) MPI_Send(&local_array[1][1][1], 1, xyplane, neighbors[4], 0, comm);
		// recv back xyplane
		if (rank != neighbors[5]) MPI_Recv(&local_array[1][1][local_size[2]+1], 1, xyplane, neighbors[5], 0, comm, &status);

		// send back xyplane
		if (rank != neighbors[5]) MPI_Send(&local_array[1][1][local_size[2]], 1, xyplane, neighbors[5], 0, comm);
		// recv front xyplane
		if (rank != neighbors[4]) MPI_Recv(&local_array[1][1][0], 1, xyplane, neighbors[4], 0, comm, &status);

//		if (rank==0) {
//		for(row=0; row<mem_size[0]; row++) {
//			for(col=0; col<mem_size[1]; col++) {
//				for(pln=0; pln<mem_size[2]; pln++) {
//					printf("%c", local_array[row][col][pln]);
//				}
//				printf("\n");
//			}
//			printf("-----------------------\n");
//		}
//		}
		
		// run the game of life
		
		// gameOfLife(local_array, local_array_copy, local_size[0], local_size[1], l, rank);
		
		// swap the arrays
		
//		temp1 = local_array;
//		local_array = local_array_copy;
//		local_array_copy = temp1;
//
//		temp2 = local_array_pointer;
//		local_array_pointer = local_array_copy_pointer;
//		local_array_copy_pointer = temp2;		
	
		// check to see if this iteration needs a checkpoint
		
		kCounter--;
		if (kCounter == 0) {
			kCounter = k;
			
			// checkpoint code
			
			MPI_File_open(MPI_COMM_WORLD, CHECKPOINT, MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &fh);
			MPI_File_set_view(fh, 0, MPI_CHAR, filetype, "native", MPI_INFO_NULL);
			
			MPI_File_write_all(fh, local_array_pointer, 1, memtype, &status);
			
			MPI_File_close(&fh); 	

			if (rank == 0)
				printf("Checkpoint made: Iteration %d\n", iterations+1);
			
		} // end if kCounter == 0 
	} // end iteration loop	
	iterations--;
	
	// all done! repeat the checkpoint process
	
	MPI_File_open(MPI_COMM_WORLD, FINAL_RESULTS, MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &fh);
	MPI_File_set_view(fh, 0, MPI_CHAR, filetype, "native", MPI_INFO_NULL);
	
	MPI_File_write_all(fh, local_array_pointer, 1, memtype, &status);
	
	MPI_File_close(&fh); 	

	if (rank == 0)
		printf("Final Results made: Iteration %d\n", iterations+1);
	
	time2 = MPI_Wtime();
	if (rank == 0)
	    printf("Elapsed Seconds: %f\n", time2-time1);fflush(stdout);
	
	MPI_Finalize(); 
	return EXIT_SUCCESS; 
}
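
The example builds one subarray type describing its block of the global file and a second one describing the interior of a ghost-padded local buffer, then reads collectively with MPI-IO. The sketch below shows the same idea in 2-D; the grid size, the file name grid.dat, and the row-major rank-to-coordinate mapping are assumptions made for illustration.

/* subarray_io.c - sketch of reading one rank's block of a global grid into
 * a ghost-padded local buffer via MPI_Type_create_subarray and MPI-IO.
 * Assumes the process grid divides the 8x8 global grid (e.g. run with 4 ranks). */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    int rank, nprocs;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    int gsize[2]  = { 8, 8 };                  /* global grid (assumed) */
    int psize[2]  = { 0, 0 };
    MPI_Dims_create(nprocs, 2, psize);

    int lsize[2]  = { gsize[0] / psize[0], gsize[1] / psize[1] };
    int coords[2] = { rank / psize[1], rank % psize[1] };
    int gstart[2] = { coords[0] * lsize[0], coords[1] * lsize[1] };

    /* block of the global file owned by this rank */
    MPI_Datatype filetype;
    MPI_Type_create_subarray(2, gsize, lsize, gstart,
                             MPI_ORDER_C, MPI_CHAR, &filetype);
    MPI_Type_commit(&filetype);

    /* interior of a local buffer with one ghost layer on every side */
    int msize[2]  = { lsize[0] + 2, lsize[1] + 2 };
    int mstart[2] = { 1, 1 };
    MPI_Datatype memtype;
    MPI_Type_create_subarray(2, msize, lsize, mstart,
                             MPI_ORDER_C, MPI_CHAR, &memtype);
    MPI_Type_commit(&memtype);

    char *local = malloc((size_t)msize[0] * msize[1]);

    MPI_File fh;
    if (MPI_File_open(MPI_COMM_WORLD, "grid.dat", MPI_MODE_RDONLY,
                      MPI_INFO_NULL, &fh) == MPI_SUCCESS) {
        MPI_File_set_view(fh, 0, MPI_CHAR, filetype, "native", MPI_INFO_NULL);
        MPI_File_read_all(fh, local, 1, memtype, MPI_STATUS_IGNORE);
        MPI_File_close(&fh);
        if (rank == 0) printf("collective read done\n");
    } else if (rank == 0) {
        printf("grid.dat not found; the datatypes were still built\n");
    }

    free(local);
    MPI_Type_free(&filetype);
    MPI_Type_free(&memtype);
    MPI_Finalize();
    return 0;
}
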
Beispiel #14
0
void subfind_contamination(void)
{
    int i, j, ndone, ndone_flag, dummy, count;
    int ngrp, sendTask, recvTask, place, nexport, nimport;
    struct unbind_data *d;

    d = (struct unbind_data *) mymalloc(NumPart * sizeof(struct unbind_data));

    for(i = 0, count = 0; i < NumPart; i++)
#ifdef DENSITY_SPLIT_BY_TYPE
        if(!((1 << P[i].Type) & (DENSITY_SPLIT_BY_TYPE)))
#else
        if(!((1 << P[i].Type) & (FOF_PRIMARY_LINK_TYPES)))
#endif
            d[count++].index = i;

    force_treebuild(count, d);	/* construct tree only with boundary particles */

    myfree(d);


    /* allocate buffers to arrange communication */

    All.BunchSize =
        (int) ((All.BufferSize * 1024 * 1024) / (sizeof(struct data_index) + sizeof(struct data_nodelist) +
                sizeof(struct contamdata_in) + sizeof(struct contamdata_out) +
                sizemax(sizeof(struct contamdata_in),
                        sizeof(struct contamdata_out))));
    DataIndexTable = (struct data_index *) mymalloc(All.BunchSize * sizeof(struct data_index));
    DataNodeList = (struct data_nodelist *) mymalloc(All.BunchSize * sizeof(struct data_nodelist));


    /* we will repeat the whole thing for those groups where we didn't converge to a SO radius yet */

    i = 0;			/* begin with this index */

    do
    {
        for(j = 0; j < NTask; j++)
        {
            Send_count[j] = 0;
            Exportflag[j] = -1;
        }

        /* do local particles and prepare export list */

        for(nexport = 0; i < Ngroups; i++)
        {
            if(Group[i].R_Mean200 > 0)
            {
                if(subfind_contamination_evaluate(i, 0, &nexport, Send_count) < 0)
                    break;
            }
            else
            {
                Group[i].ContaminationLen = 0;
                Group[i].ContaminationMass = 0;
            }
        }

        qsort(DataIndexTable, nexport, sizeof(struct data_index), data_index_compare);

        MPI_Allgather(Send_count, NTask, MPI_INT, Sendcount_matrix, NTask, MPI_INT, MPI_COMM_WORLD);

        for(j = 0, nimport = 0, Recv_offset[0] = 0, Send_offset[0] = 0; j < NTask; j++)
        {
            Recv_count[j] = Sendcount_matrix[j * NTask + ThisTask];
            nimport += Recv_count[j];

            if(j > 0)
            {
                Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1];
                Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1];
            }
        }

        ContamGet = (struct contamdata_in *) mymalloc(nimport * sizeof(struct contamdata_in));
        ContamIn = (struct contamdata_in *) mymalloc(nexport * sizeof(struct contamdata_in));

        /* prepare particle data for export */
        for(j = 0; j < nexport; j++)
        {
            place = DataIndexTable[j].Index;

            ContamIn[j].Pos[0] = Group[place].Pos[0];
            ContamIn[j].Pos[1] = Group[place].Pos[1];
            ContamIn[j].Pos[2] = Group[place].Pos[2];
            ContamIn[j].R200 = Group[place].R_Mean200;

            memcpy(ContamIn[j].NodeList,
                   DataNodeList[DataIndexTable[j].IndexGet].NodeList, NODELISTLENGTH * sizeof(int));
        }

        /* exchange data */
        for(ngrp = 1; ngrp < (1 << PTask); ngrp++)
        {
            sendTask = ThisTask;
            recvTask = ThisTask ^ ngrp;

            if(recvTask < NTask)
            {
                if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
                {
                    /* get the data */
                    MPI_Sendrecv(&ContamIn[Send_offset[recvTask]],
                                 Send_count[recvTask] * sizeof(struct contamdata_in), MPI_BYTE,
                                 recvTask, TAG_DENS_A,
                                 &ContamGet[Recv_offset[recvTask]],
                                 Recv_count[recvTask] * sizeof(struct contamdata_in), MPI_BYTE,
                                 recvTask, TAG_DENS_A, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                }
            }
        }

        myfree(ContamIn);
        ContamResult = (struct contamdata_out *) mymalloc(nimport * sizeof(struct contamdata_out));
        ContamOut = (struct contamdata_out *) mymalloc(nexport * sizeof(struct contamdata_out));


        /* now do the locations that were sent to us */
        for(j = 0; j < nimport; j++)
            subfind_contamination_evaluate(j, 1, &dummy, &dummy);

        if(i >= Ngroups)
            ndone_flag = 1;
        else
            ndone_flag = 0;

        MPI_Allreduce(&ndone_flag, &ndone, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

        /* get the result */
        for(ngrp = 1; ngrp < (1 << PTask); ngrp++)
        {
            sendTask = ThisTask;
            recvTask = ThisTask ^ ngrp;
            if(recvTask < NTask)
            {
                if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
                {
                    /* send the results */
                    MPI_Sendrecv(&ContamResult[Recv_offset[recvTask]],
                                 Recv_count[recvTask] * sizeof(struct contamdata_out),
                                 MPI_BYTE, recvTask, TAG_DENS_B,
                                 &ContamOut[Send_offset[recvTask]],
                                 Send_count[recvTask] * sizeof(struct contamdata_out),
                                 MPI_BYTE, recvTask, TAG_DENS_B, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                }
            }
        }

        /* add the result to the local particles */
        for(j = 0; j < nexport; j++)
        {
            place = DataIndexTable[j].Index;
            Group[place].ContaminationLen += ContamOut[j].ContaminationLen;
            Group[place].ContaminationMass += ContamOut[j].ContaminationMass;
        }

        myfree(ContamOut);
        myfree(ContamResult);
        myfree(ContamGet);
    }
    while(ndone < NTask);


    myfree(DataNodeList);
    myfree(DataIndexTable);
}
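
subfind_contamination() first gathers every task's Send_count row into a count matrix, reads its own column back as Recv_count, and derives the send/receive offsets from prefix sums. Here is a self-contained sketch of just that bookkeeping, with a toy workload standing in for the group data.

/* export_counts.c - sketch of the count/offset bookkeeping used above:
 * every task announces how many items it sends to every other task, then
 * the send/receive offsets are derived from the exchanged count matrix. */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    int ThisTask, NTask;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &ThisTask);
    MPI_Comm_size(MPI_COMM_WORLD, &NTask);

    int *Send_count   = calloc(NTask, sizeof(int));
    int *Recv_count   = calloc(NTask, sizeof(int));
    int *Send_offset  = calloc(NTask, sizeof(int));
    int *Recv_offset  = calloc(NTask, sizeof(int));
    int *count_matrix = malloc((size_t)NTask * NTask * sizeof(int));

    /* toy workload: send (partner + 1) items to every other task */
    for (int j = 0; j < NTask; j++)
        Send_count[j] = (j == ThisTask) ? 0 : j + 1;

    /* row i of count_matrix holds task i's Send_count[] */
    MPI_Allgather(Send_count, NTask, MPI_INT,
                  count_matrix, NTask, MPI_INT, MPI_COMM_WORLD);

    int nimport = 0, nexport = 0;
    for (int j = 0; j < NTask; j++) {
        Recv_count[j] = count_matrix[j * NTask + ThisTask];
        nimport += Recv_count[j];
        nexport += Send_count[j];
        if (j > 0) {
            Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1];
            Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1];
        }
    }

    printf("task %d: exporting %d items, importing %d items\n",
           ThisTask, nexport, nimport);

    free(Send_count); free(Recv_count);
    free(Send_offset); free(Recv_offset); free(count_matrix);
    MPI_Finalize();
    return 0;
}
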
Beispiel #15
0
double runBicomTest(int procs, int bufsize, int iters, int rank, 
                    int wsize, int procsPerNode, char allocPattern, 
                    int useBarrier, int useNearestRank, MPI_Comm *activeComm)
{
  int i, currtarg;
  double start, diff, max;
  char *sendbuf, *recvbuf;
  double totalops;
  MPI_Status stat;

  currtarg = getTargetRank(rank, wsize, procsPerNode, useNearestRank);
  diff     = 0;

  if ( isActiveProc(rank, wsize, procsPerNode, procs, 
                    allocPattern, useNearestRank) ) 
  {
    sendbuf  = (char*)malloc(bufsize);
    recvbuf  = (char*)malloc(bufsize);

    memset(sendbuf, 0, bufsize);
    memset(recvbuf, 0, bufsize);

    /*  Ensure communication paths have been initialized  */
    MPI_Sendrecv(sendbuf, bufsize, MPI_CHAR, currtarg, 0, 
                 recvbuf, bufsize, MPI_CHAR, currtarg, 0, 
                 MPI_COMM_WORLD, &stat);

    //generic_barrier(*activeComm);
    MPI_Barrier(*activeComm);
    //generic_barrier(*activeComm);
    MPI_Barrier(*activeComm);

    /*  Time operation loop  */
    start = MPI_Wtime();

    for ( i = 0; i < iters; i++ )
    {
      MPI_Sendrecv(sendbuf, bufsize, MPI_CHAR, currtarg, 0, 
                   recvbuf, bufsize, MPI_CHAR, currtarg, 0, 
                   MPI_COMM_WORLD, &stat);
    }

    if ( useBarrier )
      MPI_Barrier(*activeComm);
      //generic_barrier(*activeComm);

    diff = MPI_Wtime() - start;

    free(sendbuf);
    free(recvbuf);
  }

  MPI_Barrier(MPI_COMM_WORLD);

  /*  Get maximum sample length  */
  MPI_Reduce(&diff, &max, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

  if ( rank == 0 )
  {
    totalops   = (double)(iters * procs);

    /*
      Bandwidth is calculated as :
      
        (# of processes * operations per time sample * message size)
        ------------------------------------------------------------
                  maximum sample length of all processes
    */

    if ( max > 0 )
    {
      printf(outputFormat, "Bidirectional", procs, bufsize, 
             ((double)totalops*(double)bufsize/max)/1000000,
             max/iters*1000000);
    }
    else
    {
      printf("Invalid measurement.  Increase number of operation per measurement.\n");
    }
  }

  return max;
}
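
The comment above spells out how the bidirectional bandwidth is derived from the slowest rank's sample. A small stand-alone helper that applies the same formula to made-up numbers (no MPI involved):

/* bw_calc.c - sketch of the bandwidth formula from the comment above:
 * (processes * iterations * message size) / maximum sample length.
 * The sample numbers in main() are invented for illustration. */
#include <stdio.h>

static double bidir_bandwidth_mbs(int procs, int iters, int bufsize,
                                  double max_seconds)
{
    double totalops = (double)iters * (double)procs;
    return (totalops * (double)bufsize / max_seconds) / 1.0e6;   /* MB/s */
}

int main(void)
{
    int    procs = 16, iters = 1000, bufsize = 1 << 20;   /* 1 MiB messages */
    double max_seconds = 2.5;                             /* slowest rank   */

    printf("%d procs, %d iters, %d bytes -> %.1f MB/s, %.1f us/op\n",
           procs, iters, bufsize,
           bidir_bandwidth_mbs(procs, iters, bufsize, max_seconds),
           max_seconds / iters * 1.0e6);
    return 0;
}
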
Beispiel #16
0
void CSysMatrix::SendReceive_Solution(CSysVector & x, CGeometry *geometry, CConfig *config) {
  
  unsigned short iVar, iMarker, MarkerS, MarkerR;
  unsigned long iVertex, iPoint, nVertexS, nVertexR, nBufferS_Vector, nBufferR_Vector;
  double *Buffer_Receive = NULL, *Buffer_Send = NULL;
  int send_to, receive_from;
  
#ifdef HAVE_MPI
  MPI_Status status;
#endif
  
  for (iMarker = 0; iMarker < config->GetnMarker_All(); iMarker++) {
    
    if ((config->GetMarker_All_KindBC(iMarker) == SEND_RECEIVE) &&
        (config->GetMarker_All_SendRecv(iMarker) > 0)) {
      
      MarkerS = iMarker;  MarkerR = iMarker+1;
      
      send_to = config->GetMarker_All_SendRecv(MarkerS)-1;
      receive_from = abs(config->GetMarker_All_SendRecv(MarkerR))-1;
      
      nVertexS = geometry->nVertex[MarkerS];  nVertexR = geometry->nVertex[MarkerR];
      nBufferS_Vector = nVertexS*nVar;        nBufferR_Vector = nVertexR*nVar;
      
      /*--- Allocate Receive and send buffers  ---*/
      
      Buffer_Receive = new double [nBufferR_Vector];
      Buffer_Send = new double[nBufferS_Vector];
      
      /*--- Copy the solution that should be sent ---*/
      
      for (iVertex = 0; iVertex < nVertexS; iVertex++) {
        iPoint = geometry->vertex[MarkerS][iVertex]->GetNode();
        for (iVar = 0; iVar < nVar; iVar++)
          Buffer_Send[iVertex*nVar+iVar] = x[iPoint*nVar+iVar];
      }
      
#ifdef HAVE_MPI
      
      /*--- Send/Receive information using Sendrecv ---*/
	  MPI_Sendrecv(Buffer_Send, nBufferS_Vector, MPI_DOUBLE, send_to, 0,
                   Buffer_Receive, nBufferR_Vector, MPI_DOUBLE, receive_from, 0, MPI_COMM_WORLD, &status);

#else
      
      /*--- Receive information without MPI ---*/
      
      for (iVertex = 0; iVertex < nVertexR; iVertex++) {
        iPoint = geometry->vertex[MarkerR][iVertex]->GetNode();
        for (iVar = 0; iVar < nVar; iVar++)
          Buffer_Receive[iVar*nVertexR+iVertex] = Buffer_Send[iVar*nVertexR+iVertex];
      }
      
#endif
      
      /*--- Deallocate send buffer ---*/
      
      delete [] Buffer_Send;
      
      /*--- Do the coordinate transformation ---*/
      
      for (iVertex = 0; iVertex < nVertexR; iVertex++) {
        
        /*--- Find point and its type of transformation ---*/
        
        iPoint = geometry->vertex[MarkerR][iVertex]->GetNode();
        
        /*--- Copy the received conserved variables into the solution vector. ---*/
        
        for (iVar = 0; iVar < nVar; iVar++)
          x[iPoint*nVar+iVar] = Buffer_Receive[iVertex*nVar+iVar];
        
      }
      
      /*--- Deallocate receive buffer ---*/
      
      delete [] Buffer_Receive;
      
    }
    
  }
  
}
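
SendReceive_Solution() packs per-vertex values into a contiguous send buffer, exchanges it with a single MPI_Sendrecv, and unpacks the received values. A flat-array C sketch of that pack/exchange/unpack cycle, with nVar and nVertex chosen arbitrarily and the SU2 geometry classes left out:

/* pack_exchange.c - sketch of the pack / Sendrecv / unpack cycle used in
 * SendReceive_Solution, with a flat double array standing in for the
 * CSysVector; the partner ranks form a simple ring (assumption). */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    const int nVar = 3, nVertex = 5;   /* sizes chosen for illustration */
    int rank, size;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    int send_to      = (rank + 1) % size;
    int receive_from = (rank - 1 + size) % size;

    double *x           = malloc(sizeof(double) * nVar * nVertex);
    double *Buffer_Send = malloc(sizeof(double) * nVar * nVertex);
    double *Buffer_Recv = malloc(sizeof(double) * nVar * nVertex);

    for (int i = 0; i < nVar * nVertex; i++)
        x[i] = rank * 100.0 + i;

    /* pack: copy the solution values that should be sent */
    for (int iVertex = 0; iVertex < nVertex; iVertex++)
        for (int iVar = 0; iVar < nVar; iVar++)
            Buffer_Send[iVertex * nVar + iVar] = x[iVertex * nVar + iVar];

    /* exchange */
    MPI_Sendrecv(Buffer_Send, nVar * nVertex, MPI_DOUBLE, send_to,      0,
                 Buffer_Recv, nVar * nVertex, MPI_DOUBLE, receive_from, 0,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);

    /* unpack: overwrite the halo part of the solution (here: all of it) */
    for (int iVertex = 0; iVertex < nVertex; iVertex++)
        for (int iVar = 0; iVar < nVar; iVar++)
            x[iVertex * nVar + iVar] = Buffer_Recv[iVertex * nVar + iVar];

    if (rank == 0)
        printf("rank 0 now holds values from rank %d (x[0]=%g)\n",
               receive_from, x[0]);

    free(x); free(Buffer_Send); free(Buffer_Recv);
    MPI_Finalize();
    return 0;
}
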
void dd_sendrecv2_rvec(const gmx_domdec_t gmx_unused *dd,
                       int gmx_unused ddimind,
                       rvec gmx_unused *buf_s_fw, int gmx_unused n_s_fw,
                       rvec gmx_unused *buf_r_fw, int gmx_unused n_r_fw,
                       rvec gmx_unused *buf_s_bw, int gmx_unused n_s_bw,
                       rvec gmx_unused *buf_r_bw, int gmx_unused n_r_bw)
{
#ifdef GMX_MPI
    int         rank_fw, rank_bw, nreq;
    MPI_Request req[4];
    MPI_Status  stat[4];

    rank_fw = dd->neighbor[ddimind][0];
    rank_bw = dd->neighbor[ddimind][1];

    if (!dd->bSendRecv2)
    {
        /* Try to send and receive in two directions simultaneously.
         * Should be faster, especially on machines
         * with full 3D communication networks.
         * However, it could be that communication libraries are
         * optimized for MPI_Sendrecv and non-blocking MPI calls
         * are slower.
         * SendRecv2 can be turned on with the env.var. GMX_DD_SENDRECV2
         */
        nreq = 0;
        if (n_r_fw)
        {
            MPI_Irecv(buf_r_fw[0], n_r_fw*sizeof(rvec), MPI_BYTE,
                      rank_bw, 0, dd->mpi_comm_all, &req[nreq++]);
        }
        if (n_r_bw)
        {
            MPI_Irecv(buf_r_bw[0], n_r_bw*sizeof(rvec), MPI_BYTE,
                      rank_fw, 1, dd->mpi_comm_all, &req[nreq++]);
        }
        if (n_s_fw)
        {
            MPI_Isend(buf_s_fw[0], n_s_fw*sizeof(rvec), MPI_BYTE,
                      rank_fw, 0, dd->mpi_comm_all, &req[nreq++]);
        }
        if (n_s_bw)
        {
            MPI_Isend(buf_s_bw[0], n_s_bw*sizeof(rvec), MPI_BYTE,
                      rank_bw, 1, dd->mpi_comm_all, &req[nreq++]);
        }
        if (nreq)
        {
            MPI_Waitall(nreq, req, stat);
        }
    }
    else
    {
        /* Communicate in two ordered phases.
         * This is slower, even on a dual-core Opteron cluster
         * with a single full-duplex network connection per machine.
         */
        /* Forward */
        MPI_Sendrecv(buf_s_fw[0], n_s_fw*sizeof(rvec), MPI_BYTE, rank_fw, 0,
                     buf_r_fw[0], n_r_fw*sizeof(rvec), MPI_BYTE, rank_bw, 0,
                     dd->mpi_comm_all, &stat[0]);
        /* Backward */
        MPI_Sendrecv(buf_s_bw[0], n_s_bw*sizeof(rvec), MPI_BYTE, rank_bw, 0,
                     buf_r_bw[0], n_r_bw*sizeof(rvec), MPI_BYTE, rank_fw, 0,
                     dd->mpi_comm_all, &stat[0]);
    }
#endif
}
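
dd_sendrecv2_rvec() chooses between four nonblocking calls completed by MPI_Waitall and two ordered MPI_Sendrecv phases. Below is a reduced sketch of both strategies using plain int buffers instead of rvecs; the neighbour layout is a simple periodic ring, which is an assumption rather than the domain-decomposition topology used by GROMACS.

/* twoway_exchange.c - sketch of the two strategies in dd_sendrecv2_rvec:
 * either post all four nonblocking calls and wait, or do two ordered
 * MPI_Sendrecv phases. */
#include <mpi.h>
#include <stdio.h>

#define N 8   /* elements per direction (assumption) */

int main(int argc, char **argv)
{
    int rank, size;
    int s_fw[N], s_bw[N], r_fw[N], r_bw[N];

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    int rank_fw = (rank + 1) % size;          /* forward neighbour  */
    int rank_bw = (rank - 1 + size) % size;   /* backward neighbour */
    for (int i = 0; i < N; i++) { s_fw[i] = rank; s_bw[i] = -rank; }

    int use_nonblocking = 1;                  /* flip to compare strategies */
    if (use_nonblocking) {
        MPI_Request req[4];
        MPI_Irecv(r_fw, N, MPI_INT, rank_bw, 0, MPI_COMM_WORLD, &req[0]);
        MPI_Irecv(r_bw, N, MPI_INT, rank_fw, 1, MPI_COMM_WORLD, &req[1]);
        MPI_Isend(s_fw, N, MPI_INT, rank_fw, 0, MPI_COMM_WORLD, &req[2]);
        MPI_Isend(s_bw, N, MPI_INT, rank_bw, 1, MPI_COMM_WORLD, &req[3]);
        MPI_Waitall(4, req, MPI_STATUSES_IGNORE);
    } else {
        /* forward phase */
        MPI_Sendrecv(s_fw, N, MPI_INT, rank_fw, 0,
                     r_fw, N, MPI_INT, rank_bw, 0,
                     MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        /* backward phase */
        MPI_Sendrecv(s_bw, N, MPI_INT, rank_bw, 1,
                     r_bw, N, MPI_INT, rank_fw, 1,
                     MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }

    printf("rank %d: forward data from %d, backward data from %d\n",
           rank, r_fw[0], -r_bw[0]);
    MPI_Finalize();
    return 0;
}
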
Beispiel #18
0
int main( int argc, char **argv )
{
    MPI_Request r1;
    int         size, rank;
    int         err = 0;
    int         partner, buf[10], flag, idx, index;
    MPI_Status  status;

    MPI_Init( &argc, &argv );

    MPI_Comm_size( MPI_COMM_WORLD, &size );
    MPI_Comm_rank( MPI_COMM_WORLD, &rank );
    
    if (size < 2) {
	printf( "Cancel test requires at least 2 processes\n" );
	MPI_Abort( MPI_COMM_WORLD, 1 );
    }

    /* 
     * Here is the test.  First, we ensure an unsatisfied Irecv:
     *       process 0             process size-1
     *       Sendrecv              Sendrecv
     *       Irecv                    ----
     *       Cancel                   ----
     *       Sendrecv              Sendrecv
     * Next, we confirm receipt before canceling
     *       Irecv                 Send
     *       Sendrecv              Sendrecv
     *       Cancel
     */
    if (rank == 0) {
	partner = size - 1;
	/* Cancel succeeds for wait/waitall */
	MPI_Recv_init( buf, 10, MPI_INT, partner, 0, MPI_COMM_WORLD, &r1 );
	MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, 1,
		      MPI_BOTTOM, 0, MPI_INT, partner, 1,
		      MPI_COMM_WORLD, &status );
	MPI_Start( &r1 );
	MPI_Cancel( &r1 );
	MPI_Wait( &r1, &status );
	MPI_Test_cancelled( &status, &flag );
	MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, 1,
		      MPI_BOTTOM, 0, MPI_INT, partner, 1,
		      MPI_COMM_WORLD, &status );
	if (!flag) {
	    err++; 
	    printf( "Cancel of a receive failed where it should succeed (Wait).\n" );
	}

	MPI_Request_free( &r1 );

	/* Cancel fails for test/testall */
	buf[0] = -1;
	MPI_Recv_init( buf, 10, MPI_INT, partner, 2, MPI_COMM_WORLD, &r1 );
	MPI_Start( &r1 );
	MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, 1,
		      MPI_BOTTOM, 0, MPI_INT, partner, 1,
		      MPI_COMM_WORLD, &status );
	MPI_Cancel( &r1 );
	MPI_Test( &r1, &flag, &status );
	MPI_Test_cancelled( &status, &flag );
	if (flag) {
	    err++;
	    printf( "Cancel of a receive succeeded where it shouldn't (Test).\n" );
	    if (buf[0] != -1) {
		printf( "Receive buffer changed even though cancel suceeded! (Test).\n" );
	    }
	}
	MPI_Request_free( &r1 );

	/* Cancel succeeds for waitany */
	MPI_Recv_init( buf, 10, MPI_INT, partner, 0, MPI_COMM_WORLD, &r1 );
	MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, 1,
		      MPI_BOTTOM, 0, MPI_INT, partner, 1,
		      MPI_COMM_WORLD, &status );
	MPI_Start( &r1 );
	MPI_Cancel( &r1 );
	MPI_Waitany( 1, &r1, &idx, &status );
	MPI_Test_cancelled( &status, &flag );
	MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, 1,
		      MPI_BOTTOM, 0, MPI_INT, partner, 1,
		      MPI_COMM_WORLD, &status );
	if (!flag) {
	    err++;
	    printf( "Cancel of a receive failed where it should succeed (Waitany).\n" );
	}
	MPI_Request_free( &r1 );

	/* Cancel fails for testany */
        buf[0] = -1;
	MPI_Recv_init( buf, 10, MPI_INT, partner, 2, MPI_COMM_WORLD, &r1 );
	MPI_Start( &r1 );
	MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, 1,
		      MPI_BOTTOM, 0, MPI_INT, partner, 1,
		      MPI_COMM_WORLD, &status );
	MPI_Cancel( &r1 );
	MPI_Testany( 1, &r1, &idx, &flag, &status );
	MPI_Test_cancelled( &status, &flag );
	if (flag) {
	    err++;
	    printf( "Cancel of a receive succeeded where it shouldn't (Testany).\n" );
	    if (buf[0] != -1) {
		printf( "Receive buffer changed even though cancel suceeded! (Test).\n" );
	    }
	}
	MPI_Request_free( &r1 );

	/* Cancel succeeds for waitsome */
	MPI_Recv_init( buf, 10, MPI_INT, partner, 0, MPI_COMM_WORLD, &r1 );
	MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, 1,
		      MPI_BOTTOM, 0, MPI_INT, partner, 1,
		      MPI_COMM_WORLD, &status );
	MPI_Start( &r1 );
	MPI_Cancel( &r1 );
	MPI_Waitsome( 1, &r1, &idx, &index, &status );
	MPI_Test_cancelled( &status, &flag );
	MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, 1,
		      MPI_BOTTOM, 0, MPI_INT, partner, 1,
		      MPI_COMM_WORLD, &status );
	if (!flag) {
	    err++;
	    printf( "Cancel of a receive failed where it should succeed (Waitsome).\n" );
	}
	MPI_Request_free( &r1 );

	/* Cancel fails for testsome*/
        buf[0] = -1;
	MPI_Recv_init( buf, 10, MPI_INT, partner, 2, MPI_COMM_WORLD, &r1 );
	MPI_Start( &r1 );
	MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, 1,
		      MPI_BOTTOM, 0, MPI_INT, partner, 1,
		      MPI_COMM_WORLD, &status );
	MPI_Cancel( &r1 );
	MPI_Testsome( 1, &r1, &idx, &index, &status );
	MPI_Test_cancelled( &status, &flag );
	if (flag) {
	    err++;
	    printf( "Cancel of a receive succeeded where it shouldn't (Testsome).\n" );
	    if (buf[0] != -1) {
		printf( "Receive buffer changed even though cancel suceeded! (Testsome).\n" );
	    }
	}
	MPI_Request_free( &r1 );

	if (err) {
	    printf( "Test failed with %d errors.\n", err );
	}
	else {
	    printf( " No Errors\n" );
	}
    }

    else if (rank == size - 1) {
	partner = 0;
	/* Cancel succeeds for wait/waitall */
	MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, 1,
		      MPI_BOTTOM, 0, MPI_INT, partner, 1,
		      MPI_COMM_WORLD, &status );
	MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, 1,
		      MPI_BOTTOM, 0, MPI_INT, partner, 1,
		      MPI_COMM_WORLD, &status );
	/* Cancel fails for test/testall */
	buf[0] = 3;
	MPI_Send( buf, 3, MPI_INT, partner, 2, MPI_COMM_WORLD );
	MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, 1,
		      MPI_BOTTOM, 0, MPI_INT, partner, 1,
		      MPI_COMM_WORLD, &status );

	/* Cancel succeeds for waitany */
	MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, 1,
		      MPI_BOTTOM, 0, MPI_INT, partner, 1,
		      MPI_COMM_WORLD, &status );
	MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, 1,
		      MPI_BOTTOM, 0, MPI_INT, partner, 1,
		      MPI_COMM_WORLD, &status );
	/* Cancel fails  for testany */
	MPI_Send( buf, 3, MPI_INT, partner, 2, MPI_COMM_WORLD );
	MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, 1,
		      MPI_BOTTOM, 0, MPI_INT, partner, 1,
		      MPI_COMM_WORLD, &status );

	/* Cancel succeeds for waitsome */
	MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, 1,
		      MPI_BOTTOM, 0, MPI_INT, partner, 1,
		      MPI_COMM_WORLD, &status );
	MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, 1,
		      MPI_BOTTOM, 0, MPI_INT, partner, 1,
		      MPI_COMM_WORLD, &status );
	/* Cancel fails for testsome */
	MPI_Send( buf, 3, MPI_INT, partner, 2, MPI_COMM_WORLD );
	MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, 1,
		      MPI_BOTTOM, 0, MPI_INT, partner, 1,
		      MPI_COMM_WORLD, &status );

    /* 
       Next test - check that a cancel for a receive request from
       MPI_PROC_NULL succeeds (there is some suspicion that some
       systems can't handle this).  See also the note on MPI_REQUEST_NULL
       below.
     */
    /* A null request is an error (null objects are errors unless otherwise
       allowed), so the following is intentionally left disabled:
    r1 = MPI_REQUEST_NULL;
    MPI_Cancel( &r1 );
    */
	MPI_Recv_init( buf, 10, MPI_INT, MPI_PROC_NULL, 0, MPI_COMM_WORLD, &r1 );
	MPI_Start( &r1 );
	MPI_Cancel( &r1 );
	MPI_Request_free( &r1 );    /* Must complete cancel.  We know that it 
				       won't complete, so we don't need to do
				       anything else */
    }

    MPI_Finalize();
    return 0;
}
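
The test above exercises MPI_Cancel against every completion call. The core protocol (post a receive that will never be matched, cancel it, complete the request, then inspect the status with MPI_Test_cancelled) can be sketched on its own as follows; this is a minimal sketch, not the full test logic.

/* cancel_recv.c - minimal sketch of the cancel protocol exercised above. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rank, size, buf[10], flag;
    MPI_Request req;
    MPI_Status  status;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    if (rank == 0 && size > 1) {
        /* tag 99 is never sent by anyone, so the cancel should succeed */
        MPI_Irecv(buf, 10, MPI_INT, size - 1, 99, MPI_COMM_WORLD, &req);
        MPI_Cancel(&req);
        MPI_Wait(&req, &status);          /* completes the cancelled request */
        MPI_Test_cancelled(&status, &flag);
        printf(flag ? "receive was cancelled\n"
                    : "receive completed before the cancel\n");
    }

    MPI_Barrier(MPI_COMM_WORLD);          /* keep the other ranks around */
    MPI_Finalize();
    return 0;
}
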
Beispiel #19
0
/* ************************************************************************ */
static
void
calculate_jacobi (struct calculation_arguments const* arguments, struct calculation_results *results, struct options const* options)
{
    int i, j;                                   /* local variables for loops  */
    int m1, m2;                                 /* used as indices for old and new matrices       */
    double star;                                /* quarter of the sum of the four neighbour values */
    double residuum;                            /* residuum of current iteration                  */
    double maxresiduum;                         /* maximum residuum value of a process per iteration */

    const int nproc = arguments->nproc;
    const int rank = arguments->rank;

    int const N = arguments->N;
    int const N_global = arguments->N_global;
    double const h = arguments->h;

    int term_iteration = options->term_iteration;

    /* initialize m1 and m2 depending on algorithm */
    if (options->method == METH_JACOBI)
    {
        m1 = 0;
        m2 = 1;
    }
    else
    {
        m1 = 0;
        m2 = 0;
    }

    while (term_iteration > 0)
    {
        double** Matrix_Out = arguments->Matrix[m1];
        double** Matrix_In  = arguments->Matrix[m2];

        maxresiduum = 0;

        /* over all rows */
        for (i = 1; i < N; i++)
        {
            /* over all columns */
            for (j = 1; j < N_global; j++)
            {
                star = 0.25 * (Matrix_In[i-1][j] +
                               Matrix_In[i][j-1] +
                               Matrix_In[i][j+1] +
                               Matrix_In[i+1][j]);

                if (options->inf_func == FUNC_FPISIN)
                {
                    star += (0.25 * TWO_PI_SQUARE * h * h) *
                            sin((PI * h) * ((double)i + arguments->offset)) *
                            sin((PI * h) * (double)j);
                }

                if (options->termination == TERM_PREC || term_iteration == 1)
                {
                    residuum = Matrix_In[i][j] - star;
                    residuum = (residuum < 0) ? -residuum : residuum;
                    maxresiduum = (residuum < maxresiduum) ? maxresiduum : residuum;
                }

                Matrix_Out[i][j] = star;
            }
        }

        // Exchange boundary rows with the neighbouring ranks, into each other's extra ghost row
        if(rank > 0) {
            MPI_Sendrecv(Matrix_Out[1], N_global, MPI_DOUBLE, rank - 1, rank ,
                         Matrix_Out[0], N_global, MPI_DOUBLE, rank - 1, rank - 1,
                         MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        }

        // Last rank can't communicate with higher ranks because there aren't any
        if(rank != nproc - 1) {
            MPI_Sendrecv(Matrix_Out[N - 1], N_global, MPI_DOUBLE, rank + 1, rank,
                         Matrix_Out[N], N_global, MPI_DOUBLE, rank + 1, rank + 1,
                         MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        }

        /* exchange m1 and m2 */
        i = m1;
        m1 = m2;
        m2 = i;

        // Reduce to the largest maxresiduum across all processes
        MPI_Allreduce(MPI_IN_PLACE, &maxresiduum, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);

        results->stat_iteration++;
        results->stat_precision = maxresiduum;

        /* check for stopping calculation, depending on termination method */
        if (options->termination == TERM_PREC)
        {
            if (maxresiduum < options->term_precision)
            {
                term_iteration = 0;
            }
        }
        else if (options->termination == TERM_ITER)
        {
            term_iteration--;
        }
    }

    results->m = m2;
}
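
The two MPI_Sendrecv calls above implement the usual halo (ghost-row) exchange: each rank swaps its first and last owned rows with its neighbours, and the paired send/receive in one call avoids deadlock. A reduced sketch of the same pattern for a 1-D decomposition; NCOLS and the six-row halo array are illustrative names, not taken from the code above:

/* Minimal halo-exchange sketch with MPI_Sendrecv (illustrative only). */
#include <mpi.h>

#define NCOLS 8   /* illustrative row length */

int main(int argc, char *argv[])
{
    int rank, nproc;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);

    /* rows 1..4 are owned, rows 0 and 5 are ghost rows */
    double halo[6][NCOLS];
    for (int i = 0; i < 6; i++)
        for (int j = 0; j < NCOLS; j++)
            halo[i][j] = rank;

    int up   = (rank > 0)         ? rank - 1 : MPI_PROC_NULL;
    int down = (rank < nproc - 1) ? rank + 1 : MPI_PROC_NULL;

    /* send my first owned row up, receive the lower neighbour's first row */
    MPI_Sendrecv(halo[1], NCOLS, MPI_DOUBLE, up,   0,
                 halo[5], NCOLS, MPI_DOUBLE, down, 0,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);

    /* send my last owned row down, receive the upper neighbour's last row */
    MPI_Sendrecv(halo[4], NCOLS, MPI_DOUBLE, down, 0,
                 halo[0], NCOLS, MPI_DOUBLE, up,   0,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);

    MPI_Finalize();
    return 0;
}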
Beispiel #20
void transpose_start_exchange_step(transpose_mpi_plan p,
				   int el_size,
				   TRANSPOSE_EL_TYPE *local_data,
				   TRANSPOSE_EL_TYPE *send_buf,
				   int step,
				   transpose_sync_type sync_type)
{
     if (p->local_nx > 0 || p->local_ny > 0) {
	  transpose_mpi_exchange *exchange = p->exchange;
	  int block = exchange[step].block_num;
	  int send_block_size = p->send_block_size;
	  int recv_block_size = p->recv_block_size;
	  
          if (exchange[step].dest_pe != p->my_pe) {

	       /* first, copy to send buffer: */
	       if (exchange[step].send_size > 0)
		    memcpy(send_buf,
			   local_data + el_size*send_block_size*block,
			   el_size * exchange[step].send_size *
			   sizeof(TRANSPOSE_EL_TYPE));

#define DO_ISEND  \
               if (exchange[step].send_size > 0) {  \
			 ISEND(send_buf, \
			       exchange[step].send_size * el_size, \
			       p->el_type, \
			       exchange[step].dest_pe, 0, \
			       p->comm, \
			       &p->request[0]); \
	       }
 
	       p->request[0] = MPI_REQUEST_NULL;
	       p->request[1] = MPI_REQUEST_NULL;

	       if (sync_type == TRANSPOSE_ASYNC) {
		    /* Note that we impose an ordering on the sends and
		       receives (lower pe sends first) so that we won't
		       have deadlock if Isend & Irecv are blocking in some
		       MPI implementation: */
	  
		    if (p->my_pe < exchange[step].dest_pe)
			 DO_ISEND;
		    
		    if (exchange[step].recv_size > 0) {
			 MPI_Irecv(local_data + el_size*recv_block_size*block,
				   exchange[step].recv_size * el_size,
				   p->el_type,
				   exchange[step].dest_pe, MPI_ANY_TAG,
				   p->comm,
				   &p->request[1]);
		    }
	       
		    if (p->my_pe > exchange[step].dest_pe)
			 DO_ISEND;
	       }
	       else /* (sync_type == TRANSPOSE_SYNC) */ {
		    MPI_Status status;

		    MPI_Sendrecv(send_buf,
				 exchange[step].send_size * el_size,
				 p->el_type,
				 exchange[step].dest_pe, 0,

				 local_data + el_size*recv_block_size*block,
				 exchange[step].recv_size * el_size,
				 p->el_type,
				 exchange[step].dest_pe, MPI_ANY_TAG,

				 p->comm, &status);
	       }
	  }
	  else if (exchange[step].recv_size > 0 &&
		   recv_block_size != send_block_size)
	       memmove(local_data + el_size*recv_block_size*block,
		       local_data + el_size*send_block_size*block,
		       exchange[step].recv_size * el_size *
		       sizeof(TRANSPOSE_EL_TYPE));
     }
}
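
The comment in the asynchronous branch explains the ordering trick: the lower-numbered process posts its send before its receive and the higher-numbered one after, so the exchange cannot deadlock even if Isend/Irecv were to block in some implementation. A hedged two-rank sketch of that ordering (buffer names and sizes are illustrative, and it assumes exactly two processes):

/* Sketch of the "lower rank sends first" ordering used above.
 * Illustrative only; assumes exactly two processes. */
#include <mpi.h>

int main(int argc, char *argv[])
{
    int rank;
    double sendbuf[16] = {0}, recvbuf[16];
    MPI_Request req[2] = { MPI_REQUEST_NULL, MPI_REQUEST_NULL };

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    int partner = 1 - rank;   /* assumes exactly two ranks */

    if (rank < partner)       /* lower rank posts its send first */
        MPI_Isend(sendbuf, 16, MPI_DOUBLE, partner, 0, MPI_COMM_WORLD, &req[0]);

    MPI_Irecv(recvbuf, 16, MPI_DOUBLE, partner, MPI_ANY_TAG, MPI_COMM_WORLD, &req[1]);

    if (rank > partner)       /* higher rank posts its send after the receive */
        MPI_Isend(sendbuf, 16, MPI_DOUBLE, partner, 0, MPI_COMM_WORLD, &req[0]);

    MPI_Waitall(2, req, MPI_STATUSES_IGNORE);

    MPI_Finalize();
    return 0;
}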
int main( int argc, char *argv[] )
{
    int errs = 0;
    int rank, size, source, dest;
    unsigned char *buf, *bufp;
    int minsize = 2; 
    int i, msgsize, bufsize, outsize;
    unsigned char *msg1, *msg2, *msg3;
    MPI_Comm      comm;
    MPI_Status    status1, status2, status3;

    MTest_Init( &argc, &argv );

    /* The following illustrates the use of the routines to 
       run through a selection of communicators and datatypes.
       Use subsets of these for tests that do not involve combinations 
       of communicators, datatypes, and counts of datatypes */
    msgsize = 128 * 1024;
    msg1 = (unsigned char *)malloc( 3 * msgsize );
    msg2 = msg1 + msgsize;
    msg3 = msg2 + msgsize;
    while (MTestGetIntracommGeneral( &comm, minsize, 1 )) {
	if (comm == MPI_COMM_NULL) continue;
	/* Determine the sender and receiver */
	MPI_Comm_rank( comm, &rank );
	MPI_Comm_size( comm, &size );
	source = 0;
	dest   = size - 1;

	/* Here is the test:  The sender */
	if (rank == source) {
	    /* Get a bsend buffer.  Make it large enough that the Bsend
	       internals will (probably) not use an eager send for the data.
	       Have three such messages */
	    bufsize = 3 * (MPI_BSEND_OVERHEAD + msgsize);
	    buf     = (unsigned char *)malloc( bufsize );
	    if (!buf) {
		fprintf( stderr, "Unable to allocate a buffer of %d bytes\n",
			 bufsize );
		MPI_Abort( MPI_COMM_WORLD, 1 );
	    }
	    
	    MPI_Buffer_attach( buf, bufsize );

	    /* Initialize the buffers */
	    for (i=0; i<msgsize; i++) {
		msg1[i] = 0xff ^ (i & 0xff);
		msg2[i] = 0xff ^ (3*i & 0xff);
		msg3[i] = 0xff ^ (5*i & 0xff);
	    }

	    /* Initiate the bsends */
	    MPI_Bsend( msg1, msgsize, MPI_CHAR, dest, 0, comm );
	    MPI_Bsend( msg2, msgsize, MPI_CHAR, dest, 0, comm );
	    MPI_Bsend( msg3, msgsize, MPI_CHAR, dest, 0, comm );

	    /* Synchronize with our partner */
	    MPI_Sendrecv( 0, 0, MPI_CHAR, dest, 10, 
			  0, 0, MPI_CHAR, dest, 10, comm, MPI_STATUS_IGNORE );

	    /* Detach the buffers.  There should be pending operations */
	    MPI_Buffer_detach ( &bufp, &outsize );
	    if (bufp != buf) {
		fprintf( stderr, "Wrong buffer returned\n" );
		errs++;
	    }
	    if (outsize != bufsize) {
		fprintf( stderr, "Wrong buffer size returned\n" );
		errs++;
	    }
	}
	else if (rank == dest) {
	    double tstart;

	    /* Clear the message buffers */
	    for (i=0; i<msgsize; i++) {
		msg1[i] = 0;
		msg2[i] = 0;
		msg3[i] = 0;
	    }

	    /* Wait for the synchronize */
	    MPI_Sendrecv( 0, 0, MPI_CHAR, source, 10, 
			  0, 0, MPI_CHAR, source, 10, comm, MPI_STATUS_IGNORE );

	    /* Wait 2 seconds */
	    tstart = MPI_Wtime();
	    while (MPI_Wtime() - tstart < 2.0) ;

	    /* Now receive the messages */
	    MPI_Recv( msg1, msgsize, MPI_CHAR, source, 0, comm, &status1 );
	    MPI_Recv( msg2, msgsize, MPI_CHAR, source, 0, comm, &status2 );
	    MPI_Recv( msg3, msgsize, MPI_CHAR, source, 0, comm, &status3 );

	    /* Check that we have the correct data */
	    for (i=0; i<msgsize; i++) {
		if (msg1[i] != (0xff ^ (i & 0xff))) { 
		    if (errs < 10) {
			fprintf( stderr, "msg1[%d] = %d\n", i, msg1[i] );
		    }
		    errs++;
		}
		if (msg2[i] != (0xff ^ (3*i & 0xff))) {
		    if (errs < 10) {
			fprintf( stderr, "msg2[%d] = %d\n", i, msg2[i] );
		    }
		    errs++;
		}
		if (msg3[i] != (0xff ^ (5*i & 0xff))) {
		    if (errs < 10) {
			fprintf( stderr, "msg2[%d] = %d\n", i, msg2[i] );
		    }
		    errs++;
		}
	    }
	    
	}
		
	
	MTestFreeComm( &comm );
    }
    free( msg1 );

    MTest_Finalize( errs );
    MPI_Finalize();
    return 0;
}
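
The sender above sizes its attached buffer as 3 * (MPI_BSEND_OVERHEAD + msgsize) so that all three buffered sends fit, and MPI_Buffer_detach then blocks until the buffered messages are on their way. A minimal sketch of that attach/Bsend/detach life cycle for a single message (illustrative only; assumes at least two processes):

/* Minimal MPI_Bsend buffer life cycle (illustrative sketch). */
#include <mpi.h>
#include <stdlib.h>

int main(int argc, char *argv[])
{
    int rank;
    int msg[256] = {0};

    MPI_Init(&argc, &argv);                 /* assumes at least two processes */
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (rank == 0) {
        /* buffer must hold the data plus MPI_BSEND_OVERHEAD per message */
        int bufsize = (int)sizeof(msg) + MPI_BSEND_OVERHEAD;
        void *buf = malloc(bufsize);

        MPI_Buffer_attach(buf, bufsize);
        MPI_Bsend(msg, 256, MPI_INT, 1, 0, MPI_COMM_WORLD);

        /* detach blocks until the buffered message has been transmitted */
        void *oldbuf; int oldsize;
        MPI_Buffer_detach(&oldbuf, &oldsize);
        free(oldbuf);
    }
    else if (rank == 1) {
        MPI_Recv(msg, 256, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }

    MPI_Finalize();
    return 0;
}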
Beispiel #22
void bsmooth(void)
{
  long long ntot, ntotleft;
  int *noffset, *nbuffer, *nsend, *nsend_local, *ndonelist, *numlist;
  int i, j, n;
  int ndone;
  int maxfill, source;
  int level, ngrp, sendTask, recvTask;
  int place, nexport;
  double tstart, tend;
  double maxt, sumt, sumcomm;
  double timecomp = 0, timeimbalance = 0, timecommsumm = 0;
  double timengb, sumtimengb;
  MPI_Status status;

  int Smooth_Flag = 0;
  double dB;

  if(ThisTask == 0)
    printf("Flag_FullStep = %d, Main TimestepCounts = %d\n", Flag_FullStep, All.MainTimestepCounts);
  if(Flag_FullStep == 1)
    {
      if((All.MainTimestepCounts % All.BSmoothInt == 0) && (All.BSmoothInt >= 0))
	{
	  Smooth_Flag = 1;
	  if(ThisTask == 0)
	    printf("Smoothing B %d, %f\n", All.BSmoothInt, All.BSmoothFrac);
	}
      All.MainTimestepCounts++;
    }

  noffset = mymalloc(sizeof(int) * NTask);	/* offsets of bunches in common list */
  nbuffer = mymalloc(sizeof(int) * NTask);
  nsend_local = mymalloc(sizeof(int) * NTask);
  nsend = mymalloc(sizeof(int) * NTask * NTask);
  ndonelist = mymalloc(sizeof(int) * NTask);

  for(n = 0, NumSphUpdate = 0; n < N_gas; n++)
    {
#ifdef SFR
      if(P[n].Type == 0)
#endif
	{
	  SphP[n].Left = SphP[n].Right = 0;

	  if(P[n].Ti_endstep == All.Ti_Current)
	    NumSphUpdate++;
	}
    }

  numlist = mymalloc(NTask * sizeof(int) * NTask);
  MPI_Allgather(&NumSphUpdate, 1, MPI_INT, numlist, 1, MPI_INT, MPI_COMM_WORLD);
  for(i = 0, ntot = 0; i < NTask; i++)
    ntot += numlist[i];
  myfree(numlist);

  i = 0;			/* begin with this index */
  ntotleft = ntot;		/* particles left for all tasks together */

  while(ntotleft > 0)
    {
      for(j = 0; j < NTask; j++)
	nsend_local[j] = 0;

      /* do local particles and prepare export list */
      tstart = second();
      for(nexport = 0, ndone = 0; i < N_gas && nexport < All.BunchSizeDensity - NTask; i++)
#ifdef SFR
	if(P[i].Type == 0)
#endif
	  if(P[i].Ti_endstep == All.Ti_Current)
	    {
	      ndone++;

	      for(j = 0; j < NTask; j++)
		Exportflag[j] = 0;

	      bsmooth_evaluate(i, 0);

	      for(j = 0; j < NTask; j++)
		{
		  if(Exportflag[j])
		    {
		      DensDataIn[nexport].Pos[0] = P[i].Pos[0];
		      DensDataIn[nexport].Pos[1] = P[i].Pos[1];
		      DensDataIn[nexport].Pos[2] = P[i].Pos[2];
		      DensDataIn[nexport].Hsml = PPP[i].Hsml;
		      DensDataIn[nexport].Index = i;
		      DensDataIn[nexport].Task = j;
		      nexport++;
		      nsend_local[j]++;
		    }
		}
	    }
      tend = second();
      timecomp += timediff(tstart, tend);

      qsort(DensDataIn, nexport, sizeof(struct densdata_in), dens_compare_key);

      for(j = 1, noffset[0] = 0; j < NTask; j++)
	noffset[j] = noffset[j - 1] + nsend_local[j - 1];

      tstart = second();

      MPI_Allgather(nsend_local, NTask, MPI_INT, nsend, NTask, MPI_INT, MPI_COMM_WORLD);

      tend = second();
      timeimbalance += timediff(tstart, tend);


      /* now do the particles that need to be exported */

      for(level = 1; level < (1 << PTask); level++)
	{
	  tstart = second();
	  for(j = 0; j < NTask; j++)
	    nbuffer[j] = 0;
	  for(ngrp = level; ngrp < (1 << PTask); ngrp++)
	    {
	      maxfill = 0;
	      for(j = 0; j < NTask; j++)
		{
		  if((j ^ ngrp) < NTask)
		    if(maxfill < nbuffer[j] + nsend[(j ^ ngrp) * NTask + j])
		      maxfill = nbuffer[j] + nsend[(j ^ ngrp) * NTask + j];
		}
	      if(maxfill >= All.BunchSizeForce)
		break;

	      sendTask = ThisTask;
	      recvTask = ThisTask ^ ngrp;

	      if(recvTask < NTask)
		{
		  if(nsend[ThisTask * NTask + recvTask] > 0 || nsend[recvTask * NTask + ThisTask] > 0)
		    {
		      /* get the particles */
		      MPI_Sendrecv(&DensDataIn[noffset[recvTask]],
				   nsend_local[recvTask] * sizeof(struct densdata_in), MPI_BYTE,
				   recvTask, TAG_BSMTH_A,
				   &DensDataGet[nbuffer[ThisTask]],
				   nsend[recvTask * NTask + ThisTask] * sizeof(struct densdata_in),
				   MPI_BYTE, recvTask, TAG_BSMTH_A, MPI_COMM_WORLD, &status);
		    }
		}

	      for(j = 0; j < NTask; j++)
		if((j ^ ngrp) < NTask)
		  nbuffer[j] += nsend[(j ^ ngrp) * NTask + j];
	    }
	  tend = second();
	  timecommsumm += timediff(tstart, tend);


	  tstart = second();
	  for(j = 0; j < nbuffer[ThisTask]; j++)
	    {
	      bsmooth_evaluate(j, 1);
	    }
	  tend = second();
	  timecomp += timediff(tstart, tend);

	  /* do a block to measure imbalance */
	  tstart = second();
	  MPI_Barrier(MPI_COMM_WORLD);
	  tend = second();
	  timeimbalance += timediff(tstart, tend);

	  /* get the result */
	  tstart = second();
	  for(j = 0; j < NTask; j++)
	    nbuffer[j] = 0;
	  for(ngrp = level; ngrp < (1 << PTask); ngrp++)
	    {
	      maxfill = 0;
	      for(j = 0; j < NTask; j++)
		{
		  if((j ^ ngrp) < NTask)
		    if(maxfill < nbuffer[j] + nsend[(j ^ ngrp) * NTask + j])
		      maxfill = nbuffer[j] + nsend[(j ^ ngrp) * NTask + j];
		}
	      if(maxfill >= All.BunchSizeForce)
		break;

	      sendTask = ThisTask;
	      recvTask = ThisTask ^ ngrp;

	      if(recvTask < NTask)
		{
		  if(nsend[ThisTask * NTask + recvTask] > 0 || nsend[recvTask * NTask + ThisTask] > 0)
		    {
		      /* send the results */
		      MPI_Sendrecv(&DensDataResult[nbuffer[ThisTask]],
				   nsend[recvTask * NTask + ThisTask] * sizeof(struct densdata_out),
				   MPI_BYTE, recvTask, TAG_BSMTH_B,
				   &DensDataPartialResult[noffset[recvTask]],
				   nsend_local[recvTask] * sizeof(struct densdata_out),
				   MPI_BYTE, recvTask, TAG_BSMTH_B, MPI_COMM_WORLD, &status);

		      /* add the result to the particles */
		      for(j = 0; j < nsend_local[recvTask]; j++)
			{
			  source = j + noffset[recvTask];
			  place = DensDataIn[source].Index;

			  SphP[place].BSmooth[0] += DensDataPartialResult[source].BSmooth[0];
			  SphP[place].BSmooth[1] += DensDataPartialResult[source].BSmooth[1];
			  SphP[place].BSmooth[2] += DensDataPartialResult[source].BSmooth[2];
			  SphP[place].DensityNorm += DensDataPartialResult[source].DensityNorm;
			}
		    }
		}

	      for(j = 0; j < NTask; j++)
		if((j ^ ngrp) < NTask)
		  nbuffer[j] += nsend[(j ^ ngrp) * NTask + j];
	    }
	  tend = second();
	  timecommsumm += timediff(tstart, tend);

	  level = ngrp - 1;
	}

      MPI_Allgather(&ndone, 1, MPI_INT, ndonelist, 1, MPI_INT, MPI_COMM_WORLD);
      for(j = 0; j < NTask; j++)
	ntotleft -= ndonelist[j];
    }



  /* do final operations on results */
  tstart = second();
  for(i = 0; i < N_gas; i++)
    {
#ifdef SFR
      if(P[i].Type == 0)
#endif
	if(P[i].Ti_endstep == All.Ti_Current)
	  {
	    SphP[i].BSmooth[0] /= SphP[i].DensityNorm;
	    SphP[i].BSmooth[1] /= SphP[i].DensityNorm;
	    SphP[i].BSmooth[2] /= SphP[i].DensityNorm;
	    if(Smooth_Flag == 1)
	      {
		dB = All.BSmoothFrac * (SphP[i].BSmooth[0] - SphP[i].BPred[0]);
		SphP[i].BPred[0] += dB;
		SphP[i].B[0] += dB;
		dB = All.BSmoothFrac * (SphP[i].BSmooth[1] - SphP[i].BPred[1]);
		SphP[i].BPred[1] += dB;
		SphP[i].B[1] += dB;
		dB = All.BSmoothFrac * (SphP[i].BSmooth[2] - SphP[i].BPred[2]);
		SphP[i].BPred[2] += dB;
		SphP[i].B[2] += dB;
	      }
	  }
    }
  tend = second();
  timecomp += timediff(tstart, tend);

  myfree(ndonelist);
  myfree(nsend);
  myfree(nsend_local);
  myfree(nbuffer);
  myfree(noffset);


  /* collect some timing information */

  timengb = 0;

  MPI_Reduce(&timengb, &sumtimengb, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
  MPI_Reduce(&timecomp, &sumt, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
  MPI_Reduce(&timecommsumm, &sumcomm, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

  timeimbalance += timecomp + timecommsumm;
  MPI_Reduce(&timeimbalance, &maxt, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

  if(ThisTask == 0)
    {
      All.CPU_HydCompWalk += sumt / NTask;
      All.CPU_HydCommSumm += sumcomm / NTask;
      All.CPU_HydImbalance += maxt - (sumt + sumcomm) / NTask;
      All.CPU_EnsureNgb += sumtimengb / NTask;
    }
}
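
The communication loops above pair tasks via recvTask = ThisTask ^ ngrp, a hypercube-style schedule in which every rank meets every other rank exactly once per sweep (2^PTask being the smallest power of two not below NTask). A stripped-down sketch of just that pairing, without any of the particle bookkeeping (variable names are illustrative):

/* Sketch of the XOR (hypercube) pairing used above: every rank meets
 * every other rank exactly once.  Illustrative only. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char *argv[])
{
    int rank, size, ptask = 0;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* smallest power of two >= size, analogous to PTask above */
    while ((1 << ptask) < size)
        ptask++;

    for (int ngrp = 1; ngrp < (1 << ptask); ngrp++) {
        int partner = rank ^ ngrp;
        if (partner < size) {
            int sendval = rank, recvval = -1;
            MPI_Sendrecv(&sendval, 1, MPI_INT, partner, 0,
                         &recvval, 1, MPI_INT, partner, 0,
                         MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            printf("step %d: rank %d exchanged with rank %d\n",
                   ngrp, rank, recvval);
        }
    }

    MPI_Finalize();
    return 0;
}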
Beispiel #23
/*! Calculates the long-range potential using the PM method.  The potential is
 *  Gaussian filtered with Asmth, given in mesh-cell units. We carry out a CIC
 *  charge assignment, and compute the potential by Fourier transform
 *  methods. The CIC kernel is deconvolved.
 */
void pmpotential_periodic(void)
{
  double k2, kx, ky, kz, smth;
  double dx, dy, dz;
  double fx, fy, fz, ff;
  double asmth2, fac;
  int i, j, slab, level, sendTask, recvTask;
  int x, y, z, ip;
  int slab_x, slab_y, slab_z;
  int slab_xx, slab_yy, slab_zz;
  int meshmin[3], meshmax[3], sendmin, sendmax, recvmin, recvmax;
  int rep, ncont, cont_sendmin[2], cont_sendmax[2], cont_recvmin[2], cont_recvmax[2];
  int dimx, dimy, dimz, recv_dimx, recv_dimy, recv_dimz;
  MPI_Status status;

  if(ThisTask == 0)
    {
      printf("Starting periodic PM calculation.\n");
      fflush(stdout);
    }

  asmth2 = (2 * M_PI) * All.Asmth[0] / All.BoxSize;
  asmth2 *= asmth2;

  fac = All.G / (M_PI * All.BoxSize);	/* to get potential */

  force_treefree();

  /* first, establish the extension of the local patch in the PMGRID  */

  for(j = 0; j < 3; j++)
    {
      meshmin[j] = PMGRID;
      meshmax[j] = 0;
    }

  for(i = 0; i < NumPart; i++)
    {
      for(j = 0; j < 3; j++)
	{
	  slab = to_slab_fac * P[i].Pos[j];
	  if(slab >= PMGRID)
	    slab = PMGRID - 1;

	  if(slab < meshmin[j])
	    meshmin[j] = slab;

	  if(slab > meshmax[j])
	    meshmax[j] = slab;
	}
    }

  MPI_Allgather(meshmin, 3, MPI_INT, meshmin_list, 3, MPI_INT, MPI_COMM_WORLD);
  MPI_Allgather(meshmax, 3, MPI_INT, meshmax_list, 3, MPI_INT, MPI_COMM_WORLD);

  dimx = meshmax[0] - meshmin[0] + 2;
  dimy = meshmax[1] - meshmin[1] + 2;
  dimz = meshmax[2] - meshmin[2] + 2;

  pm_init_periodic_allocate((dimx + 4) * (dimy + 4) * (dimz + 4));

  #ifdef FFTW3
  /* Create plan for in-place r2c DFT */
  fft_forward_plan = fftw_mpi_plan_dft_r2c_3d(PMGRID, PMGRID, PMGRID, rhogrid, fft_of_rhogrid,
  												MPI_COMM_WORLD, FFTW_ESTIMATE | FFTW_MPI_TRANSPOSED_OUT);
  fft_inverse_plan = fftw_mpi_plan_dft_c2r_3d(PMGRID, PMGRID, PMGRID, fft_of_rhogrid, rhogrid,
  												MPI_COMM_WORLD, FFTW_ESTIMATE | FFTW_MPI_TRANSPOSED_IN);
  #endif

  for(i = 0; i < dimx * dimy * dimz; i++)
    workspace[i] = 0;

  for(i = 0; i < NumPart; i++)
    {
      slab_x = to_slab_fac * P[i].Pos[0];
      if(slab_x >= PMGRID)
	slab_x = PMGRID - 1;
      dx = to_slab_fac * P[i].Pos[0] - slab_x;
      slab_x -= meshmin[0];
      slab_xx = slab_x + 1;

      slab_y = to_slab_fac * P[i].Pos[1];
      if(slab_y >= PMGRID)
	slab_y = PMGRID - 1;
      dy = to_slab_fac * P[i].Pos[1] - slab_y;
      slab_y -= meshmin[1];
      slab_yy = slab_y + 1;

      slab_z = to_slab_fac * P[i].Pos[2];
      if(slab_z >= PMGRID)
	slab_z = PMGRID - 1;
      dz = to_slab_fac * P[i].Pos[2] - slab_z;
      slab_z -= meshmin[2];
      slab_zz = slab_z + 1;

      workspace[(slab_x * dimy + slab_y) * dimz + slab_z] += P[i].Mass * (1.0 - dx) * (1.0 - dy) * (1.0 - dz);
      workspace[(slab_x * dimy + slab_yy) * dimz + slab_z] += P[i].Mass * (1.0 - dx) * dy * (1.0 - dz);
      workspace[(slab_x * dimy + slab_y) * dimz + slab_zz] += P[i].Mass * (1.0 - dx) * (1.0 - dy) * dz;
      workspace[(slab_x * dimy + slab_yy) * dimz + slab_zz] += P[i].Mass * (1.0 - dx) * dy * dz;

      workspace[(slab_xx * dimy + slab_y) * dimz + slab_z] += P[i].Mass * (dx) * (1.0 - dy) * (1.0 - dz);
      workspace[(slab_xx * dimy + slab_yy) * dimz + slab_z] += P[i].Mass * (dx) * dy * (1.0 - dz);
      workspace[(slab_xx * dimy + slab_y) * dimz + slab_zz] += P[i].Mass * (dx) * (1.0 - dy) * dz;
      workspace[(slab_xx * dimy + slab_yy) * dimz + slab_zz] += P[i].Mass * (dx) * dy * dz;
    }


  for(i = 0; i < fftsize; i++)	/* clear local density field */
    rhogrid[i] = 0;

  for(level = 0; level < (1 << PTask); level++)	/* note: for level=0, target is the same task */
    {
      sendTask = ThisTask;
      recvTask = ThisTask ^ level;
      if(recvTask < NTask)
	{
	  /* check how much we have to send */
	  sendmin = 2 * PMGRID;
	  sendmax = -1;
	  for(slab_x = meshmin[0]; slab_x < meshmax[0] + 2; slab_x++)
	    if(slab_to_task[slab_x % PMGRID] == recvTask)
	      {
		if(slab_x < sendmin)
		  sendmin = slab_x;
		if(slab_x > sendmax)
		  sendmax = slab_x;
	      }
	  if(sendmax == -1)
	    sendmin = 0;

	  /* check how much we have to receive */
	  recvmin = 2 * PMGRID;
	  recvmax = -1;
	  for(slab_x = meshmin_list[3 * recvTask]; slab_x < meshmax_list[3 * recvTask] + 2; slab_x++)
	    if(slab_to_task[slab_x % PMGRID] == sendTask)
	      {
		if(slab_x < recvmin)
		  recvmin = slab_x;
		if(slab_x > recvmax)
		  recvmax = slab_x;
	      }
	  if(recvmax == -1)
	    recvmin = 0;


	  if((recvmax - recvmin) >= 0 || (sendmax - sendmin) >= 0)	/* ok, we have a contribution to the slab */
	    {
	      recv_dimx = meshmax_list[3 * recvTask + 0] - meshmin_list[3 * recvTask + 0] + 2;
	      recv_dimy = meshmax_list[3 * recvTask + 1] - meshmin_list[3 * recvTask + 1] + 2;
	      recv_dimz = meshmax_list[3 * recvTask + 2] - meshmin_list[3 * recvTask + 2] + 2;

	      if(level > 0)
		{
		  MPI_Sendrecv(workspace + (sendmin - meshmin[0]) * dimy * dimz,
			       (sendmax - sendmin + 1) * dimy * dimz * sizeof(fftw_real), MPI_BYTE, recvTask,
			       TAG_PERIODIC_C, forcegrid,
			       (recvmax - recvmin + 1) * recv_dimy * recv_dimz * sizeof(fftw_real), MPI_BYTE,
			       recvTask, TAG_PERIODIC_C, MPI_COMM_WORLD, &status);
		}
	      else
		{
		  memcpy(forcegrid, workspace + (sendmin - meshmin[0]) * dimy * dimz,
			 (sendmax - sendmin + 1) * dimy * dimz * sizeof(fftw_real));
		}

	      for(slab_x = recvmin; slab_x <= recvmax; slab_x++)
		{
		  slab_xx = (slab_x % PMGRID) - first_slab_of_task[ThisTask];

		  if(slab_xx >= 0 && slab_xx < slabs_per_task[ThisTask])
		    {
		      for(slab_y = meshmin_list[3 * recvTask + 1];
			  slab_y <= meshmax_list[3 * recvTask + 1] + 1; slab_y++)
			{
			  slab_yy = slab_y;
			  if(slab_yy >= PMGRID)
			    slab_yy -= PMGRID;

			  for(slab_z = meshmin_list[3 * recvTask + 2];
			      slab_z <= meshmax_list[3 * recvTask + 2] + 1; slab_z++)
			    {
			      slab_zz = slab_z;
			      if(slab_zz >= PMGRID)
				slab_zz -= PMGRID;

			      rhogrid[PMGRID * PMGRID2 * slab_xx + PMGRID2 * slab_yy + slab_zz] +=
				forcegrid[((slab_x - recvmin) * recv_dimy +
					   (slab_y - meshmin_list[3 * recvTask + 1])) * recv_dimz +
					  (slab_z - meshmin_list[3 * recvTask + 2])];
			    }
			}
		    }
		}
	    }
	}
    }



  /* Do the FFT of the density field */
  #ifdef FFTW3
  fftw_execute(fft_forward_plan);
  #else
  rfftwnd_mpi(fft_forward_plan, 1, rhogrid, workspace, FFTW_TRANSPOSED_ORDER);
  #endif
  
  /* multiply with Green's function for the potential */

  for(y = slabstart_y; y < slabstart_y + nslab_y; y++)
    for(x = 0; x < PMGRID; x++)
      for(z = 0; z < PMGRID / 2 + 1; z++)
	{
	  if(x > PMGRID / 2)
	    kx = x - PMGRID;
	  else
	    kx = x;
	  if(y > PMGRID / 2)
	    ky = y - PMGRID;
	  else
	    ky = y;
	  if(z > PMGRID / 2)
	    kz = z - PMGRID;
	  else
	    kz = z;

	  k2 = kx * kx + ky * ky + kz * kz;

	  if(k2 > 0)
	    {
	      smth = -exp(-k2 * asmth2) / k2 * fac;
	      /* do deconvolution */
	      fx = fy = fz = 1;
	      if(kx != 0)
		{
		  fx = (M_PI * kx) / PMGRID;
		  fx = sin(fx) / fx;
		}
	      if(ky != 0)
		{
		  fy = (M_PI * ky) / PMGRID;
		  fy = sin(fy) / fy;
		}
	      if(kz != 0)
		{
		  fz = (M_PI * kz) / PMGRID;
		  fz = sin(fz) / fz;
		}
	      ff = 1 / (fx * fy * fz);
	      smth *= ff * ff * ff * ff;
	      /* end deconvolution */

	      ip = PMGRID * (PMGRID / 2 + 1) * (y - slabstart_y) + (PMGRID / 2 + 1) * x + z;
	      c_re(fft_of_rhogrid[ip]) *= smth;
	      c_im(fft_of_rhogrid[ip]) *= smth;
	    }
	}

  if(slabstart_y == 0)
    c_re(fft_of_rhogrid[0]) = c_im(fft_of_rhogrid[0]) = 0.0;

  /* Do the FFT to get the potential */
  #ifdef FFTW3
  fftw_execute(fft_inverse_plan);
  
  /* Now normalize the output */
  for(i = 0; i < fftsize; i++)
  	rhogrid[i] = rhogrid[i] / (PMGRID*PMGRID*PMGRID);
  	
  #else
  rfftwnd_mpi(fft_inverse_plan, 1, rhogrid, workspace, FFTW_TRANSPOSED_ORDER);
  #endif
  
  /* note: "rhogrid" now contains the potential */



  dimx = meshmax[0] - meshmin[0] + 6;
  dimy = meshmax[1] - meshmin[1] + 6;
  dimz = meshmax[2] - meshmin[2] + 6;

  for(level = 0; level < (1 << PTask); level++)	/* note: for level=0, target is the same task */
    {
      sendTask = ThisTask;
      recvTask = ThisTask ^ level;

      if(recvTask < NTask)
	{

	  /* check how much we have to send */
	  sendmin = 2 * PMGRID;
	  sendmax = -PMGRID;
	  for(slab_x = meshmin_list[3 * recvTask] - 2; slab_x < meshmax_list[3 * recvTask] + 4; slab_x++)
	    if(slab_to_task[(slab_x + PMGRID) % PMGRID] == sendTask)
	      {
		if(slab_x < sendmin)
		  sendmin = slab_x;
		if(slab_x > sendmax)
		  sendmax = slab_x;
	      }
	  if(sendmax == -PMGRID)
	    sendmin = sendmax + 1;


	  /* check how much we have to receive */
	  recvmin = 2 * PMGRID;
	  recvmax = -PMGRID;
	  for(slab_x = meshmin[0] - 2; slab_x < meshmax[0] + 4; slab_x++)
	    if(slab_to_task[(slab_x + PMGRID) % PMGRID] == recvTask)
	      {
		if(slab_x < recvmin)
		  recvmin = slab_x;
		if(slab_x > recvmax)
		  recvmax = slab_x;
	      }
	  if(recvmax == -PMGRID)
	    recvmin = recvmax + 1;

	  if((recvmax - recvmin) >= 0 || (sendmax - sendmin) >= 0)	/* ok, we have a contribution to the slab */
	    {
	      recv_dimx = meshmax_list[3 * recvTask + 0] - meshmin_list[3 * recvTask + 0] + 6;
	      recv_dimy = meshmax_list[3 * recvTask + 1] - meshmin_list[3 * recvTask + 1] + 6;
	      recv_dimz = meshmax_list[3 * recvTask + 2] - meshmin_list[3 * recvTask + 2] + 6;

	      ncont = 1;
	      cont_sendmin[0] = sendmin;
	      cont_sendmax[0] = sendmax;
	      cont_sendmin[1] = sendmax + 1;
	      cont_sendmax[1] = sendmax;

	      cont_recvmin[0] = recvmin;
	      cont_recvmax[0] = recvmax;
	      cont_recvmin[1] = recvmax + 1;
	      cont_recvmax[1] = recvmax;

	      for(slab_x = sendmin; slab_x <= sendmax; slab_x++)
		{
		  if(slab_to_task[(slab_x + PMGRID) % PMGRID] != ThisTask)
		    {
		      /* non-contiguous */
		      cont_sendmax[0] = slab_x - 1;
		      while(slab_to_task[(slab_x + PMGRID) % PMGRID] != ThisTask)
			slab_x++;
		      cont_sendmin[1] = slab_x;
		      ncont++;
		    }
		}

	      for(slab_x = recvmin; slab_x <= recvmax; slab_x++)
		{
		  if(slab_to_task[(slab_x + PMGRID) % PMGRID] != recvTask)
		    {
		      /* non-contiguous */
		      cont_recvmax[0] = slab_x - 1;
		      while(slab_to_task[(slab_x + PMGRID) % PMGRID] != recvTask)
			slab_x++;
		      cont_recvmin[1] = slab_x;
		      if(ncont == 1)
			ncont++;
		    }
		}


	      for(rep = 0; rep < ncont; rep++)
		{
		  sendmin = cont_sendmin[rep];
		  sendmax = cont_sendmax[rep];
		  recvmin = cont_recvmin[rep];
		  recvmax = cont_recvmax[rep];

		  /* prepare what we want to send */
		  if(sendmax - sendmin >= 0)
		    {
		      for(slab_x = sendmin; slab_x <= sendmax; slab_x++)
			{
			  slab_xx = ((slab_x + PMGRID) % PMGRID) - first_slab_of_task[ThisTask];

			  for(slab_y = meshmin_list[3 * recvTask + 1] - 2;
			      slab_y < meshmax_list[3 * recvTask + 1] + 4; slab_y++)
			    {
			      slab_yy = (slab_y + PMGRID) % PMGRID;

			      for(slab_z = meshmin_list[3 * recvTask + 2] - 2;
				  slab_z < meshmax_list[3 * recvTask + 2] + 4; slab_z++)
				{
				  slab_zz = (slab_z + PMGRID) % PMGRID;

				  forcegrid[((slab_x - sendmin) * recv_dimy +
					     (slab_y - (meshmin_list[3 * recvTask + 1] - 2))) * recv_dimz +
					    slab_z - (meshmin_list[3 * recvTask + 2] - 2)] =
				    rhogrid[PMGRID * PMGRID2 * slab_xx + PMGRID2 * slab_yy + slab_zz];
				}
			    }
			}
		    }

		  if(level > 0)
		    {
		      MPI_Sendrecv(forcegrid,
				   (sendmax - sendmin + 1) * recv_dimy * recv_dimz * sizeof(fftw_real),
				   MPI_BYTE, recvTask, TAG_PERIODIC_D,
				   workspace + (recvmin - (meshmin[0] - 2)) * dimy * dimz,
				   (recvmax - recvmin + 1) * dimy * dimz * sizeof(fftw_real), MPI_BYTE,
				   recvTask, TAG_PERIODIC_D, MPI_COMM_WORLD, &status);
		    }
		  else
		    {
		      memcpy(workspace + (recvmin - (meshmin[0] - 2)) * dimy * dimz,
			     forcegrid, (recvmax - recvmin + 1) * dimy * dimz * sizeof(fftw_real));
		    }
		}
	    }
	}
    }


  dimx = meshmax[0] - meshmin[0] + 2;
  dimy = meshmax[1] - meshmin[1] + 2;
  dimz = meshmax[2] - meshmin[2] + 2;

  recv_dimx = meshmax[0] - meshmin[0] + 6;
  recv_dimy = meshmax[1] - meshmin[1] + 6;
  recv_dimz = meshmax[2] - meshmin[2] + 6;



  for(x = 0; x < meshmax[0] - meshmin[0] + 2; x++)
    for(y = 0; y < meshmax[1] - meshmin[1] + 2; y++)
      for(z = 0; z < meshmax[2] - meshmin[2] + 2; z++)
	{
	  forcegrid[(x * dimy + y) * dimz + z] =
	    workspace[((x + 2) * recv_dimy + (y + 2)) * recv_dimz + (z + 2)];
	}


  /* read out the potential */

  for(i = 0; i < NumPart; i++)
    {
      slab_x = to_slab_fac * P[i].Pos[0];
      if(slab_x >= PMGRID)
	slab_x = PMGRID - 1;
      dx = to_slab_fac * P[i].Pos[0] - slab_x;
      slab_x -= meshmin[0];
      slab_xx = slab_x + 1;

      slab_y = to_slab_fac * P[i].Pos[1];
      if(slab_y >= PMGRID)
	slab_y = PMGRID - 1;
      dy = to_slab_fac * P[i].Pos[1] - slab_y;
      slab_y -= meshmin[1];
      slab_yy = slab_y + 1;

      slab_z = to_slab_fac * P[i].Pos[2];
      if(slab_z >= PMGRID)
	slab_z = PMGRID - 1;
      dz = to_slab_fac * P[i].Pos[2] - slab_z;
      slab_z -= meshmin[2];
      slab_zz = slab_z + 1;

      P[i].Potential +=
	forcegrid[(slab_x * dimy + slab_y) * dimz + slab_z] * (1.0 - dx) * (1.0 - dy) * (1.0 - dz);
      P[i].Potential += forcegrid[(slab_x * dimy + slab_yy) * dimz + slab_z] * (1.0 - dx) * dy * (1.0 - dz);
      P[i].Potential += forcegrid[(slab_x * dimy + slab_y) * dimz + slab_zz] * (1.0 - dx) * (1.0 - dy) * dz;
      P[i].Potential += forcegrid[(slab_x * dimy + slab_yy) * dimz + slab_zz] * (1.0 - dx) * dy * dz;

      P[i].Potential += forcegrid[(slab_xx * dimy + slab_y) * dimz + slab_z] * (dx) * (1.0 - dy) * (1.0 - dz);
      P[i].Potential += forcegrid[(slab_xx * dimy + slab_yy) * dimz + slab_z] * (dx) * dy * (1.0 - dz);
      P[i].Potential += forcegrid[(slab_xx * dimy + slab_y) * dimz + slab_zz] * (dx) * (1.0 - dy) * dz;
      P[i].Potential += forcegrid[(slab_xx * dimy + slab_yy) * dimz + slab_zz] * (dx) * dy * dz;
    }

  pm_init_periodic_free();
  force_treeallocate(All.TreeAllocFactor * All.MaxPart, All.MaxPart);

  All.NumForcesSinceLastDomainDecomp = 1 + All.TotNumPart * All.TreeDomainUpdateFrequency;

  if(ThisTask == 0)
    {
      printf("done PM-Potential.\n");
      fflush(stdout);
    }
}
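
The eight workspace updates per particle above are the tensor product of the 1-D cloud-in-cell weights (1 - dx) and dx in each direction. A tiny serial sketch of the 1-D case (no MPI; ngrid and to_grid_fac are illustrative stand-ins for PMGRID and to_slab_fac):

/* 1-D cloud-in-cell (CIC) assignment sketch: a particle at position x
 * deposits its mass onto the two nearest grid points with weights
 * (1 - dx) and dx.  Illustrative only. */
#include <stdio.h>

int main(void)
{
    const int    ngrid       = 8;
    const double box         = 1.0;
    const double to_grid_fac = ngrid / box;

    double grid[8 + 1] = {0};   /* one extra cell for the right neighbour */
    double x = 0.37, mass = 1.0;

    int    slab = (int)(to_grid_fac * x);     /* left grid point   */
    double dx   = to_grid_fac * x - slab;     /* fractional offset */

    grid[slab]     += mass * (1.0 - dx);
    grid[slab + 1] += mass * dx;

    printf("cell %d gets %.3f, cell %d gets %.3f\n",
           slab, mass * (1.0 - dx), slab + 1, mass * dx);
    return 0;
}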
Beispiel #24
void MODEL::Boundary()
{
  char text[200];

  // initializations ---------------------------------------------------------------------

  if( !boundList )
  {
    boundList = new ELEM* [region->Getnp()];
    if( !boundList )
      REPORT::rpt.Error( kMemoryFault, "can not allocate memory - MODEL::Boundary(1)" );
  }

  for( int n=0; n<region->Getnp(); n++ )
  {
    NODE* nd = region->Getnode(n);

    CF( nd->flag, NODE::kBound );

    nd->mark = false;


    // remove slip boundary conditions ---------------------------------------------------

    if( isFS(nd->bc.kind, BCON::kAutoSlip) )
    {
      CF( nd->bc.kind, BCON::kAutoSlip | BCON::kFixU | BCON::kFixV );
    }

    if( isFS(nd->bc.kind, BCON::kAutoKD) )
    {
      CF( nd->bc.kind, BCON::kAutoKD | BCON::kFixK | BCON::kFixD );
    }
  }


  // determine boundary midside nodes ----------------------------------------------------

  for( int n=0; n<region->Getnp(); n++ )
  {
    NODE* nd = region->Getnode(n);

    // midside nodes which are connected to only one element are boundary nodes ----------

    if( nd->noel == 1  &&  isFS(nd->flag, NODE::kMidsNode) )
    {
      if( !isFS(nd->flag, NODE::kInface) )  SF( nd->flag, NODE::kBound );
    }
  }


  // determine number of boundary elements: nb -------------------------------------------

  int nb = 0;

  for( int n=0; n<region->Getnp(); n++ )
  {
    if( isFS(region->Getnode(n)->flag,NODE::kBound) )  nb++;
  }


  // allocate memory for boundary elements -----------------------------------------------

  bound->Free();
  bound->Alloc( 0, nb );


  // set up boundary elements ------------------------------------------------------------

  int be = 0;       // counter for boundary elements

  for( int re=0; re<region->Getne(); re++ )
  {
    ELEM* el = region->Getelem(re);

    if( isFS(el->flag, ELEM::kDry) )  continue;

    int ncn = el->Getncn();
    int nnd = el->Getnnd();

    for( int i=ncn; i<nnd; i++ )
    {
      // check, if el->nd[i] is a midside boundary node ----------------------------------

      if( isFS(el->nd[i]->flag, NODE::kBound) )
      {
        ELEM* bd = bound->Getelem(be);

        boundList[el->nd[i]->Getno()] = bd;

        int left = i - ncn;
        int rght = (left + 1) % ncn;

        bd->nd[0] = el->nd[left];           // corner nodes
        bd->nd[1] = el->nd[rght];
        bd->nd[2] = el->nd[i];              // midside node

        SF( bd->nd[0]->flag, NODE::kBound );
        SF( bd->nd[1]->flag, NODE::kBound );


        // set shape specifications ------------------------------------------------------

        bd->Setshape( kLine );
        bd->Setname( el->Getname() );

        SF( bd->flag, ELEM::kBound );

        bd->type     = el->type;
        bd->areaFact = 1.0;

        be++;
      }
    }
  }


  ////////////////////////////////////////////////////////////////////////////////////////
  // communicate boundary nodes

//# ifdef _MPI_DBG
//  REPORT::rpt.Output( " (MODEL::Boundary)       communication of boundary nodes", 1 );
//# endif

# ifdef _MPI_
  if( subdom->npr > 1 )
  {
    INFACE* inface = subdom->inface;

    // loop on all interfaces: exchange bound flag ---------------------------------------
    for( int s=0; s<subdom->npr; s++ )
    {
      MPI_Status status;

      int npinf = inface[s].np;

      if( npinf > 0 )
      {
        for( int n=0; n<npinf; n++ )
        {
          NODE* nd = inface[s].node[n];

          if( isFS(nd->flag, NODE::kBound) )  inface[s].sia1[n] = true;
          else                                inface[s].sia1[n] = false;
        }

        MPI_Sendrecv( inface[s].sia1, npinf, MPI_CHAR, s, 1,
                      inface[s].ria1, npinf, MPI_CHAR, s, 1,
                      MPI_COMM_WORLD, &status );

        for( int n=0; n<npinf; n++ )
        {
          NODE* nd = inface[s].node[n];
          if( inface[s].ria1[n] )  SF( nd->flag, NODE::kBound );
        }
      }
    }
  }
# endif
  ////////////////////////////////////////////////////////////////////////////////////////


  // -------------------------------------------------------------------------------------
  // count for newly required boundary conditions
  // note: (sc, 30.10.2004)
  // a boundary condition is needed for marsh-nodes in case of dry-rewet-method 3

  int nbc = 0;

  for( int n=0; n<region->Getnp(); n++ )
  {
    NODE* nd = region->Getnode(n);

    if( isFS(nd->flag, NODE::kBound) )  nbc++;
  }

  sprintf( text,"\n (MODEL::Boundary)       number of boundary elements: %d\n", nb );
  REPORT::rpt.Output( text, 3 );


# ifdef kDebug
{
  int pid;
  MPI_Comm_rank( MPI_COMM_WORLD, &pid );

  char fname[40];
  sprintf( fname, "bound_%02d.inp", pid+1 );

  FILE* id = fopen( fname, "w" );

  for( int n=0; n<region->Getnp(); n++ )
  {
    NODE* nd = region->Getnode(n);

    CF( nd->flag, NODE::kMarker );
  }


  for( int e=0; e<bound->Getne(); e++ )
  {
    ELEM* el = bound->Getelem(e);

    for( int i=0; i<el->Getnnd(); i++ )
    {
      SF( el->nd[i]->flag, NODE::kMarker );
    }
  }


  int j = 0;

  for( int n=0; n<region->Getnp(); n++ )
  {
    NODE* nd = region->Getnode(n);

    if( isFS(nd->flag, NODE::kMarker) )  j++;
  }


  fprintf( id, "%6d  %6d   0   0   0\n", j, nb );


  for( int n=0; n<region->Getnp(); n++ )
  {
    NODE* nd = region->Getnode(n);

    if( isFS(nd->flag, NODE::kMarker) )
    {
      fprintf( id, "%6d  %17.9le  %17.9le  %17.9le\n",
                   nd->Getname(), nd->x, nd->y, nd->z );
    }
  }

  for( int e=0; e<bound->Getne(); e++ )
  {
    ELEM* el = bound->Getelem(e);

    fprintf( id, "%6d  %3d  line   %6d  %6d  %6d\n",
                 el->Getname(), TYPE::getid(el->type),
                 el->nd[0]->Getname(), el->nd[1]->Getname(), el->nd[2]->Getname() );
  }

  fclose( id );
}
# endif
}
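
The _MPI_ block above marks a node as a boundary node as soon as any neighbouring subdomain flags its copy, by exchanging one char per interface node with MPI_Sendrecv and OR-ing the received flags in. A reduced sketch of that flag exchange between two subdomains (array names and sizes are illustrative; assumes exactly two processes):

/* Sketch: exchange per-node flags with a partner rank and OR them in.
 * Illustrative only; assumes exactly two processes. */
#include <mpi.h>

#define NIF 16   /* illustrative number of interface nodes */

int main(int argc, char *argv[])
{
    char sendflag[NIF], recvflag[NIF];
    int  rank;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    int partner = 1 - rank;                    /* assumes two ranks */

    for (int n = 0; n < NIF; n++)
        sendflag[n] = (char)(n % 2 == rank);   /* some local flag state */

    MPI_Sendrecv(sendflag, NIF, MPI_CHAR, partner, 1,
                 recvflag, NIF, MPI_CHAR, partner, 1,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);

    for (int n = 0; n < NIF; n++)
        if (recvflag[n])
            sendflag[n] = 1;                   /* OR in the partner's flag */

    MPI_Finalize();
    return 0;
}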
// removes the lowest-energy horizontal seam from the image
void removeHorizontalSeam() {
    double energies[3];
    double min_energy;
    int prev_x;
    int prev_y;
    
    // split up work between processes
    double *my_path_costs;
    double *my_previous_x;
    double *my_previous_y;
    double *temp_path_costs;
    double *temp_previous_x;
    double *temp_previous_y;
    int my_rows = current_height / numprocs;
    int low_rows = my_rows;
    int extra_rows = current_height % numprocs;
    int start;
    int y_offset;
    int recv_rows;

    double top_end_cost, bottom_end_cost, temp_end_cost;

    if (rank < extra_rows) {
        my_rows++;
        start = rank * my_rows;
    } else {
        start = (extra_rows * (my_rows + 1)) + ((rank - extra_rows) * my_rows);
    }
    
    my_path_costs = (double *) malloc(my_rows * current_width * sizeof(double));
    my_previous_x = (double *) malloc(my_rows * current_width * sizeof(double));
    my_previous_y = (double *) malloc(my_rows * current_width * sizeof(double));
    //find the lowest cost seam by computing the lowest cost paths to each pixel
    for (int x = 0; x < current_width; x++) {
        //compute the path costs for my rows
        for (int y = start; y < start + my_rows; y++) {

            if (x == 0) {
                path_costs[x * initial_height + y] = image_energy[x * initial_height + y];
                my_path_costs[(y - start) * current_width + x] = path_costs[x * initial_height + y];

                previous_x[x * initial_height + y] = -1;
                my_previous_x[(y - start) * current_width + x] = previous_x[x * initial_height + y];

                previous_y[x * initial_height + y] = -1;
                my_previous_y[(y - start) * current_width + x] = previous_y[x * initial_height + y];
            } else {
                //the pixel directly left
                energies[1] = path_costs[(x - 1) * initial_height + y];
                //pixel left and above
                if (y != 0) {
                    energies[0] = path_costs[(x - 1) * initial_height + y - 1];
                } else {
                    energies[0] = DBL_MAX;
                }
                //pixel left and below
                if (y != current_height - 1) {
                    energies[2] = path_costs[(x - 1) * initial_height + y + 1];
                } else {
                    energies[2] = DBL_MAX;
                }

                //find the one with the least path cost
                min_energy = energies[0];
                prev_x = x - 1;
                prev_y = y - 1;
                if (energies[1] < min_energy) {
                    min_energy = energies[1];
                    prev_y = y;
                }
                if (energies[2] < min_energy) {
                    min_energy = energies[2];
                    prev_y = y + 1;
                }

                //set the minimum path cost for this pixel
                path_costs[x * initial_height + y] = min_energy + image_energy[x * initial_height + y];
                my_path_costs[(y - start) * current_width + x] = path_costs[x * initial_height + y];

                //set the previous pixel on the minimum path's coordinates for this pixel
                previous_x[x * initial_height + y] = prev_x;
                my_previous_x[(y - start) * current_width + x] = previous_x[x * initial_height + y];

                previous_y[x * initial_height + y] = prev_y;
                my_previous_y[(y - start) * current_width + x] = previous_y[x * initial_height + y];
            }
        }

        //send path cost needed to neighboring processes
        if (numprocs > 1) {
            if (rank != numprocs - 1) {
                //send bottom most cost to following process
                bottom_end_cost = path_costs[x * initial_height + (start + my_rows - 1)];        
                MPI_Send(&bottom_end_cost, 1, MPI_DOUBLE, rank + 1, 0, MPI_COMM_WORLD);

                //receive following process's top most cost
                MPI_Recv(&temp_end_cost, 1, MPI_DOUBLE, rank + 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                path_costs[x * initial_height + (start + my_rows)] = temp_end_cost;
            }
            if (rank != 0) {
                //send top most cost to preceding process
                top_end_cost = path_costs[x * initial_height + start];
                MPI_Send(&top_end_cost, 1, MPI_DOUBLE, rank - 1, 0, MPI_COMM_WORLD);

                //receive preceding process's bottom most cost
                MPI_Recv(&temp_end_cost, 1, MPI_DOUBLE, rank - 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                path_costs[x * initial_height + (start - 1)] = temp_end_cost;
            }            
        }
    }

    //update paths costs for all processes
    for (int i = 0; i < numprocs; i++) {
        if (rank == i) {
            continue;
        }

        if (i < extra_rows) {
            y_offset = i * (low_rows + 1);
            recv_rows = low_rows + 1;
        } else {
            y_offset = (extra_rows * (low_rows + 1)) + ((i - extra_rows) * low_rows);
            recv_rows = low_rows;
        }

        //printf("%d %d\n", low_rows, extra_rows);
        //printf("%d %d %d\n", rank, y_offset, recv_rows);

        temp_path_costs = (double *) malloc(recv_rows * current_width * sizeof(double));
        temp_previous_x = (double *) malloc(recv_rows * current_width * sizeof(double));
        temp_previous_y = (double *) malloc(recv_rows * current_width * sizeof(double));
        MPI_Sendrecv(my_path_costs, my_rows * current_width, MPI_DOUBLE, i, 0, 
            temp_path_costs, recv_rows * current_width, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, 
            MPI_STATUS_IGNORE);
        MPI_Sendrecv(my_previous_x, my_rows * current_width, MPI_DOUBLE, i, 1, 
            temp_previous_x, recv_rows * current_width, MPI_DOUBLE, i, 1, MPI_COMM_WORLD, 
            MPI_STATUS_IGNORE);
        MPI_Sendrecv(my_previous_y, my_rows * current_width, MPI_DOUBLE, i, 2, 
            temp_previous_y, recv_rows * current_width, MPI_DOUBLE, i, 2, MPI_COMM_WORLD, 
            MPI_STATUS_IGNORE);
    
        for (int j = 0; j < recv_rows * current_width; j++) {
            int x = j % current_width;
            int y = y_offset + (j / current_width);
            //printf("%d %d %d %d %d\n", rank, x, y, x * initial_height + y, recv_rows * current_width);
            //printf("%d\n", initial_height * initial_width);
            path_costs[x * initial_height + y] = temp_path_costs[(y - y_offset) * current_width + x];
            previous_x[x * initial_height + y] = temp_previous_x[(y - y_offset) * current_width + x];
            previous_y[x * initial_height + y] = temp_previous_y[(y - y_offset) * current_width + x];
        }
        free(temp_path_costs);
        free(temp_previous_x);
        free(temp_previous_y);
    }
    free(my_path_costs);
    free(my_previous_x);
    free(my_previous_y);

    //find the y coordinate where the lowest-cost seam meets the right edge of the current image
    int y_coord = 0;
    for (int y = 0; y < current_height; y++) {
        if (path_costs[(current_width - 1) * initial_height + y] < path_costs[(current_width - 1) * initial_height + y_coord]) {
            y_coord = y;
        }
    }

    //delete the seam from right to left
    for (int x = current_width - 1; x >= 0; x--) {
        //delete this pixel by copying each following pixel in this column one position up
        for (int y = y_coord; y < current_height - 1; y++) {
            image[x * initial_height + y] = image[x * initial_height + y + 1];
        }
        //next pixel
        y_coord = previous_y[x * initial_height + y_coord];
    }

    //decrease the current height of the image
    current_height--;
}
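
The cost computation above is a dynamic program: the cheapest path to pixel (x, y) is its own energy plus the minimum of the three path costs at column x-1 and rows y-1, y, y+1. A serial sketch of that recurrence on a tiny array (no MPI; the array sizes are illustrative):

/* Serial sketch of the seam-cost recurrence used above:
 * cost(x, y) = energy(x, y) + min(cost(x-1, y-1), cost(x-1, y), cost(x-1, y+1)).
 * Illustrative only. */
#include <float.h>
#include <stdio.h>

#define W 4
#define H 3

int main(void)
{
    double energy[W][H] = {{1,2,3},{4,1,2},{3,2,1},{2,3,1}};
    double cost[W][H];

    for (int y = 0; y < H; y++)
        cost[0][y] = energy[0][y];

    for (int x = 1; x < W; x++)
        for (int y = 0; y < H; y++) {
            double up   = (y > 0)     ? cost[x-1][y-1] : DBL_MAX;
            double mid  =               cost[x-1][y];
            double down = (y < H - 1) ? cost[x-1][y+1] : DBL_MAX;
            double best = mid;
            if (up   < best) best = up;
            if (down < best) best = down;
            cost[x][y] = energy[x][y] + best;
        }

    /* the cheapest seam ends at the y with the smallest cost in the last column */
    int ybest = 0;
    for (int y = 1; y < H; y++)
        if (cost[W-1][y] < cost[W-1][ybest])
            ybest = y;
    printf("cheapest horizontal seam ends at row %d with cost %.1f\n",
           ybest, cost[W-1][ybest]);
    return 0;
}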
Beispiel #26
/**
MPI communication routine for exchanging (double-precision) values 
of particle distribution functions across the boundaries
between different MPI processes (or MPI ranks)

After calling this function, values in the ghost layers for {f0, f1, ..., f18}
get updated using values from neighboring MPI processes
*/
void exchangePDF (const int      nn,                // number of ghost cell layers
                  const int      Q,                 // number of LBM streaming directions
                  const int      MX,                // number of voxels along X in this process
                  const int      MY,                // number of voxels along Y in this process
                  const int      MZ,                // number of voxels along Z in this process
                  const int      myid,              // my process id
                  const MPI_Comm CART_COMM,         // Cartesian topology communicator
                  const int      nbr_WEST,          // process id of my western neighbor
                  const int      nbr_EAST,          // process id of my eastern neighbor
                  const int      nbr_SOUTH,         // process id of my southern neighbor
                  const int      nbr_NORTH,         // process id of my northern neighbor
                  const int      nbr_BOTTOM,        // process id of my bottom neighbor
                  const int      nbr_TOP,           // process id of my top neighbor
                     double      *PDF4d)             // pointer to the 4D array being exchanged (of type double)
{
    MPI_Status status;

    const int MXP = nn+MX+nn;  // padded voxels along X
    const int MYP = nn+MY+nn;  // padded voxels along Y
    const int MZP = nn+MZ+nn;  // padded voxels along Z

    // regular voxels + voxels in the ghost layer
    const int PADDED_VOXELS = MXP*MYP*MZP;

    // allocate a 3D array for storing f(a)
    // ghost layers are included in this 3D array
    double *PDF3d = new double[PADDED_VOXELS]; 

    // loop for all PDF directions
    for (int a = 0; a < Q; a++)
    {
        // loop over all voxels in this MPI process, including ghost layers 
        for(int i = 0; i < MXP; i++) {
            for(int j = 0; j < MYP; j++) {
                for(int k = 0; k < MZP; k++) {

                    // natural index for fa(i,j,k) in PDF3d
                    int index_3d = i + j*MXP + k*MXP*MYP;

                    // natural index for f(i,j,k,a) in PDF4d
                    int index_4d = a + (index_3d * Q);

                    // PDF3d <---- PDF4d(a)
                    PDF3d[index_3d] = PDF4d[index_4d];
                }
            }
        }
 
        // strided datatype: one YZ plane (MYP*MZP blocks of one value, stride MXP along X)
        MPI_Datatype stridex;
        MPI_Type_vector( (MY+nn+nn)*(MZ+nn+nn), 1, MX+nn+nn, MPI_DOUBLE, &stridex);
        MPI_Type_commit( &stridex);

        // strided datatype: one XZ plane (MZP blocks of MXP contiguous values, stride MXP*MYP)
        MPI_Datatype stridey;
        MPI_Type_vector( MZ+nn+nn, MX+nn+nn, (MY+nn+nn)*(MX+nn+nn), MPI_DOUBLE, &stridey);
        MPI_Type_commit( &stridey);

        // total number of values in an XY plane (contiguous values)
        //
        // example layout for the case nn = 1 (1 layer of ghost cells)
        //          
        //
        //         0,MY+1  1,MY+1  2,MY+1  3,MY+1  ...    MX,MY+1 MX+1,MY+1
        // 
        //              +-----------------------      ----------+     
        //              |                                       |
        //         0,MY |  1,MY    2,MY    3,MY    ...    MX,MY | MX+1,MY
        //              |                                       |
        //
        //
        //
        //
        //
        //         0,2  |  1,2     2,2     3,2     ...    MX,2  | MX+1,2
        //              |                                       |
        //              |                                       |
        //              |                                       |
        //         0,1  |  1,1     2,1     3,1     ...    MX,1  | MX+1,1
        //              |                                       |
        //              +-----------------------       ---------+     
        //
        //         0,0     1,0     2,0     3,0     ...    MX,0    MX+1,0
        //
        //
        int no_xy = MXP*MYP;  // we are only exchanging one PDF at a time


    // loop over the number of ghost layers
    for(int i = 0; i < nn; i++)
    {
        // I am sending PDF3d data to the process nbr_TOP and receiving PDF3d data from the process nbr_BOTTOM
        {
            //                                                 x   x   x   x   x   x
            // send the topmost (non-ghost) layer of data        +---------------+
            // receive this data into the ghost cell layer     S | S   S   S   S | S  --- send to nbr_TOP
            //                                                   |               |
            //                  ^                              x | o   o   o   o | x
            //                  |  Z-axis                        |               |
            //                  |                              x | o   o   o   o | x
            //                  |                                +---------------+
            //                  |                              R   R   R   R   R   R  --- recv from nbr_BOTTOM

            // SEND to top
            int sx = 0;
            int sy = 0;
            int sz = nn + (MZ-1) - i;

            // RECV from bottom
            int rx = 0;
            int ry = 0;
            int rz = (nn - 1) - i;

            int send = sx + sy * MXP + sz * MXP*MYP;  // send the topmost (non-ghost) layer of data
            int recv = rx + ry * MXP + rz * MXP*MYP;  // receive data into the bottom ghost cell layer

            MPI_Sendrecv(&PDF3d[send],       // send buffer (points to the starting address of the data chunk)
                         no_xy,              // number of elements to be sent
                         MPI_DOUBLE,         // type of elements
                         nbr_TOP,            // destination (where the data is going)
                         111,                // tag
                         &PDF3d[recv],       // receive buffer (points to the starting address of the data chunk)
                         no_xy,              // number of elements received
                         MPI_DOUBLE,         // type of elements
                         nbr_BOTTOM,         // source (where the data is coming from)
                         111,                // tag
                         CART_COMM,          // MPI Communicator used for this Sendrecv
                         &status);           // MPI status
        }

        // I am sending PDF3d data to the process nbr_BOTTOM and receiving PDF3d data from the process nbr_TOP
        {
            //                                                 R   R   R   R   R   R  --- recv from nbr_TOP
            // send the bottommost (non-ghost) layer of data    +---------------+
            // receive data into the top ghost cell layer     x | o   o   o   o | x
            //                                                   |               |
            //                  ^                              x | o   o   o   o | x
            //                  |  Z-axis                        |               |
            //                  |                              S | S   S   S   S | S  --- send to nbr_BOTTOM
            //                  |                                +---------------+
            //                  |                              x   x   x   x   x   x

            // SEND to bottom
            int sx = 0;
            int sy = 0;
            int sz = nn + i;

            // RECV from top
            int rx = 0;
            int ry = 0;
            int rz = nn + MZ + i;

            int send = sx + sy * MXP + sz * MXP*MYP; // send the bottommost (non-ghost) layer of data
            int recv = rx + ry * MXP + rz * MXP*MYP; // receive data into the top ghost cell layer

            MPI_Sendrecv(&PDF3d[send],       // send buffer (points to the starting address of the data chunk) 
                         no_xy,              // number of elements to be sent
                         MPI_DOUBLE,         // type of elements
                         nbr_BOTTOM,         // destination (where the data is going)
                         222,                // tag
                         &PDF3d[recv],       // receive buffer (points to the starting address of the data chunk)
                         no_xy,              // number of elements received
                         MPI_DOUBLE,         // type of elements
                         nbr_TOP,            // source (where the data is coming from)
                         222,                // tag
                         CART_COMM,          // MPI Communicator used for this Sendrecv
                         &status);           // MPI status
        }

        // I am sending PDF3d data to the process nbr_EAST and receiving PDF3d data from process nbr_WEST
        {
            //                                                 R   x   x   x   S   x
            // send the eastmost (non-ghost) layer of data       +---------------+
            // receive data into the west ghost cell layer     R | o   o   o   S | x     S --- send to nbr_EAST
            //                                                   |               |
            //                                                 R | o   o   o   S | x
            //       --------------> X-axis                      |               |
            //                                                 R | o   o   o   S | x
            //                                                   +---------------+
            //                                                 R   x   x   x   S   x     R --- recv from nbr_WEST

            // SEND to east
            int sx = nn + (MX-1) - i;
            int sy = 0;
            int sz = 0;

            // RECV from west
            int rx = (nn - 1) - i;
            int ry = 0;
            int rz = 0;

            int send = sx + sy * MXP + sz * MXP*MYP; // send the eastmost (non-ghost) layer of data
            int recv = rx + ry * MXP + rz * MXP*MYP; // receive data into the west ghost cell layer

            MPI_Sendrecv(&PDF3d[send],       // send buffer (points to the starting address of the data chunk) 
                         1,                  // number of elements to be sent
                         stridex,            // type of elements
                         nbr_EAST,           // destination (where the data is going)
                         333,                // tag
                         &PDF3d[recv],       // receive buffer (points to the starting address of the data chunk)
                         1,                  // number of elements received
                         stridex,            // type of elements
                         nbr_WEST,           // source (where the data is coming from)
                         333,                // tag
                         CART_COMM,          // MPI Communicator used for this Sendrecv
                         &status);           // MPI status
        }

        // I am sending PDF3d data to the process nbr_WEST and receiving PDF3d data from process nbr_EAST
        {
            //                                                 x   S   x   x   x   R
            // send the westmost (non-ghost) layer of data       +---------------+
            // receive data into the east ghost cell layer     x | S   o   o   o | R     S --- send to nbr_WEST
            //                                                   |               |
            //                                                 x | S   o   o   o | R
            //       --------------> X-axis                      |               |
            //                                                 x | S   o   o   o | R
            //                                                   +---------------+
            //                                                 x   S   x   x   x   R     R --- recv from nbr_EAST

            // SEND to west
            int sx = nn + i;
            int sy = 0;
            int sz = 0;

            // RECV from east
            int rx = nn + MX + i;
            int ry = 0;
            int rz = 0;

            int send = sx + sy * MXP + sz * MXP*MYP; // send the westmost (non-ghost) layer of data
            int recv = rx + ry * MXP + rz * MXP*MYP; // receive data into the east ghost cell layer

            MPI_Sendrecv(&PDF3d[send],       // send buffer (points to the starting address of the data chunk) 
                         1,                  // number of elements to be sent
                         stridex,            // type of elements
                         nbr_WEST,           // destination (where the data is going)
                         444,                // tag
                         &PDF3d[recv],       // receive buffer (points to the starting address of the data chunk)
                         1,                  // number of elements received
                         stridex,            // type of elements
                         nbr_EAST,           // source (where the data is coming from)
                         444,                // tag
                         CART_COMM,          // MPI Communicator used for this Sendrecv
                         &status);           // MPI status
        }

        // I am sending PDF3d data to the process nbr_NORTH and receiving PDF3d data from process nbr_SOUTH
        {
            //                                                 R   x   x   x   S   x
            // send the northmost (non-ghost) layer of data      +---------------+
            // receive data into the south ghost cell layer    R | o   o   o   S | x     S --- send to nbr_NORTH
            //                                                   |               |
            //                                                 R | o   o   o   S | x
            //       --------------> Y-axis                      |               |
            //                                                 R | o   o   o   S | x
            //                                                   +---------------+
            //                                                 R   x   x   x   S   x     R --- recv from nbr_SOUTH

            // SEND to north
            int sx = 0;
            int sy = nn + (MY-1) - i;
            int sz = 0;

            // RECV from south
            int rx = 0;
            int ry = (nn - 1) - i;
            int rz = 0;

            int send = sx + sy * MXP + sz * MXP*MYP; // send the northmost (non-ghost) layer of data
            int recv = rx + ry * MXP + rz * MXP*MYP; // receive data into the south ghost cell layer

            MPI_Sendrecv(&PDF3d[send],       // send buffer (points to the starting address of the data chunk) 
                         1,                  // number of elements to be sent
                         stridey,            // type of elements
                         nbr_NORTH,          // destination (where the data is going)
                         555,                // tag
                         &PDF3d[recv],       // receive buffer (points to the starting address of the data chunk)
                         1,                  // number of elements received
                         stridey,            // type of elements
                         nbr_SOUTH,          // source (where the data is coming from)
                         555,                // tag
                         CART_COMM,          // MPI Communicator used for this Sendrecv
                         &status);           // MPI status
        }

        // I am sending PDF3d data to the process nbr_SOUTH and receiving PDF3d data from process nbr_NORTH
        {
            //                                                 x   S   x   x   x   R
            // send the southmost (non-ghost) layer of data      +---------------+
            // receive data into the north ghost cell layer    x | S   o   o   o | R     S --- send to nbr_SOUTH
            //                                                   |               |
            //                                                 x | S   o   o   o | R
            //       --------------> Y-axis                      |               |
            //                                                 x | S   o   o   o | R
            //                                                   +---------------+
            //                                                 x   S   x   x   x   R     R --- recv from nbr_NORTH

            // SEND to south
            int sx = 0;
            int sy = nn + i;
            int sz = 0;

            // RECV from north
            int rx = 0;
            int ry = nn + MY + i;
            int rz = 0;

            int send = sx + sy * MXP + sz * MXP*MYP; // send the southmost (non-ghost) layer of data
            int recv = rx + ry * MXP + rz * MXP*MYP; // receive data into the north ghost cell layer

            MPI_Sendrecv(&PDF3d[send],       // send buffer (points to the starting address of the data chunk) 
                         1,                  // number of elements to be sent
                         stridey,            // type of elements
                         nbr_SOUTH,          // destination (where the data is going)
                         666,                // tag
                         &PDF3d[recv],       // receive buffer (points to the starting address of the data chunk)
                         1,                  // number of elements received
                         stridey,            // type of elements
                         nbr_NORTH,          // source (where the data is coming from)
                         666,                // tag
                         CART_COMM,          // MPI Communicator used for this Sendrecv
                         &status);           // MPI status
        }

    } // end for loop over the number of ghost layers


        // loop over all voxels in this MPI process, including ghost layers 
        for(int i = 0; i < MXP; i++) {
            for(int j = 0; j < MYP; j++) {
                for(int k = 0; k < MZP; k++) {

                    // natural index for fa(i,j,k) in PDF3d
                    int index_3d = i + j*MXP + k*MXP*MYP;

                    // natural index for f(i,j,k,a) in PDF4d
                    int index_4d = a + (index_3d * Q);

                    // PDF4d <---- PDF3d(a)
                    PDF4d[index_4d] = PDF3d[index_3d];
                }
            }
        }

        // cleanup
        MPI_Type_free(&stridex);
        MPI_Type_free(&stridey);

    } // end loop for PDF directions

    // free memory for the temporary 3D array
    delete [] PDF3d;
}
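
The X and Y exchanges above pass a count of 1 together with the derived datatypes stridex and stridey, whose construction is not part of this excerpt. The helper below is a minimal sketch, assuming the padded local dimensions MXP, MYP, MZP and the x-fastest layout (index = x + y*MXP + z*MXP*MYP) used in the copy loop above, of how such plane types are commonly built with MPI_Type_vector; the function name is hypothetical.

#include <mpi.h>

/* Hypothetical sketch (not taken from the example above): one datatype for a
 * full YZ plane at fixed x (east/west exchange) and one for a full XZ plane
 * at fixed y (north/south exchange) of an MXP x MYP x MZP array of doubles
 * stored x-fastest. */
static void make_plane_types(int MXP, int MYP, int MZP,
                             MPI_Datatype *stridex, MPI_Datatype *stridey)
{
    /* YZ plane at fixed x: MYP*MZP single doubles, each MXP elements apart */
    MPI_Type_vector(MYP * MZP, 1, MXP, MPI_DOUBLE, stridex);
    MPI_Type_commit(stridex);

    /* XZ plane at fixed y: MZP contiguous runs of MXP doubles, MXP*MYP apart */
    MPI_Type_vector(MZP, MXP, MXP * MYP, MPI_DOUBLE, stridey);
    MPI_Type_commit(stridey);
}

With such a type, passing &PDF3d[send] with a count of 1 transfers the whole plane anchored at that corner element, which is why the calls above never spell out the plane size explicitly.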
Beispiel #27
0
/* ********************************************************************* */
int AL_Exchange_periods (void *vbuf, int *periods, int sz_ptr)
/*!
 * Same as AL_Exchange, but exchanges ghost cells across the physical
 * domain boundary in direction dim only when periods[dim] = 1; with
 * periods[dim] = 0 the corresponding neighbour is replaced by
 * MPI_PROC_NULL and that physical boundary is left untouched.
 * If a dimension of the underlying decomposition is not periodic,
 * setting periods[dim] = 1 changes nothing, since the neighbour rank
 * there is already MPI_PROC_NULL.
 *
 * \param [in] vbuf     pointer to buffer
 * \param [in] periods  per-dimension flags selecting which physical
 *                      boundaries are exchanged periodically
 * \param [in] sz_ptr   integer pointer to the distributed array descriptor
 *********************************************************************** */
{
  char *buf;
  register int nd;
  int myrank, nproc;
  int ndim, gp, nleft, nright, tag1, tag2;
  int sendb, recvb;
  MPI_Datatype itype;
  MPI_Comm comm;
  MPI_Status status;
  SZ *s;
  int is_beg[3], is_end[3];

  buf = (char *) vbuf;

  /* -- DIAGNOSTICS
        Check that sz_ptr points to an allocated SZ
                                                     -- */
  if( stack_ptr[sz_ptr] == AL_STACK_FREE){
    printf("AL_Decompose: wrong SZ pointer\n");
  }

  s = sz_stack[sz_ptr];

  myrank = s->rank;
  nproc = s->size;
  comm = s->comm;
  ndim = s->ndim;

  AL_Is_boundary (sz_ptr, is_beg, is_end);

  for(nd=0;nd<ndim;nd++){
    gp = s->bg[nd];

    /* If gp=0, do nothing */

    if( gp > 0 ){
      nleft = s->left[nd];
      nright = s->right[nd];

      if (is_beg[nd] && periods[nd] == 0) nleft  = MPI_PROC_NULL;
      if (is_end[nd] && periods[nd] == 0) nright = MPI_PROC_NULL;
      
      itype = s->type_rl[nd];
      tag1 = s->tag1[nd];

      sendb = s->sendb1[nd];
      recvb = s->recvb1[nd];

      MPI_Sendrecv(&buf[sendb], 1, itype, nleft, tag1,
		   &buf[recvb], 1, itype, nright,tag1,
		   comm, &status);

      nleft = s->left[nd];
      nright = s->right[nd];

      if (is_beg[nd] && periods[nd] == 0) nleft  = MPI_PROC_NULL;
      if (is_end[nd] && periods[nd] == 0) nright = MPI_PROC_NULL;

      itype = s->type_lr[nd];
      tag2 = s->tag2[nd];

      sendb = s->sendb2[nd];
      recvb = s->recvb2[nd];

      MPI_Sendrecv(&buf[sendb], 1, itype, nright, tag2,
		   &buf[recvb], 1, itype, nleft,tag2,
		   comm, &status);
    }
  }

  /* DIAGNOSTICS */
#ifdef DEBUG
  printf("AL_Exchange: filled ghost regions\n"); 
#endif

  return (int) AL_SUCCESS;
}
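
The essential trick in AL_Exchange_periods is that a non-periodic physical boundary is handled by swapping the neighbour rank for MPI_PROC_NULL, which turns the corresponding half of each MPI_Sendrecv into a no-op. The standalone 1D program below illustrates only that mechanism; the ring layout, the periodic flag, and the buffer sizes are illustrative assumptions and not part of the AL library.

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rank, size, periodic = 0;        /* set to 1 for periodic boundaries */
    double interior[4] = {1, 2, 3, 4};   /* local (non-ghost) data           */
    double ghost_l = -1, ghost_r = -1;   /* ghost cells                      */
    MPI_Status status;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* at a non-periodic physical boundary the neighbour is MPI_PROC_NULL */
    int left  = (rank == 0)        ? (periodic ? size - 1 : MPI_PROC_NULL) : rank - 1;
    int right = (rank == size - 1) ? (periodic ? 0        : MPI_PROC_NULL) : rank + 1;

    /* rightmost interior cell goes right, left ghost is filled from the left */
    MPI_Sendrecv(&interior[3], 1, MPI_DOUBLE, right, 100,
                 &ghost_l,     1, MPI_DOUBLE, left,  100,
                 MPI_COMM_WORLD, &status);
    /* leftmost interior cell goes left, right ghost is filled from the right */
    MPI_Sendrecv(&interior[0], 1, MPI_DOUBLE, left,  101,
                 &ghost_r,     1, MPI_DOUBLE, right, 101,
                 MPI_COMM_WORLD, &status);

    /* ghosts next to MPI_PROC_NULL keep their initial value of -1 */
    printf("rank %d: ghost_l = %g, ghost_r = %g\n", rank, ghost_l, ghost_r);

    MPI_Finalize();
    return 0;
}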
Beispiel #28
0
void cs_find_hot_neighbours(void)
{
  MyFloat *Left, *Right;
  int nimport;
  int i, j, n, ndone_flag, dummy;
  int ndone, ntot, npleft;
  int iter = 0;
  int ngrp, sendTask, recvTask;
  int place, nexport;
  double dmax1, dmax2;
  double xhyd, yhel, ne, mu, energy, temp;
  double a3inv;


  if(All.ComovingIntegrationOn)
    a3inv = 1 / (All.Time * All.Time * All.Time);
  else
    a3inv = 1;

  /* allocate buffers to arrange communication */

  Left = (MyFloat *) mymalloc(NumPart * sizeof(MyFloat));
  Right = (MyFloat *) mymalloc(NumPart * sizeof(MyFloat));

  Ngblist = (int *) mymalloc(NumPart * sizeof(int));

  All.BunchSize =
    (int) ((All.BufferSize * 1024 * 1024) / (sizeof(struct data_index) + sizeof(struct data_nodelist) +
					     sizeof(struct hotngbs_in) + sizeof(struct hotngbs_out) +
					     sizemax(sizeof(struct hotngbs_in), sizeof(struct hotngbs_out))));
  DataIndexTable = (struct data_index *) mymalloc(All.BunchSize * sizeof(struct data_index));
  DataNodeList = (struct data_nodelist *) mymalloc(All.BunchSize * sizeof(struct data_nodelist));


  CPU_Step[CPU_MISC] += measure_time();




  for(n = FirstActiveParticle; n >= 0; n = NextActiveParticle[n])
    {
      if(P[n].Type == 0)
	{
	  /* select reservoir and cold phase particles */
	  if(P[n].EnergySN > 0 && SphP[n].d.Density * a3inv > All.PhysDensThresh * All.DensFrac_Phase)
	    {
	      xhyd = P[n].Zm[6] / P[n].Mass;
	      yhel = (1 - xhyd) / (4. * xhyd);

	      ne = SphP[n].Ne;
	      mu = (1 + 4 * yhel) / (1 + yhel + ne);
	      energy = SphP[n].Entropy * P[n].Mass / GAMMA_MINUS1 * pow(SphP[n].d.Density * a3inv, GAMMA_MINUS1);	/* total energy */
	      temp = GAMMA_MINUS1 / BOLTZMANN * energy / P[n].Mass * PROTONMASS * mu;
	      temp *= All.UnitEnergy_in_cgs / All.UnitMass_in_g;	/* Temperature in Kelvin */

	      if(temp < All.Tcrit_Phase)
		{
		  Left[n] = Right[n] = 0;

		  if(!(SphP[n].HotHsml > 0.))
		    SphP[n].HotHsml = All.InitialHotHsmlFactor * PPP[n].Hsml;	/* Estimation of HotHsml : ONLY first step */

		  P[n].Type = 10;	/* temporarily mark particles of interest with this number */
		}
	    }
	}
    }



  /* we will repeat the whole thing for those particles where we didn't find enough neighbours */
  do
    {
      i = FirstActiveParticle;	/* begin with this index */

      do
	{
	  for(j = 0; j < NTask; j++)
	    {
	      Send_count[j] = 0;
	      Exportflag[j] = -1;
	    }

	  /* do local particles and prepare export list */

	  for(nexport = 0; i >= 0; i = NextActiveParticle[i])
	    if(P[i].Type == 10 && P[i].TimeBin >= 0)
	      {
		if(cs_hotngbs_evaluate(i, 0, &nexport, Send_count) < 0)
		  break;
	      }

#ifdef MYSORT
	  mysort_dataindex(DataIndexTable, nexport, sizeof(struct data_index), data_index_compare);
#else
	  qsort(DataIndexTable, nexport, sizeof(struct data_index), data_index_compare);
#endif
	  MPI_Allgather(Send_count, NTask, MPI_INT, Sendcount_matrix, NTask, MPI_INT, MPI_COMM_WORLD);

	  for(j = 0, nimport = 0, Recv_offset[0] = 0, Send_offset[0] = 0; j < NTask; j++)
	    {
	      Recv_count[j] = Sendcount_matrix[j * NTask + ThisTask];
	      nimport += Recv_count[j];

	      if(j > 0)
		{
		  Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1];
		  Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1];
		}
	    }

	  HotNgbsGet = (struct hotngbs_in *) mymalloc(nimport * sizeof(struct hotngbs_in));
	  HotNgbsIn = (struct hotngbs_in *) mymalloc(nexport * sizeof(struct hotngbs_in));

	  /* prepare particle data for export */
	  for(j = 0; j < nexport; j++)
	    {
	      place = DataIndexTable[j].Index;

	      HotNgbsIn[j].Pos[0] = P[place].Pos[0];
	      HotNgbsIn[j].Pos[1] = P[place].Pos[1];
	      HotNgbsIn[j].Pos[2] = P[place].Pos[2];
	      HotNgbsIn[j].HotHsml = SphP[place].HotHsml;
	      HotNgbsIn[j].Entropy = SphP[place].Entropy;
	      memcpy(HotNgbsIn[j].NodeList,
		     DataNodeList[DataIndexTable[j].IndexGet].NodeList, NODELISTLENGTH * sizeof(int));
	    }


	  for(ngrp = 1; ngrp < (1 << PTask); ngrp++)
	    {
	      sendTask = ThisTask;
	      recvTask = ThisTask ^ ngrp;

	      if(recvTask < NTask)
		{
		  if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
		    {
		      /* get the particles */
		      MPI_Sendrecv(&HotNgbsIn[Send_offset[recvTask]],
				   Send_count[recvTask] * sizeof(struct hotngbs_in), MPI_BYTE,
				   recvTask, TAG_DENS_A,
				   &HotNgbsGet[Recv_offset[recvTask]],
				   Recv_count[recvTask] * sizeof(struct hotngbs_in), MPI_BYTE,
				   recvTask, TAG_DENS_A, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
		    }
		}
	    }

	  myfree(HotNgbsIn);
	  HotNgbsResult = (struct hotngbs_out *) mymalloc(nimport * sizeof(struct hotngbs_out));
	  HotNgbsOut = (struct hotngbs_out *) mymalloc(nexport * sizeof(struct hotngbs_out));

	  /* now do the particles that need to be exported */
	  for(j = 0; j < nimport; j++)
	    cs_hotngbs_evaluate(j, 1, &dummy, &dummy);


	  if(i < 0)
	    ndone_flag = 1;
	  else
	    ndone_flag = 0;

	  MPI_Allreduce(&ndone_flag, &ndone, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);


	  /* get the result */
	  for(ngrp = 1; ngrp < (1 << PTask); ngrp++)
	    {
	      sendTask = ThisTask;
	      recvTask = ThisTask ^ ngrp;
	      if(recvTask < NTask)
		{
		  if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
		    {
		      /* send the results */
		      MPI_Sendrecv(&HotNgbsResult[Recv_offset[recvTask]],
				   Recv_count[recvTask] * sizeof(struct hotngbs_out),
				   MPI_BYTE, recvTask, TAG_DENS_B,
				   &HotNgbsOut[Send_offset[recvTask]],
				   Send_count[recvTask] * sizeof(struct hotngbs_out),
				   MPI_BYTE, recvTask, TAG_DENS_B, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
		    }
		}

	    }


	  /* add the result to the local particles */

	  for(j = 0; j < nexport; j++)
	    {
	      place = DataIndexTable[j].Index;

	      SphP[place].da.dDensityAvg += HotNgbsOut[j].DensitySum;
	      SphP[place].ea.dEntropyAvg += HotNgbsOut[j].EntropySum;
	      SphP[place].HotNgbNum += HotNgbsOut[j].HotNgbNum;
	    }

	  myfree(HotNgbsOut);
	  myfree(HotNgbsResult);
	  myfree(HotNgbsGet);
	}
      while(ndone < NTask);

      /* do final operations on results */
      for(i = FirstActiveParticle, npleft = 0; i >= 0; i = NextActiveParticle[i])
	{
	  if(P[i].Type == 10 && P[i].TimeBin >= 0)
	    {
#ifdef FLTROUNDOFFREDUCTION
	      SphP[i].da.DensityAvg = FLT(SphP[i].da.dDensityAvg);
	      SphP[i].ea.EntropyAvg = FLT(SphP[i].ea.dEntropyAvg);
#endif
	      if(SphP[i].HotNgbNum > 0)
		{
		  SphP[i].da.DensityAvg /= SphP[i].HotNgbNum;
		  SphP[i].ea.EntropyAvg /= SphP[i].HotNgbNum;
		}
	      else
		{
		  SphP[i].da.DensityAvg = 0;
		  SphP[i].ea.EntropyAvg = 0;
		}

	      /* now check whether we had enough neighbours */

	      if(SphP[i].HotNgbNum < (All.DesNumNgb - All.MaxNumHotNgbDeviation) ||
		 (SphP[i].HotNgbNum > (All.DesNumNgb + All.MaxNumHotNgbDeviation)))
		{
		  /* need to redo this particle */
		  npleft++;

		  if(Left[i] > 0 && Right[i] > 0)
		    if((Right[i] - Left[i]) < 1.0e-3 * Left[i])
		      {
			/* this one should be ok */
			npleft--;
			P[i].TimeBin = -P[i].TimeBin - 1;	/* Mark as inactive */
			continue;
		      }

		  if(SphP[i].HotNgbNum < (All.DesNumNgb - All.MaxNumHotNgbDeviation))
		    Left[i] = DMAX(SphP[i].HotHsml, Left[i]);
		  else
		    {
		      if(Right[i] != 0)
			{
			  if(SphP[i].HotHsml < Right[i])
			    Right[i] = SphP[i].HotHsml;
			}
		      else
			Right[i] = SphP[i].HotHsml;
		    }

		  if(Left[i] > All.MaxHotHsmlParam * PPP[i].Hsml)	/* prevent us from searching too far */
		    {
		      npleft--;
		      P[i].TimeBin = -P[i].TimeBin - 1;	/* Mark as inactive */


		      /* Ad-hoc definition of SAvg and RhoAvg when there are no hot neighbours */
		      /* Note that a minimum number of hot neighbours is required for promotion, see c_enrichment.c */
		      if(SphP[i].HotNgbNum == 0)
			{
			  SphP[i].da.DensityAvg = SphP[i].d.Density / 100;
			  SphP[i].ea.EntropyAvg = SphP[i].Entropy * 1000;

			  printf("WARNING: Used ad-hoc values for SAvg and RhoAvg, No hot neighbours\n");
			}

		      continue;
		    }

		  if(iter >= MAXITER_HOT - 10)
		    {
		      printf
			("i=%d task=%d ID=%d Hsml=%g Left=%g Right=%g Ngbs=%g Right-Left=%g\n   pos=(%g|%g|%g)\n",
			 i, ThisTask, P[i].ID, SphP[i].HotHsml, Left[i], Right[i],
			 (float) SphP[i].HotNgbNum, Right[i] - Left[i], P[i].Pos[0], P[i].Pos[1],
			 P[i].Pos[2]);
		      fflush(stdout);
		    }

		  if(Right[i] > 0 && Left[i] > 0)
		    SphP[i].HotHsml = pow(0.5 * (pow(Left[i], 3) + pow(Right[i], 3)), 1.0 / 3);
		  else
		    {
		      if(Right[i] == 0 && Left[i] == 0)
			endrun(8188);	/* can't occur */

		      if(Right[i] == 0 && Left[i] > 0)
			SphP[i].HotHsml *= 1.26;

		      if(Right[i] > 0 && Left[i] == 0)
			SphP[i].HotHsml /= 1.26;
		    }
		}
	      else
		P[i].TimeBin = -P[i].TimeBin - 1;	/* Mark as inactive */
	    }
	}


      MPI_Allreduce(&npleft, &ntot, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

      if(ntot > 0)
	{
	  iter++;

	  if(iter > 0 && ThisTask == 0)
	    {
	      printf("hotngb iteration %d: need to repeat for %d particles.\n", iter, ntot);
	      fflush(stdout);
	    }

	  if(iter > MAXITER_HOT)
	    {
	      printf("failed to converge in hot-neighbour iteration\n");
	      fflush(stdout);
	      endrun(1155);
	    }
	}
    }
  while(ntot > 0);


  myfree(DataNodeList);
  myfree(DataIndexTable);
  myfree(Ngblist);
  myfree(Right);
  myfree(Left);


  for(i = FirstActiveParticle; i >= 0; i = NextActiveParticle[i])
    if(P[i].Type == 10)
      {
	P[i].Type = 0;
	/* mark as active again */
	if(P[i].TimeBin < 0)
	  P[i].TimeBin = -P[i].TimeBin - 1;
      }


  CPU_Step[CPU_HOTNGBS] += measure_time();

}
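The export/import traffic above uses GADGET's hypercube pairing: in round ngrp every task calls MPI_Sendrecv with partner ThisTask ^ ngrp, so over all rounds 1 .. 2^PTask - 1 each pair of tasks meets exactly once, and rounds whose partner falls outside NTask are simply skipped. The self-contained sketch below demonstrates just that pattern; the variable names mirror the GADGET ones, but the payload (a single int) is purely illustrative.

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int ThisTask, NTask, PTask, ngrp;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &ThisTask);
    MPI_Comm_size(MPI_COMM_WORLD, &NTask);

    /* smallest PTask with (1 << PTask) >= NTask, as assumed in GADGET */
    for (PTask = 0; NTask > (1 << PTask); PTask++)
        ;

    for (ngrp = 1; ngrp < (1 << PTask); ngrp++) {
        int recvTask = ThisTask ^ ngrp;      /* partner for this round */
        if (recvTask < NTask) {
            int mine = ThisTask, theirs = -1;
            MPI_Sendrecv(&mine,   1, MPI_INT, recvTask, 0,
                         &theirs, 1, MPI_INT, recvTask, 0,
                         MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            printf("round %d: task %d paired with task %d\n",
                   ngrp, ThisTask, theirs);
        }
    }

    MPI_Finalize();
    return 0;
}
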
Beispiel #29
0
static void Zoltan_RB_Gather(
   ZOLTAN_GNO_TYPE *send,                 /* input/output array */
   ZOLTAN_GNO_TYPE *tmp_send,             /* temporary array */
   int proclower,             /* smallest numbered processor in partition */
   int rank,                  /* processor number within partition */
   int nprocs,                /* number of processors in this partition */
   MPI_Comm comm              /* MPI Communicator */
)
{
   int tag = 32100;           /* message tag */
   int partner;               /* message partner in binary exchange */
   int to;                    /* message partner not in binary exchange */
   int mask;                  /* mask to determine communication partner */
   int nprocs_small;          /* largest power of 2 contained in nprocs */
   int hbit;                  /* 2^hbit = nprocs_small */
   int len;                   /* message length */
   int i;                     /* loop counter */
   MPI_Status status;
   MPI_Datatype zoltan_gno_mpi_type;

   zoltan_gno_mpi_type = Zoltan_mpi_gno_type();

   /* This routine sums a vector of integers on a subset of processors */

   len = 3*nprocs;

   /* Find next lower power of 2. */
   for (hbit = 0; (nprocs >> hbit) != 1; hbit++);

   nprocs_small = 1 << hbit;
   if (nprocs_small * 2 == nprocs) {
      nprocs_small *= 2;
      hbit++;
   }

   to = proclower + (rank ^ nprocs_small);
   if (rank & nprocs_small) {  /* processors greater than largest power of 2 */
      MPI_Send(send, len, zoltan_gno_mpi_type, to, tag, comm);
      tag += hbit + 1;
      MPI_Recv(send, len, zoltan_gno_mpi_type, to, tag, comm, &status);
   }
   else {   /* processors within greatest power of 2 */
      if (rank + nprocs_small < nprocs) {
         MPI_Recv(tmp_send, len, zoltan_gno_mpi_type, to, tag, comm, &status);
         for (i = 0; i < len; i++)
            send[i] += tmp_send[i];
      }
      for (mask = nprocs_small >> 1; mask; mask >>= 1) { /* binary exchange */
         tag++;
         partner = proclower + (rank ^ mask);
         /* Change requested by Qingyu Meng <*****@*****.**> to        */
         /* support mvapich 1.0 on TACC Ranger.                            */
         /* MPI_Send(send, len, zoltan_gno_mpi_type, partner, tag, comm);              */
         /* MPI_Recv(tmp_send, len, zoltan_gno_mpi_type, partner, tag, comm, &status); */
         MPI_Sendrecv(send, len, zoltan_gno_mpi_type, partner, tag,
            tmp_send, len, zoltan_gno_mpi_type, partner, tag, comm, &status);

         for (i = 0; i < len; i++)
            send[i] += tmp_send[i];
      }
      tag++;
      if (rank + nprocs_small < nprocs)
         MPI_Send(send, len, zoltan_gno_mpi_type, to, tag, comm);
   }
}
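Zoltan_RB_Gather sums an array of 3*nprocs values element-wise over the processors of one partition: ranks above the largest power of two first fold their contribution into a partner, the remaining ranks run a classic binary exchange with MPI_Sendrecv, and the folded-out ranks receive the finished sum at the end. When the partition has its own communicator the same element-wise sum can be expressed as one collective; the helper below is a hedged sketch under that assumption (sub_comm, and MPI_LONG_LONG standing in for the Zoltan gno type, are assumptions).

#include <mpi.h>

/* Hypothetical equivalent of the binary exchange above, assuming the
 * partition owns communicator sub_comm and ZOLTAN_GNO_TYPE maps to
 * long long: an in-place all-reduce of the 3*nprocs entries. */
static void gather_by_allreduce(long long *send, int nprocs, MPI_Comm sub_comm)
{
    MPI_Allreduce(MPI_IN_PLACE, send, 3 * nprocs, MPI_LONG_LONG,
                  MPI_SUM, sub_comm);
}

The hand-rolled version presumably avoids creating a communicator per recursion level of the bisection; note also the comment above about replacing the original MPI_Send/MPI_Recv pair with MPI_Sendrecv to sidestep an mvapich issue.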
Beispiel #30
0
void IMB_sendrecv(struct comm_info* c_info, int size,  struct iter_schedule* ITERATIONS,
                  MODES RUN_MODE, double* time)
/*
                      MPI-1 benchmark kernel
                      Benchmarks MPI_Sendrecv

Input variables:

-c_info               (type struct comm_info*)
                      Collection of all base data for MPI;
                      see [1] for more information

-size                 (type int)
                      Basic message size in bytes

-ITERATIONS           (type struct iter_schedule *)
                      Repetition scheduling

-RUN_MODE             (type MODES)
                      (only MPI-2 case: see [1])

Output variables:

-time                 (type double*)
                      Timing result per sample

*/
{
  double t1,t2;
  int i;
  Type_Size s_size, r_size;
  int s_num,r_num;
  int s_tag, r_tag;
  int dest, source;
  MPI_Status stat;

#ifdef CHECK 
  defect=0;
#endif
  ierr = 0;

  /*  GET SIZE OF DATA TYPE's in s_size and r_size */  
  MPI_Type_size(c_info->s_data_type,&s_size);
  MPI_Type_size(c_info->r_data_type,&r_size);
  if ((s_size!=0) && (r_size!=0))
    {
      s_num=size/s_size;
      r_num=size/r_size;
    }   
  s_tag = 1;
  r_tag = MPI_ANY_TAG;
  
  if(c_info->rank!=-1)
    {  
      /*  CALCULATE SOURCE AND DESTINATION */  
      dest   = (c_info->rank + 1)                   % (c_info->num_procs);
      source = (c_info->rank + c_info->num_procs-1) % (c_info->num_procs);

      for(i=0; i<N_BARR; i++) MPI_Barrier(c_info->communicator);
      
      t1 = MPI_Wtime();
      for(i=0;i< ITERATIONS->n_sample;i++)
	{
	  ierr= MPI_Sendrecv((char*)c_info->s_buffer+i%ITERATIONS->s_cache_iter*ITERATIONS->s_offs,
                             s_num,c_info->s_data_type, dest,s_tag,
                             (char*)c_info->r_buffer+i%ITERATIONS->r_cache_iter*ITERATIONS->r_offs,
                             r_num,c_info->r_data_type,source,r_tag,
			     c_info->communicator,&stat);
	  MPI_ERRHAND(ierr);

          CHK_DIFF("Sendrecv",c_info,(char*)c_info->r_buffer+i%ITERATIONS->r_cache_iter*ITERATIONS->r_offs,
                    0, size, size, asize,
                    put, 0, ITERATIONS->n_sample, i,
                    source, &defect);
	}
      t2 = MPI_Wtime();
      *time=(t2 - t1)/ITERATIONS->n_sample;
    }
  else
    { 
      *time = 0.;
    }
}
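
IMB_sendrecv reports the average wall-clock time per MPI_Sendrecv around the rank ring. Turning that into a throughput number is conventionally done by counting both the sent and the received message; the helper below is a small hedged sketch of that conversion (the factor of 2 and the 10^6-byte megabyte are assumptions made here, not read off the code above).

/* Hedged helper: throughput for one Sendrecv sample of `size` bytes that took
 * `time` seconds, counting traffic in both directions; returns MB/s with
 * MB = 10^6 bytes (an assumed convention). */
static double sendrecv_throughput_mb_per_s(int size, double time)
{
    return (2.0 * (double) size / time) / 1.0e6;
}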