/*! This routine finds all neighbours `j' that can interact with the search
 *  centre `i', i.e. those for which
 *  \f$ r_{ij} < h_i \f$  OR  \f$ r_{ij} < h_j \f$.
 *
 *  The tree is walked starting at node *startnode. Candidate particles are
 *  restricted to the types selected by DENSITY_SPLIT_BY_TYPE (if defined) or
 *  FOF_PRIMARY_LINK_TYPES. Accepted particles are written to the global
 *  Ngblist[] and their squared distances to the global Dist2list[].
 *
 *  \param searchcenter  position around which neighbours are searched
 *  \param hsml          search radius of the centre; combined per particle
 *                       with P[p].DM_Hsml by taking the larger of the two
 *  \param target        index stored in the export tables when the walk hits
 *                       a pseudo particle; a negative value disables export
 *  \param startnode     in: node at which the walk starts; out: set to -1
 *                       on every normal return
 *  \param mode          0 for a local particle, 1 for an imported one; in
 *                       mode 1 a pseudo particle triggers endrun(12312) and a
 *                       node flagged BITFLAG_TOPLEVEL ends the walk
 *  \param hmax          out: reset to 0, and set to the distance of the
 *                       All.DesNumNgb-th closest neighbour only in the
 *                       fully local case handled at the end
 *  \param nexport       in/out: number of used entries in DataIndexTable
 *  \param nsend_local   in/out: per-task count of exported entries
 *
 *  \return number of neighbours stored in Ngblist[], or -1 if the export
 *          buffer (All.BunchSize entries) ran full; in that case *nexport is
 *          rolled back to its value on entry and nsend_local[] is recomputed
 *          from the remaining DataIndexTable entries.
 */
int subfind_ngb_treefind_linkpairs(MyDouble searchcenter[3], double hsml, int target, int *startnode,
				   int mode, double *hmax, int *nexport, int *nsend_local)
{
  int numngb, i, no, p, task, nexport_save, exported = 0;
  struct NODE *current;
  double dx, dy, dz, dist, r2;

#ifdef PERIODIC
  /* not referenced directly below; presumably scratch space for the
     NGB_PERIODIC_LONG_* macros -- confirm against their definition */
  MyDouble xtmp;
#endif

  /* remember the fill level of the export table so that a partially
     exported particle can be rolled back if the buffer overflows */
  nexport_save = *nexport;

  *hmax = 0;
  numngb = 0;
  no = *startnode;

  while(no >= 0)
    {
      if(no < All.MaxPart)	/* single particle */
	{
	  p = no;
	  no = Nextnode[no];

	  /* skip particles whose type is not part of the selected type mask */
#ifdef DENSITY_SPLIT_BY_TYPE
	  if(!((1 << P[p].Type) & (DENSITY_SPLIT_BY_TYPE)))
#else
	  if(!((1 << P[p].Type) & (FOF_PRIMARY_LINK_TYPES)))
#endif
	    continue;

	  /* symmetric criterion: use the larger of the two smoothing lengths */
	  dist = DMAX(P[p].DM_Hsml, hsml);
	  /* cheap per-axis rejection before the full distance test.
	     NOTE(review): only the upper bound is tested, so the
	     NGB_PERIODIC_LONG_* macros presumably return a non-negative
	     separation -- confirm against their definition */
	  dx = NGB_PERIODIC_LONG_X(P[p].Pos[0] - searchcenter[0]);
	  if(dx > dist)
	    continue;
	  dy = NGB_PERIODIC_LONG_Y(P[p].Pos[1] - searchcenter[1]);
	  if(dy > dist)
	    continue;
	  dz = NGB_PERIODIC_LONG_Z(P[p].Pos[2] - searchcenter[2]);
	  if(dz > dist)
	    continue;
	  if((r2 = (dx * dx + dy * dy + dz * dz)) > dist * dist)
	    continue;

	  Dist2list[numngb] = r2;
	  Ngblist[numngb++] = p;
	}
      else
	{
	  if(no >= All.MaxPart + MaxNodes)	/* pseudo particle */
	    {
	      /* imported particles are not expected to reach a pseudo particle */
	      if(mode == 1)
		endrun(12312);

	      if(target >= 0)	/* if no target is given, export will not occur */
		{
		  exported = 1;
		  /* first time this target touches this task: force the
		     creation of a new export entry below */
		  if(Exportflag[task = DomainTask[no - (All.MaxPart + MaxNodes)]] != target)
		    {
		      Exportflag[task] = target;
		      Exportnodecount[task] = NODELISTLENGTH;
		    }

		  /* the current node list for this task is full (or does not
		     exist yet), so a new export entry is needed */
		  if(Exportnodecount[task] == NODELISTLENGTH)
		    {
		      if(*nexport >= All.BunchSize)
			{
			  /* export buffer full: discard everything exported
			     for this particle and rebuild the send counts */
			  *nexport = nexport_save;
			  if(nexport_save == 0)
			    endrun(13004);	/* in this case, the buffer is too small to process even a single particle */
			  for(task = 0; task < NTask; task++)
			    nsend_local[task] = 0;
			  for(no = 0; no < nexport_save; no++)
			    nsend_local[DataIndexTable[no].Task]++;
			  return -1;
			}
		      Exportnodecount[task] = 0;
		      Exportindex[task] = *nexport;
		      DataIndexTable[*nexport].Task = task;
		      DataIndexTable[*nexport].Index = target;
		      DataIndexTable[*nexport].IndexGet = *nexport;
		      *nexport = *nexport + 1;
		      nsend_local[task]++;
		    }

		  /* append the node to the list of this export entry */
		  DataNodeList[Exportindex[task]].NodeList[Exportnodecount[task]++] =
		    DomainNodeIndex[no - (All.MaxPart + MaxNodes)];

		  /* terminate the list with -1 while it is not yet full */
		  if(Exportnodecount[task] < NODELISTLENGTH)
		    DataNodeList[Exportindex[task]].NodeList[Exportnodecount[task]] = -1;
		}

	      /* Nextnode[] is indexed with an offset of MaxNodes for pseudo particles */
	      no = Nextnode[no - MaxNodes];
	      continue;
	    }

	  current = &Nodes[no];

	  if(mode == 1)
	    {
	      if(current->u.d.bitflags & (1 << BITFLAG_TOPLEVEL))	/* we reached a top-level node again, which means that we are done with the branch */
		{
		  *startnode = -1;
		  return numngb;
		}
	    }

	  /* node rejection test: largest smoothing length recorded for the
	     node (Extnodes[no].hmax) or hsml, plus half the node side length */
	  dist = DMAX(Extnodes[no].hmax, hsml) + 0.5 * current->len;
	  no = current->u.d.sibling;	/* in case the node can be discarded */
	  dx = NGB_PERIODIC_LONG_X(current->center[0] - searchcenter[0]);
	  if(dx > dist)
	    continue;
	  dy = NGB_PERIODIC_LONG_Y(current->center[1] - searchcenter[1]);
	  if(dy > dist)
	    continue;
	  dz = NGB_PERIODIC_LONG_Z(current->center[2] - searchcenter[2]);
	  if(dz > dist)
	    continue;
	  /* now test against the minimal sphere enclosing everything */
	  dist += FACT1 * current->len;
	  if(dx * dx + dy * dy + dz * dz > dist * dist)
	    continue;

	  no = current->u.d.nextnode;	/* ok, we need to open the node */
	}
    }


  /* If the search was entirely local and found at least All.DesNumNgb
     neighbours, keep only the All.DesNumNgb closest ones and report the
     distance of the farthest of them through *hmax. */
  if(mode == 0)			/* local particle */
    if(exported == 0)		/* completely local */
      if(numngb >= All.DesNumNgb)
	{
	  R2list = mymalloc("	  R2list", sizeof(struct r2data) * numngb);
	  for(i = 0; i < numngb; i++)
	    {
	      R2list[i].index = Ngblist[i];
	      R2list[i].r2 = Dist2list[i];
	    }

	  /* order by squared distance (subfind_ngb_compare_dist) */
#ifdef OMP_SORT
	  omp_qsort(R2list, numngb, sizeof(struct r2data), subfind_ngb_compare_dist);
#else
	  qsort(R2list, numngb, sizeof(struct r2data), subfind_ngb_compare_dist);
#endif

	  *hmax = sqrt(R2list[All.DesNumNgb - 1].r2);
	  numngb = All.DesNumNgb;

	  /* copy the truncated, sorted list back into the global arrays */
	  for(i = 0; i < numngb; i++)
	    {
	      Ngblist[i] = R2list[i].index;
	      Dist2list[i] = R2list[i].r2;
	    }

	  myfree(R2list);
	}


  /* signal to the caller that the walk is complete */
  *startnode = -1;
  return numngb;
}
/* Example #2
 * 0
 */
/*
 * sample_sort -- parallel sample sort of an int array over an MPI communicator.
 *
 * Parameters:
 *   input        array of total_elems ints. It is only read and written on
 *                rank 0 of comm (root buffer of the scatter and the gather).
 *   total_elems  number of ints in input.
 *   size         ignored: elements are always handled as int / MPI_INT.
 *   compar       ignored: the file-level comparator() is always used, and the
 *                bucket partitioning below compares ints with '<'.
 *   comm         communicator used for every collective call.
 *   num_procs    number of participating ranks; presumably the size of
 *                comm -- confirm against the caller.
 *
 * The routine is collective: every rank of comm has to call it.
 *
 * Side effects:
 *   - assigns the file-level flag is_root;
 *   - on root, prints the timing (and the sorted data if verbose is set);
 *   - on root, frees input before returning.
 *     NOTE(review): this means the sorted data is not available to the
 *     caller afterwards; the ownership transfer is kept unchanged here
 *     because callers may rely on it.
 *
 * Limitations kept from the original implementation:
 *   - only num_procs * (total_elems / num_procs) elements take part in the
 *     sort; a remainder at the end of input is left where it is;
 *   - the splitter selection reads local_input[], so with more than one rank
 *     total_elems is assumed to be at least num_procs;
 *   - allocation results are not checked.
 */
void sample_sort(void* input, size_t total_elems, size_t size,
				 int (*compar)(const void*, const void*),
				 MPI_Comm comm, int num_procs)
{
	int my_rank, root = 0;
	int elems_per_block, elems_to_sort;
	int slot_len;
	int *local_input;
	int *splitters, *all_splitters;
	int *buckets, *bucket_buffer, *bucket_local;
	int *recv_counts = NULL, *recv_displs = NULL;
	int i, j, k;
	size_t idx;
	int *total_input = (int *)input;
	double start_time, end_time;

	/* part of the qsort-like signature, but not used by this implementation */
	(void)size;
	(void)compar;

	start_time = MPI_Wtime();
	MPI_Comm_rank(comm, &my_rank);
	is_root = (my_rank == root);

	/* STEP 1:
	 * Each processor gets n/p elements as local input
	 */
	elems_per_block = total_elems / num_procs;
	/* one bucket slot = 1 count entry + up to n/p elements */
	slot_len = elems_per_block + 1;
	local_input = (int *)malloc(elems_per_block * sizeof(int));

	MPI_Scatter(total_input, elems_per_block, MPI_INT, local_input,
		    elems_per_block, MPI_INT, root, comm);

	/* STEP 2:
	 * Each processor sorts its local data of n/p elements
	 */
	#if OMP
		omp_qsort(local_input, elems_per_block, sizeof(int), comparator);
	#else
		qsort((char *)local_input, elems_per_block, sizeof(int), comparator);
	#endif

	/* STEP 3:
	 * Each processor selects (p-1) splitters evenly
	 */
	splitters = (int *)malloc(sizeof(int) * (num_procs - 1));
	/* No OpenMP here: the number of procs is small, so the thread
	 * creation overhead would outweigh any gain.
	 */
	for (i = 0; i < (num_procs - 1); i++) {
		splitters[i] = local_input[total_elems / (num_procs * num_procs) * (i + 1)];
	}

	/* STEP 4:
	 * All processors send their chosen (p-1) splitters to ROOT.
	 * Note that there will be p(p-1) splitters in total.
	 */
	all_splitters = (int *)malloc(sizeof(int) * num_procs * (num_procs - 1));
	MPI_Gather(splitters, num_procs - 1, MPI_INT, all_splitters, num_procs - 1,
		   MPI_INT, root, comm);

	/* STEP 5:
	 * ROOT processor sorts the list of splitters received and
	 * generates (p-1) global splitters. This global list of splitters
	 * is sent to every processor.
	 */
	if (is_root) {
		/* Only p(p-1) values: plain qsort is enough here. */
		qsort((char *)all_splitters, num_procs * (num_procs - 1),
		      sizeof(int), comparator);

		for (i = 0; i < num_procs - 1; i++) {
			splitters[i] = all_splitters[(num_procs - 1) * (i + 1)];
		}
	}
	MPI_Bcast(splitters, num_procs - 1, MPI_INT, root, comm);

	/* STEP 6:
	 * Each processor creates p buckets locally and dumps its local data into
	 * corresponding buckets based on the splitters received from ROOT.
	 *
	 * Bucket j occupies slot_len ints: the element count followed by the
	 * elements. The array is zero-initialised so that buckets the loop
	 * never reaches carry a count of 0 instead of uninitialised memory.
	 */
	buckets = (int *)calloc((size_t)slot_len * num_procs, sizeof(int));

	j = 0;
	k = 1;

	/* local_input is sorted, so the bucket index j only ever grows:
	 * close buckets until the element is below the current splitter
	 * (or the last bucket is reached), then append the element.
	 */
	for (i = 0; i < elems_per_block; i++) {
		while (j < (num_procs - 1) && local_input[i] >= splitters[j]) {
			buckets[slot_len * j] = k - 1;
			k = 1;
			j++;
		}
		buckets[slot_len * j + k++] = local_input[i];
	}

	/* store the count of the bucket that was being filled last */
	buckets[slot_len * j] = k - 1;

	/* STEP 7:
	 * Each processor sends its local buckets to the corresponding
	 * processors. The local-buckets will be of the form:
	 * --------------------------
	 * |count|elem1|elem2|elem3....
	 * --------------------------
	 * Note that the size will be n/p + 1 (extra one for the count part)
	 */
	bucket_buffer = (int *)malloc(sizeof(int) * (size_t)slot_len * num_procs);

	MPI_Alltoall(buckets, slot_len, MPI_INT, bucket_buffer,
		     slot_len, MPI_INT, comm);

	/* STEP 8:
	 * Each processor rearranges the data received from others.
	 * The receive buffer is sized from the counts actually received, so
	 * skewed input cannot overflow it.
	 */
	elems_to_sort = 0;
	for (j = 0; j < num_procs; j++)
		elems_to_sort += bucket_buffer[slot_len * j];

	/* allocate at least one element so the pointer is always valid */
	bucket_local = (int *)malloc(sizeof(int) * (elems_to_sort > 0 ? elems_to_sort : 1));

	k = 0;
	for (j = 0; j < num_procs; j++) {
		const int *slot = &bucket_buffer[slot_len * j];

		for (i = 0; i < slot[0]; i++)
			bucket_local[k++] = slot[1 + i];
	}

	#if OMP
		omp_qsort(bucket_local, elems_to_sort, sizeof(int), comparator);
	#else
		qsort((char *)bucket_local, elems_to_sort, sizeof(int), comparator);
	#endif

	/* STEP 9:
	 * ROOT first learns how many elements every rank holds, then gathers
	 * the sorted pieces back to back into the input array. Rank r only
	 * holds values between splitter r-1 and splitter r, so concatenating
	 * the pieces in rank order yields the sorted sequence.
	 */
	if (is_root) {
		recv_counts = (int *)malloc(sizeof(int) * num_procs);
		recv_displs = (int *)malloc(sizeof(int) * num_procs);
	}

	MPI_Gather(&elems_to_sort, 1, MPI_INT, recv_counts, 1, MPI_INT,
		   root, comm);

	if (is_root) {
		recv_displs[0] = 0;
		for (j = 1; j < num_procs; j++)
			recv_displs[j] = recv_displs[j - 1] + recv_counts[j - 1];
	}

	MPI_Gatherv(bucket_local, elems_to_sort, MPI_INT, total_input,
		    recv_counts, recv_displs, MPI_INT, root, comm);

	end_time = MPI_Wtime();

	/* STEP 10:
	 * Report on ROOT and release the input array
	 */
	if (is_root) {
		if (verbose) {
			printf("Elements to be sorted : %zu \n", total_elems);
			printf("Sorted output is:\n\n");
			for (idx = 0; idx < total_elems; idx++) {
				printf("%d\n", total_input[idx]);
			}
			printf(" \n ");
		}

		printf("=== sample_sort time = %lf ===\n", (end_time - start_time));

		free(total_input);
	}

	/* Clean up ourselves (the root-only pointers are NULL elsewhere) */
	free(recv_displs);
	free(recv_counts);
	free(local_input);
	free(splitters);
	free(all_splitters);
	free(buckets);
	free(bucket_buffer);
	free(bucket_local);
}
/*! Iteratively adjusts P[i].DM_Hsml for the first NumPartGroup particles
 *  until each of them has P[i].DM_NumNgb equal to All.DesLinkNgb, or its
 *  bracketing interval [Left, Right] has become narrower than 1.0e-3
 *  relative to Left.
 *
 *  Each pass evaluates the particles still marked in Todo[] through
 *  subfind_linkngb_evaluate(), exporting particles to other tasks in
 *  batches of at most All.BunchSize entries and adding the neighbour counts
 *  returned by them. Afterwards the smoothing length of every particle that
 *  has not converged is updated by bisection, and the pass is repeated while
 *  any task has such particles left. More than MAXITER passes end the run
 *  via endrun(1155).
 *
 *  All.DesNumNgb is overwritten with All.DesLinkNgb for the duration of the
 *  call and restored before returning. The routine contains collective MPI
 *  calls on MPI_COMM_WORLD.
 */
void subfind_find_linkngb(void)
{
  long long ntot;
  int i, j, ndone, ndone_flag, npleft, dummy, iter = 0, save_DesNumNgb;
  MyFloat *Left, *Right;
  char *Todo;
  int ngrp, recvTask, place, nexport, nimport;
  double t0, t1;


  if(ThisTask == 0)
    printf("Start find_linkngb (%d particles on task=%d)\n", NumPartGroup, ThisTask);

  save_DesNumNgb = All.DesNumNgb;
  All.DesNumNgb = All.DesLinkNgb;	/* for simplicity, reset this value */


  /* allocate buffers to arrange communication */

  Ngblist = (int *) mymalloc("Ngblist", NumPartGroup * sizeof(int));
  Dist2list = (double *) mymalloc("Dist2list", NumPartGroup * sizeof(double));

  /* number of export entries that fit into All.BufferSize megabytes, counting
     per entry the index and node-list records plus the in/out payloads */
  All.BunchSize =
    (int) ((All.BufferSize * 1024 * 1024) / (sizeof(struct data_index) + sizeof(struct data_nodelist) +
					     sizeof(struct linkngbdata_in) + sizeof(struct linkngbdata_out) +
					     sizemax(sizeof(struct linkngbdata_in),
						     sizeof(struct linkngbdata_out))));
  DataIndexTable =
    (struct data_index *) mymalloc("DataIndexTable", All.BunchSize * sizeof(struct data_index));
  DataNodeList =
    (struct data_nodelist *) mymalloc("DataNodeList", All.BunchSize * sizeof(struct data_nodelist));

  /* Left/Right: lower and upper bracket of the smoothing length per
     particle (0 means "not known yet"); Todo: 1 while not converged */
  Left = mymalloc("Left", sizeof(MyFloat) * NumPartGroup);
  Right = mymalloc("Right", sizeof(MyFloat) * NumPartGroup);
  Todo = mymalloc("Todo", sizeof(char) * NumPartGroup);

  for(i = 0; i < NumPartGroup; i++)
    {
      Left[i] = Right[i] = 0;
      Todo[i] = 1;
    }

  /* we will repeat the whole thing for those particles where we didn't find enough neighbours */
  do
    {
      t0 = second();

      i = 0;			/* begin with this index */

      /* one turn of this loop handles one export-buffer load of particles */
      do
	{
	  for(j = 0; j < NTask; j++)
	    {
	      Send_count[j] = 0;
	      Exportflag[j] = -1;
	    }

	  /* do local particles and prepare export list */

	  /* a negative return value means the export buffer is full; i then
	     stays at the particle that has to be redone in the next turn */
	  for(nexport = 0; i < NumPartGroup; i++)
	    {
	      if(Todo[i])
		{
		  if(subfind_linkngb_evaluate(i, 0, &nexport, Send_count) < 0)
		    break;
		}
	    }

	  /* order the export entries with data_index_compare; the offsets
	     computed below assume entries are grouped by destination task */
#ifdef OMP_SORT
	  omp_qsort(DataIndexTable, nexport, sizeof(struct data_index), data_index_compare);
#else
	  qsort(DataIndexTable, nexport, sizeof(struct data_index), data_index_compare);
#endif

	  /* tell every task how many entries it will receive from us */
	  MPI_Alltoall(Send_count, 1, MPI_INT, Recv_count, 1, MPI_INT, MPI_COMM_WORLD);

	  /* total number of imports and prefix sums of the counts */
	  for(j = 0, nimport = 0, Recv_offset[0] = 0, Send_offset[0] = 0; j < NTask; j++)
	    {
	      nimport += Recv_count[j];

	      if(j > 0)
		{
		  Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1];
		  Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1];
		}
	    }

	  LinkngbDataGet =
	    (struct linkngbdata_in *) mymalloc("	  LinkngbDataGet",
					       nimport * sizeof(struct linkngbdata_in));
	  LinkngbDataIn =
	    (struct linkngbdata_in *) mymalloc("	  LinkngbDataIn",
					       nexport * sizeof(struct linkngbdata_in));

	  /* prepare particle data for export */
	  for(j = 0; j < nexport; j++)
	    {
	      place = DataIndexTable[j].Index;

	      LinkngbDataIn[j].Pos[0] = P[place].Pos[0];
	      LinkngbDataIn[j].Pos[1] = P[place].Pos[1];
	      LinkngbDataIn[j].Pos[2] = P[place].Pos[2];
	      LinkngbDataIn[j].DM_Hsml = P[place].DM_Hsml;

	      /* IndexGet still refers to the position before sorting, which is
	         where the node list of this entry was stored */
	      memcpy(LinkngbDataIn[j].NodeList,
		     DataNodeList[DataIndexTable[j].IndexGet].NodeList, NODELISTLENGTH * sizeof(int));
	    }

	  /* exchange particle data */
	  /* pairwise exchange: in step ngrp the partner is ThisTask ^ ngrp */
	  for(ngrp = 1; ngrp < (1 << PTask); ngrp++)
	    {
	      recvTask = ThisTask ^ ngrp;

	      if(recvTask < NTask)
		{
		  if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
		    {
		      /* get the particles */
		      MPI_Sendrecv(&LinkngbDataIn[Send_offset[recvTask]],
				   Send_count[recvTask] * sizeof(struct linkngbdata_in), MPI_BYTE,
				   recvTask, TAG_DENS_A,
				   &LinkngbDataGet[Recv_offset[recvTask]],
				   Recv_count[recvTask] * sizeof(struct linkngbdata_in), MPI_BYTE,
				   recvTask, TAG_DENS_A, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
		    }
		}
	    }

	  /* LinkngbDataIn was allocated last, so it is released first; the
	     buffers in this loop are always freed in reverse allocation order */
	  myfree(LinkngbDataIn);
	  LinkngbDataResult =
	    (struct linkngbdata_out *) mymalloc("	  LinkngbDataResult",
						nimport * sizeof(struct linkngbdata_out));
	  LinkngbDataOut =
	    (struct linkngbdata_out *) mymalloc("	  LinkngbDataOut",
						nexport * sizeof(struct linkngbdata_out));


	  /* now do the particles that were sent to us */
	  /* NOTE(review): dummy is uninitialised and passed for both the
	     export counter and the send-count array; presumably mode 1 never
	     touches them -- confirm in subfind_linkngb_evaluate() */
	  for(j = 0; j < nimport; j++)
	    subfind_linkngb_evaluate(j, 1, &dummy, &dummy);

	  /* this task is done once all of its particles have been processed */
	  if(i >= NumPartGroup)
	    ndone_flag = 1;
	  else
	    ndone_flag = 0;

	  /* ndone = number of tasks that are done */
	  MPI_Allreduce(&ndone_flag, &ndone, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

	  /* get the result */
	  for(ngrp = 1; ngrp < (1 << PTask); ngrp++)
	    {
	      recvTask = ThisTask ^ ngrp;
	      if(recvTask < NTask)
		{
		  if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
		    {
		      /* send the results */
		      MPI_Sendrecv(&LinkngbDataResult[Recv_offset[recvTask]],
				   Recv_count[recvTask] * sizeof(struct linkngbdata_out),
				   MPI_BYTE, recvTask, TAG_DENS_B,
				   &LinkngbDataOut[Send_offset[recvTask]],
				   Send_count[recvTask] * sizeof(struct linkngbdata_out),
				   MPI_BYTE, recvTask, TAG_DENS_B, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
		    }
		}
	    }

	  /* add the result to the local particles */
	  for(j = 0; j < nexport; j++)
	    {
	      place = DataIndexTable[j].Index;

	      P[place].DM_NumNgb += LinkngbDataOut[j].Ngb;
	    }


	  myfree(LinkngbDataOut);
	  myfree(LinkngbDataResult);
	  myfree(LinkngbDataGet);
	}
      while(ndone < NTask);

      /* do final operations on results */
      for(i = 0, npleft = 0; i < NumPartGroup; i++)
	{
	  /* now check whether we had enough neighbours */
	  if(Todo[i])
	    {
	      /* not converged: the count differs from the target and the
	         bracket is either incomplete or still wider than the tolerance */
	      if(P[i].DM_NumNgb != All.DesLinkNgb &&
		 ((Right[i] - Left[i]) > 1.0e-3 * Left[i] || Left[i] == 0 || Right[i] == 0))
		{
		  /* need to redo this particle */
		  npleft++;

		  /* too few neighbours: the current value is a lower bound;
		     otherwise it is an upper bound */
		  if(P[i].DM_NumNgb < All.DesLinkNgb)
		    Left[i] = DMAX(P[i].DM_Hsml, Left[i]);
		  else
		    {
		      if(Right[i] != 0)
			{
			  if(P[i].DM_Hsml < Right[i])
			    Right[i] = P[i].DM_Hsml;
			}
		      else
			Right[i] = P[i].DM_Hsml;
		    }

		  /* diagnostics for the last passes before giving up */
		  if(iter >= MAXITER - 10)
		    {
		      printf
			("i=%d task=%d ID=%d DM_Hsml=%g Left=%g Right=%g Ngbs=%g Right-Left=%g\n   pos=(%g|%g|%g)\n",
			 i, ThisTask, (int) P[i].ID, P[i].DM_Hsml, Left[i], Right[i],
			 (double) P[i].DM_NumNgb, Right[i] - Left[i], P[i].Pos[0], P[i].Pos[1], P[i].Pos[2]);
		      fflush(stdout);
		    }

		  /* both bounds known: bisect in the cube of the length */
		  if(Right[i] > 0 && Left[i] > 0)
		    P[i].DM_Hsml = pow(0.5 * (pow(Left[i], 3) + pow(Right[i], 3)), 1.0 / 3);
		  else
		    {
		      if(Right[i] == 0 && Left[i] == 0)
			endrun(8189);	/* can't occur */

		      /* only one bound known: grow or shrink by a factor of
		         1.26, which is about 2^(1/3) */
		      if(Right[i] == 0 && Left[i] > 0)
			P[i].DM_Hsml *= 1.26;

		      if(Right[i] > 0 && Left[i] == 0)
			P[i].DM_Hsml /= 1.26;
		    }
		}
	      else
		Todo[i] = 0;
	    }
	}


      /* ntot = number of unconverged particles summed over all tasks */
      sumup_large_ints(1, &npleft, &ntot);

      t1 = second();

      if(ntot > 0)
	{
	  iter++;

	  if(iter > 0 && ThisTask == 0)
	    {
	      /* ntot is printed as two ints split at 10^9 */
	      printf("find linkngb iteration %d: need to repeat for %d%09d particles. (took %g sec)\n", iter,
		     (int) (ntot / 1000000000), (int) (ntot % 1000000000), timediff(t0, t1));
	      fflush(stdout);
	    }

	  if(iter > MAXITER)
	    {
	      /* NOTE(review): the message names density(), but this is
	         subfind_find_linkngb() -- looks like a copy-paste leftover */
	      printf("failed to converge in neighbour iteration in density()\n");
	      fflush(stdout);
	      endrun(1155);
	    }
	}
    }
  while(ntot > 0);

  /* release everything in reverse order of allocation */
  myfree(Todo);
  myfree(Right);
  myfree(Left);

  myfree(DataNodeList);
  myfree(DataIndexTable);

  myfree(Dist2list);
  myfree(Ngblist);

  All.DesNumNgb = save_DesNumNgb;	/* restore it */
}