/*! Tree walk that collects all neighbours `j' of a search position that
 *  can interact with it, i.e. for which \f$ r_{ij} < h_i \f$ OR
 *  \f$ r_{ij} < h_j \f$. In the code this is expressed by testing the
 *  distance against the larger of `hsml' and the candidate's own DM_Hsml.
 *
 *  Accepted particles are appended to the global Ngblist (indices) and
 *  Dist2list (squared distances).
 *
 *  \param searchcenter  position around which neighbours are searched
 *  \param hsml          search radius of the target
 *  \param target        index recorded in DataIndexTable[].Index when the
 *                       walk has to be exported; a negative value disables
 *                       the export bookkeeping
 *  \param startnode     in: node at which the walk starts; out: set to -1
 *                       on every non-negative return
 *  \param mode          0 for a local particle; in mode 1 a pseudo particle
 *                       triggers endrun(12312) and a top-level node ends
 *                       the walk
 *  \param hmax          out: 0, unless mode == 0, nothing was exported and
 *                       at least All.DesNumNgb neighbours were found, in
 *                       which case it is the distance to the
 *                       All.DesNumNgb-th nearest of them
 *  \param nexport       in/out: number of used entries in DataIndexTable
 *  \param nsend_local   in/out: per-task count of export entries
 *
 *  \return number of neighbours stored in Ngblist/Dist2list, or -1 if the
 *          export table is full (All.BunchSize entries); in that case
 *          *nexport and nsend_local are rolled back to their state on entry.
 */
int subfind_ngb_treefind_linkpairs(MyDouble searchcenter[3], double hsml, int target, int *startnode, int mode, double *hmax, int *nexport, int *nsend_local)
{
  int numngb, i, no, p, task, nexport_save, exported = 0;
  struct NODE *current;
  double dx, dy, dz, dist, r2;

#ifdef PERIODIC
  /* not referenced directly below; presumably scratch storage for the
     NGB_PERIODIC_LONG_* macros -- TODO confirm against their definition */
  MyDouble xtmp;
#endif

  /* remembered so that a failed export can be undone */
  nexport_save = *nexport;

  *hmax = 0;
  numngb = 0;
  no = *startnode;

  while(no >= 0)
    {
      if(no < All.MaxPart)      /* single particle */
        {
          p = no;
          no = Nextnode[no];

          /* skip particles whose type is not selected by the bit mask */
#ifdef DENSITY_SPLIT_BY_TYPE
          if(!((1 << P[p].Type) & (DENSITY_SPLIT_BY_TYPE)))
#else
          if(!((1 << P[p].Type) & (FOF_PRIMARY_LINK_TYPES)))
#endif
            continue;

          /* symmetric criterion: use the larger of the two radii */
          dist = DMAX(P[p].DM_Hsml, hsml);

          /* cheap per-axis rejection before the full distance test */
          dx = NGB_PERIODIC_LONG_X(P[p].Pos[0] - searchcenter[0]);
          if(dx > dist)
            continue;
          dy = NGB_PERIODIC_LONG_Y(P[p].Pos[1] - searchcenter[1]);
          if(dy > dist)
            continue;
          dz = NGB_PERIODIC_LONG_Z(P[p].Pos[2] - searchcenter[2]);
          if(dz > dist)
            continue;
          if((r2 = (dx * dx + dy * dy + dz * dz)) > dist * dist)
            continue;

          Dist2list[numngb] = r2;
          Ngblist[numngb++] = p;
        }
      else
        {
          if(no >= All.MaxPart + MaxNodes)      /* pseudo particle */
            {
              if(mode == 1)
                endrun(12312);

              if(target >= 0)   /* if no target is given, export will not occur */
                {
                  exported = 1;

                  /* first time this target touches `task': force a new export entry */
                  if(Exportflag[task = DomainTask[no - (All.MaxPart + MaxNodes)]] != target)
                    {
                      Exportflag[task] = target;
                      Exportnodecount[task] = NODELISTLENGTH;
                    }

                  /* current node list is full (or not yet started): open a new entry */
                  if(Exportnodecount[task] == NODELISTLENGTH)
                    {
                      if(*nexport >= All.BunchSize)
                        {
                          /* table full: discard this target's entries and
                             rebuild the per-task counts from what remains */
                          *nexport = nexport_save;
                          if(nexport_save == 0)
                            endrun(13004);      /* in this case, the buffer is too small to process even a single particle */
                          for(task = 0; task < NTask; task++)
                            nsend_local[task] = 0;
                          for(no = 0; no < nexport_save; no++)
                            nsend_local[DataIndexTable[no].Task]++;
                          return -1;
                        }
                      Exportnodecount[task] = 0;
                      Exportindex[task] = *nexport;
                      DataIndexTable[*nexport].Task = task;
                      DataIndexTable[*nexport].Index = target;
                      DataIndexTable[*nexport].IndexGet = *nexport;
                      *nexport = *nexport + 1;
                      nsend_local[task]++;
                    }

                  DataNodeList[Exportindex[task]].NodeList[Exportnodecount[task]++] = DomainNodeIndex[no - (All.MaxPart + MaxNodes)];

                  /* mark the end of a partially filled node list with -1 */
                  if(Exportnodecount[task] < NODELISTLENGTH)
                    DataNodeList[Exportindex[task]].NodeList[Exportnodecount[task]] = -1;
                }

              no = Nextnode[no - MaxNodes];
              continue;
            }

          current = &Nodes[no];

          if(mode == 1)
            {
              if(current->u.d.bitflags & (1 << BITFLAG_TOPLEVEL))       /* we reached a top-level node again, which means that we are done with the branch */
                {
                  *startnode = -1;
                  return numngb;
                }
            }

          /* the node's own hmax takes part in the symmetric criterion */
          dist = DMAX(Extnodes[no].hmax, hsml) + 0.5 * current->len;

          no = current->u.d.sibling;    /* in case the node can be discarded */

          dx = NGB_PERIODIC_LONG_X(current->center[0] - searchcenter[0]);
          if(dx > dist)
            continue;
          dy = NGB_PERIODIC_LONG_Y(current->center[1] - searchcenter[1]);
          if(dy > dist)
            continue;
          dz = NGB_PERIODIC_LONG_Z(current->center[2] - searchcenter[2]);
          if(dz > dist)
            continue;

          /* now test against the minimal sphere enclosing everything */
          dist += FACT1 * current->len;
          if(dx * dx + dy * dy + dz * dz > dist * dist)
            continue;

          no = current->u.d.nextnode;   /* ok, we need to open the node */
        }
    }

  /* Only when the search was answered entirely on this task can the list be
     trimmed here: keep the All.DesNumNgb nearest neighbours and report the
     distance to the farthest of them through *hmax. */
  if(mode == 0)                 /* local particle */
    if(exported == 0)           /* completely local */
      if(numngb >= All.DesNumNgb)
        {
          R2list = mymalloc(" R2list", sizeof(struct r2data) * numngb);
          for(i = 0; i < numngb; i++)
            {
              R2list[i].index = Ngblist[i];
              R2list[i].r2 = Dist2list[i];
            }

#ifdef OMP_SORT
          omp_qsort(R2list, numngb, sizeof(struct r2data), subfind_ngb_compare_dist);
#else
          qsort(R2list, numngb, sizeof(struct r2data), subfind_ngb_compare_dist);
#endif

          *hmax = sqrt(R2list[All.DesNumNgb - 1].r2);
          numngb = All.DesNumNgb;

          /* write the sorted, truncated list back to the global arrays */
          for(i = 0; i < numngb; i++)
            {
              Ngblist[i] = R2list[i].index;
              Dist2list[i] = R2list[i].r2;
            }

          myfree(R2list);
        }

  *startnode = -1;
  return numngb;
}
/* Allocates a zero-filled array of `count` ints for sample_sort.
 * At least one element is requested so a zero count never yields NULL;
 * the communicator is aborted if the allocation fails. */
static int *sample_sort_alloc_ints(size_t count, MPI_Comm comm)
{
  int *buf = calloc(count > 0 ? count : 1, sizeof *buf);

  if (buf == NULL) {
    fprintf(stderr, "sample_sort: out of memory\n");
    MPI_Abort(comm, 1);
  }
  return buf;
}

/* sample_sort routine
 *
 * Parallel sample sort of an array of `int` that lives on rank 0 of `comm`.
 * Must be called collectively by all `num_procs` ranks of `comm` with the
 * same `total_elems`, `size` and `num_procs`.
 *
 *   input        on rank 0: the array to sort; it is overwritten with the
 *                sorted result and stays owned by the caller (it is NOT
 *                freed here). Ignored on the other ranks.
 *   total_elems  number of elements in `input`.
 *   size         element size in bytes; only sizeof(int) is supported,
 *                anything else aborts the communicator.
 *   compar       qsort-style comparison function for two ints; it defines
 *                the order used for local sorts, splitters and bucketing.
 *   comm         communicator used for every collective operation.
 *   num_procs    number of ranks in `comm`.
 *
 * If total_elems < num_procs, rank 0 sorts the array serially and no
 * communication takes place. If total_elems is not a multiple of num_procs,
 * the leftover tail is folded in by a final serial sort on rank 0.
 *
 * Relies on the file-level `is_root` and `verbose` flags.
 */
void sample_sort(void* input, size_t total_elems, size_t size, int (*compar)(const void*, const void*), MPI_Comm comm, int num_procs)
{
  const int root = 0;
  int my_rank;
  int elems_per_block, elems_to_sort, stride;
  int *local_input, *splitters, *all_splitters;
  int *buckets, *bucket_buffer, *bucket_local;
  int *recv_counts = NULL, *recv_displs = NULL;
  int *total_input = (int *)input;
  int i, j, filled;
  size_t sample_step, remainder, n;
  double start_time, end_time;

  /* All buffers and MPI datatypes below are int-based, so other element
   * sizes cannot be handled; fail loudly instead of corrupting memory. */
  if (size != sizeof(int) || num_procs < 1) {
    fprintf(stderr, "sample_sort: unsupported element size %zu or num_procs %d\n", size, num_procs);
    MPI_Abort(comm, 1);
    return;
  }

  start_time = MPI_Wtime();

  MPI_Comm_rank(comm, &my_rank);
  is_root = (my_rank == root);

  elems_per_block = (int)(total_elems / (size_t)num_procs);
  remainder = total_elems % (size_t)num_procs;

  /* Fewer elements than ranks: nothing to distribute. Every rank takes
   * this branch, so skipping the collectives is consistent. */
  if (elems_per_block == 0) {
    if (is_root && total_elems > 1)
      qsort(total_input, total_elems, sizeof(int), compar);
    return;
  }

  /* STEP 1:
   * Each processor gets n/p elements as local input
   */
  local_input = sample_sort_alloc_ints((size_t)elems_per_block, comm);
  MPI_Scatter(total_input, elems_per_block, MPI_INT,
              local_input, elems_per_block, MPI_INT, root, comm);

  /* STEP 2:
   * Each processor sorts its local data of n/p elements
   */
#if OMP
  omp_qsort(local_input, elems_per_block, sizeof(int), compar);
#else
  qsort(local_input, elems_per_block, sizeof(int), compar);
#endif

  /* STEP 3:
   * Each processor selects (p-1) splitters evenly.
   * The largest index used is (n/p^2)*(p-1), which is below n/p.
   */
  splitters = sample_sort_alloc_ints((size_t)(num_procs - 1), comm);
  sample_step = total_elems / ((size_t)num_procs * (size_t)num_procs);
  for (i = 0; i < (num_procs - 1); i++)
    splitters[i] = local_input[sample_step * (size_t)(i + 1)];

  /* STEP 4:
   * All processors send their chosen (p-1) splitters to ROOT.
   * Note that there will be p(p-1) splitters in total.
   */
  all_splitters = sample_sort_alloc_ints((size_t)num_procs * (size_t)(num_procs - 1), comm);
  MPI_Gather(splitters, num_procs - 1, MPI_INT,
             all_splitters, num_procs - 1, MPI_INT, root, comm);

  /* STEP 5:
   * ROOT processor sorts the list of splitters received and
   * generates (p-1) global splitters. This global list of splitters
   * is sent to every processor.
   */
  if (is_root) {
    qsort(all_splitters, (size_t)num_procs * (size_t)(num_procs - 1), sizeof(int), compar);
    for (i = 0; i < num_procs - 1; i++)
      splitters[i] = all_splitters[(num_procs - 1) * (i + 1)];
  }
  MPI_Bcast(splitters, num_procs - 1, MPI_INT, root, comm);

  /* STEP 6:
   * Each processor creates p buckets locally and dumps its local data into
   * corresponding buckets based on the splitters received from ROOT.
   * Bucket j occupies `stride` ints: a count followed by up to n/p elements.
   * The buffer is zero-filled, so buckets that receive nothing carry a
   * valid count of 0.
   */
  stride = elems_per_block + 1;
  buckets = sample_sort_alloc_ints((size_t)stride * (size_t)num_procs, comm);
  j = 0;
  for (i = 0; i < elems_per_block; i++) {
    /* local_input is sorted, so the destination bucket only moves forward */
    while (j < num_procs - 1 && compar(&local_input[i], &splitters[j]) >= 0)
      j++;
    n = (size_t)stride * (size_t)j;
    buckets[n + 1 + (size_t)buckets[n]] = local_input[i];
    buckets[n]++;
  }

  /* STEP 7:
   * Each processor sends its local buckets to the corresponding
   * processors. The local-buckets will be of the form:
   * --------------------------
   * |count|elem1|elem2|elem3....
   * --------------------------
   * Note that the size will be n/p + 1 (extra one for the count part)
   */
  bucket_buffer = sample_sort_alloc_ints((size_t)stride * (size_t)num_procs, comm);
  MPI_Alltoall(buckets, stride, MPI_INT,
               bucket_buffer, stride, MPI_INT, comm);

  /* STEP 8:
   * Each processor rearranges the data received from others.
   * The receive buffer is sized from the actual counts, because skewed or
   * duplicate-heavy input can deliver more than 2n/p elements to one rank.
   */
  elems_to_sort = 0;
  for (j = 0; j < num_procs; j++)
    elems_to_sort += bucket_buffer[(size_t)stride * (size_t)j];

  bucket_local = sample_sort_alloc_ints((size_t)elems_to_sort, comm);
  filled = 0;
  for (j = 0; j < num_procs; j++) {
    n = (size_t)stride * (size_t)j;
    for (i = 0; i < bucket_buffer[n]; i++)
      bucket_local[filled++] = bucket_buffer[n + 1 + (size_t)i];
  }

#if OMP
  omp_qsort(bucket_local, elems_to_sort, sizeof(int), compar);
#else
  qsort(bucket_local, elems_to_sort, sizeof(int), compar);
#endif

  /* STEP 9:
   * ROOT learns how many elements each rank holds and gathers the sorted
   * pieces, in rank order, straight into the caller's array. The pieces
   * add up to p * (n/p) <= n elements, so they always fit.
   */
  if (is_root) {
    recv_counts = sample_sort_alloc_ints((size_t)num_procs, comm);
    recv_displs = sample_sort_alloc_ints((size_t)num_procs, comm);
  }
  MPI_Gather(&elems_to_sort, 1, MPI_INT, recv_counts, 1, MPI_INT, root, comm);
  if (is_root) {
    recv_displs[0] = 0;
    for (j = 1; j < num_procs; j++)
      recv_displs[j] = recv_displs[j - 1] + recv_counts[j - 1];
  }
  MPI_Gatherv(bucket_local, elems_to_sort, MPI_INT,
              total_input, recv_counts, recv_displs, MPI_INT, root, comm);

  end_time = MPI_Wtime();

  /* STEP 10:
   * Finish up on ROOT and report.
   */
  if (is_root) {
    /* The last n % p elements were never distributed; merge them in. */
    if (remainder > 0)
      qsort(total_input, total_elems, sizeof(int), compar);

    if (verbose) {
      printf("Elements to be sorted : %zu \n", total_elems);
      printf("Sorted output is:\n\n");
      for (n = 0; n < total_elems; n++) {
        printf("%d\n", total_input[n]);
      }
      printf(" \n ");
    }
    printf("=== sample_sort time = %lf ===\n", (end_time - start_time));
  }

  /* Clean up ourselves; `input` belongs to the caller and is left alone. */
  free(recv_displs);
  free(recv_counts);
  free(local_input);
  free(splitters);
  free(all_splitters);
  free(buckets);
  free(bucket_buffer);
  free(bucket_local);
}
/* Iteratively adjusts P[i].DM_Hsml for the first NumPartGroup particles
 * until P[i].DM_NumNgb equals All.DesLinkNgb, or until the bracket
 * [Left, Right] around the radius has become narrower than 1e-3 * Left.
 *
 * Each outer iteration evaluates all particles still flagged in Todo,
 * exporting them to other tasks where needed and adding the remote
 * neighbour counts to the local ones. The radius of every unconverged
 * particle is then updated by bisection and the pass is repeated.
 *
 * All.DesNumNgb is temporarily replaced by All.DesLinkNgb and restored
 * before returning. More than MAXITER iterations end in endrun(1155).
 */
void subfind_find_linkngb(void)
{
  long long ntot;
  int i, j, ndone, ndone_flag, npleft, dummy, iter = 0, save_DesNumNgb;
  MyFloat *Left, *Right;
  char *Todo;
  int ngrp, recvTask, place, nexport, nimport;
  double t0, t1;

  if(ThisTask == 0)
    printf("Start find_linkngb (%d particles on task=%d)\n", NumPartGroup, ThisTask);

  save_DesNumNgb = All.DesNumNgb;
  All.DesNumNgb = All.DesLinkNgb;       /* for simplicity, reset this value */

  /* allocate buffers to arrange communication */

  Ngblist = (int *) mymalloc("Ngblist", NumPartGroup * sizeof(int));
  Dist2list = (double *) mymalloc("Dist2list", NumPartGroup * sizeof(double));

  /* number of export entries that fit into All.BufferSize megabytes */
  All.BunchSize = (int) ((All.BufferSize * 1024 * 1024) / (sizeof(struct data_index) + sizeof(struct data_nodelist) + sizeof(struct linkngbdata_in) + sizeof(struct linkngbdata_out) + sizemax(sizeof(struct linkngbdata_in), sizeof(struct linkngbdata_out))));
  DataIndexTable = (struct data_index *) mymalloc("DataIndexTable", All.BunchSize * sizeof(struct data_index));
  DataNodeList = (struct data_nodelist *) mymalloc("DataNodeList", All.BunchSize * sizeof(struct data_nodelist));

  /* per-particle bisection bracket (0 means "bound not known yet") and a
     flag telling whether the particle still has to be processed */
  Left = mymalloc("Left", sizeof(MyFloat) * NumPartGroup);
  Right = mymalloc("Right", sizeof(MyFloat) * NumPartGroup);
  Todo = mymalloc("Todo", sizeof(char) * NumPartGroup);

  for(i = 0; i < NumPartGroup; i++)
    {
      Left[i] = Right[i] = 0;
      Todo[i] = 1;
    }

  /* we will repeat the whole thing for those particles where we didn't find enough neighbours */
  do
    {
      t0 = second();

      i = 0;                    /* begin with this index */

      /* inner loop: one pass per filled export buffer, until every task
         has worked through all of its particles */
      do
        {
          for(j = 0; j < NTask; j++)
            {
              Send_count[j] = 0;
              Exportflag[j] = -1;
            }

          /* do local particles and prepare export list */
          for(nexport = 0; i < NumPartGroup; i++)
            {
              if(Todo[i])
                {
                  /* a negative return stops the loop with i unchanged, so
                     this particle is picked up again in the next pass */
                  if(subfind_linkngb_evaluate(i, 0, &nexport, Send_count) < 0)
                    break;
                }
            }

          /* sort the export entries with data_index_compare; the send
             offsets below assume entries of one task are contiguous --
             presumably the comparator orders by task, TODO confirm */
#ifdef OMP_SORT
          omp_qsort(DataIndexTable, nexport, sizeof(struct data_index), data_index_compare);
#else
          qsort(DataIndexTable, nexport, sizeof(struct data_index), data_index_compare);
#endif

          /* tell every task how many entries it will receive from us */
          MPI_Alltoall(Send_count, 1, MPI_INT, Recv_count, 1, MPI_INT, MPI_COMM_WORLD);

          /* total import count and prefix-sum offsets into the buffers */
          for(j = 0, nimport = 0, Recv_offset[0] = 0, Send_offset[0] = 0; j < NTask; j++)
            {
              nimport += Recv_count[j];

              if(j > 0)
                {
                  Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1];
                  Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1];
                }
            }

          LinkngbDataGet = (struct linkngbdata_in *) mymalloc(" LinkngbDataGet", nimport * sizeof(struct linkngbdata_in));
          LinkngbDataIn = (struct linkngbdata_in *) mymalloc(" LinkngbDataIn", nexport * sizeof(struct linkngbdata_in));

          /* prepare particle data for export */
          for(j = 0; j < nexport; j++)
            {
              place = DataIndexTable[j].Index;

              LinkngbDataIn[j].Pos[0] = P[place].Pos[0];
              LinkngbDataIn[j].Pos[1] = P[place].Pos[1];
              LinkngbDataIn[j].Pos[2] = P[place].Pos[2];
              LinkngbDataIn[j].DM_Hsml = P[place].DM_Hsml;

              /* IndexGet still refers to the entry's slot before sorting */
              memcpy(LinkngbDataIn[j].NodeList, DataNodeList[DataIndexTable[j].IndexGet].NodeList, NODELISTLENGTH * sizeof(int));
            }

          /* exchange particle data */
          for(ngrp = 1; ngrp < (1 << PTask); ngrp++)
            {
              /* communication partner in this round: ThisTask XOR ngrp */
              recvTask = ThisTask ^ ngrp;

              if(recvTask < NTask)
                {
                  if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
                    {
                      /* get the particles */
                      MPI_Sendrecv(&LinkngbDataIn[Send_offset[recvTask]], Send_count[recvTask] * sizeof(struct linkngbdata_in), MPI_BYTE, recvTask, TAG_DENS_A, &LinkngbDataGet[Recv_offset[recvTask]], Recv_count[recvTask] * sizeof(struct linkngbdata_in), MPI_BYTE, recvTask, TAG_DENS_A, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                    }
                }
            }

          myfree(LinkngbDataIn);
          LinkngbDataResult = (struct linkngbdata_out *) mymalloc(" LinkngbDataResult", nimport * sizeof(struct linkngbdata_out));
          LinkngbDataOut = (struct linkngbdata_out *) mymalloc(" LinkngbDataOut", nexport * sizeof(struct linkngbdata_out));

          /* now do the particles that were sent to us */
          for(j = 0; j < nimport; j++)
            subfind_linkngb_evaluate(j, 1, &dummy, &dummy);

          /* this task is finished once its particle index reached the end */
          if(i >= NumPartGroup)
            ndone_flag = 1;
          else
            ndone_flag = 0;

          /* ndone = number of tasks that are finished */
          MPI_Allreduce(&ndone_flag, &ndone, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

          /* get the result */
          for(ngrp = 1; ngrp < (1 << PTask); ngrp++)
            {
              recvTask = ThisTask ^ ngrp;

              if(recvTask < NTask)
                {
                  if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0)
                    {
                      /* send the results */
                      MPI_Sendrecv(&LinkngbDataResult[Recv_offset[recvTask]], Recv_count[recvTask] * sizeof(struct linkngbdata_out), MPI_BYTE, recvTask, TAG_DENS_B, &LinkngbDataOut[Send_offset[recvTask]], Send_count[recvTask] * sizeof(struct linkngbdata_out), MPI_BYTE, recvTask, TAG_DENS_B, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
                    }
                }
            }

          /* add the result to the local particles */
          for(j = 0; j < nexport; j++)
            {
              place = DataIndexTable[j].Index;

              P[place].DM_NumNgb += LinkngbDataOut[j].Ngb;
            }

          /* released in reverse order of allocation */
          myfree(LinkngbDataOut);
          myfree(LinkngbDataResult);
          myfree(LinkngbDataGet);
        }
      while(ndone < NTask);

      /* do final operations on results */
      for(i = 0, npleft = 0; i < NumPartGroup; i++)
        {
          /* now check whether we had enough neighbours */
          if(Todo[i])
            {
              /* redo if the count is off and the bracket is either still
                 open on one side or not yet narrow enough */
              if(P[i].DM_NumNgb != All.DesLinkNgb && ((Right[i] - Left[i]) > 1.0e-3 * Left[i] || Left[i] == 0 || Right[i] == 0))
                {
                  /* need to redo this particle */
                  npleft++;

                  /* too few neighbours: current radius is a lower bound;
                     otherwise it is an upper bound */
                  if(P[i].DM_NumNgb < All.DesLinkNgb)
                    Left[i] = DMAX(P[i].DM_Hsml, Left[i]);
                  else
                    {
                      if(Right[i] != 0)
                        {
                          if(P[i].DM_Hsml < Right[i])
                            Right[i] = P[i].DM_Hsml;
                        }
                      else
                        Right[i] = P[i].DM_Hsml;
                    }

                  /* print diagnostics during the last iterations before the limit */
                  if(iter >= MAXITER - 10)
                    {
                      printf ("i=%d task=%d ID=%d DM_Hsml=%g Left=%g Right=%g Ngbs=%g Right-Left=%g\n pos=(%g|%g|%g)\n", i, ThisTask, (int) P[i].ID, P[i].DM_Hsml, Left[i], Right[i], (double) P[i].DM_NumNgb, Right[i] - Left[i], P[i].Pos[0], P[i].Pos[1], P[i].Pos[2]);
                      fflush(stdout);
                    }

                  /* both bounds known: take the radius whose cube is the
                     mean of the cubed bounds */
                  if(Right[i] > 0 && Left[i] > 0)
                    P[i].DM_Hsml = pow(0.5 * (pow(Left[i], 3) + pow(Right[i], 3)), 1.0 / 3);
                  else
                    {
                      if(Right[i] == 0 && Left[i] == 0)
                        endrun(8189);   /* can't occur */

                      /* only one bound known: grow or shrink by the fixed
                         factor 1.26 (about the cube root of 2) */
                      if(Right[i] == 0 && Left[i] > 0)
                        P[i].DM_Hsml *= 1.26;

                      if(Right[i] > 0 && Left[i] == 0)
                        P[i].DM_Hsml /= 1.26;
                    }
                }
              else
                Todo[i] = 0;
            }
        }

      /* ntot receives the combined npleft; presumably summed over all
         tasks -- TODO confirm against sumup_large_ints */
      sumup_large_ints(1, &npleft, &ntot);

      t1 = second();

      if(ntot > 0)
        {
          iter++;

          if(iter > 0 && ThisTask == 0)
            {
              /* ntot is printed as two ints split at 10^9 */
              printf("find linkngb iteration %d: need to repeat for %d%09d particles. (took %g sec)\n", iter, (int) (ntot / 1000000000), (int) (ntot % 1000000000), timediff(t0, t1));
              fflush(stdout);
            }

          if(iter > MAXITER)
            {
              printf("failed to converge in neighbour iteration in density()\n");
              fflush(stdout);
              endrun(1155);
            }
        }
    }
  while(ntot > 0);

  /* released in reverse order of allocation */
  myfree(Todo);
  myfree(Right);
  myfree(Left);
  myfree(DataNodeList);
  myfree(DataIndexTable);
  myfree(Dist2list);
  myfree(Ngblist);

  All.DesNumNgb = save_DesNumNgb;       /* restore it */
}