/* copy files to a partner node */ static int scr_reddesc_apply_partner( scr_filemap* map, const scr_reddesc* c, int id) { int rc = SCR_SUCCESS; /* get pointer to partner state structure */ scr_reddesc_partner* state = (scr_reddesc_partner*) c->copy_state; /* get a list of our files */ int numfiles = 0; char** files = NULL; scr_filemap_list_files(map, id, scr_my_rank_world, &numfiles, &files); /* first, determine how many files we'll be sending and receiving * with our partners */ MPI_Status status; int send_num = numfiles; int recv_num = 0; MPI_Sendrecv( &send_num, 1, MPI_INT, state->rhs_rank, 0, &recv_num, 1, MPI_INT, state->lhs_rank, 0, c->comm, &status ); /* record how many files our partner will send */ scr_filemap_set_expected_files(map, id, state->lhs_rank_world, recv_num); /* remember which node our partner is on (needed for scavenge) */ scr_hash* flushdesc = scr_hash_new(); scr_filemap_get_flushdesc(map, id, state->lhs_rank_world, flushdesc); scr_hash_util_set_int(flushdesc, SCR_SCAVENGE_KEY_PRESERVE, scr_preserve_directories); scr_hash_util_set_int(flushdesc, SCR_SCAVENGE_KEY_CONTAINER, scr_use_containers); scr_hash_util_set_str(flushdesc, SCR_SCAVENGE_KEY_PARTNER, state->lhs_hostname); scr_filemap_set_flushdesc(map, id, state->lhs_rank_world, flushdesc); scr_hash_delete(&flushdesc); /* record partner's redundancy descriptor hash */ scr_hash* lhs_desc_hash = scr_hash_new(); scr_hash* my_desc_hash = scr_hash_new(); scr_reddesc_store_to_hash(c, my_desc_hash); scr_hash_sendrecv(my_desc_hash, state->rhs_rank, lhs_desc_hash, state->lhs_rank, c->comm); scr_filemap_set_desc(map, id, state->lhs_rank_world, lhs_desc_hash); scr_hash_delete(&my_desc_hash); scr_hash_delete(&lhs_desc_hash); /* store this info in our filemap before we receive any files */ scr_filemap_write(scr_map_file, map); /* define directory to receive partner file in */ char* dir = scr_cache_dir_get(c, id); /* for each potential file, step through a call to swap */ while (send_num > 0 || recv_num > 0) { /* assume we won't send or receive in this step */ int send_rank = MPI_PROC_NULL; int recv_rank = MPI_PROC_NULL; /* if we have a file left to send, * get the filename and destination rank */ char* file = NULL; if (send_num > 0) { int i = numfiles - send_num; file = files[i]; send_rank = state->rhs_rank; send_num--; } /* if we have a file left to receive, get the rank */ if (recv_num > 0) { recv_rank = state->lhs_rank; recv_num--; } /* exhange file names with partners */ char file_partner[SCR_MAX_FILENAME]; scr_swap_file_names(file, send_rank, file_partner, sizeof(file_partner), recv_rank, dir, c->comm); /* if we'll receive a file, record the name of our partner's * file in the filemap */ if (recv_rank != MPI_PROC_NULL) { scr_filemap_add_file(map, id, state->lhs_rank_world, file_partner); scr_filemap_write(scr_map_file, map); } /* get meta data of file we're sending */ scr_meta* send_meta = scr_meta_new(); scr_filemap_get_meta(map, id, scr_my_rank_world, file, send_meta); /* exhange files with partners */ scr_meta* recv_meta = scr_meta_new(); if (scr_swap_files(COPY_FILES, file, send_meta, send_rank, file_partner, recv_meta, recv_rank, c->comm) != SCR_SUCCESS) { rc = SCR_FAILURE; } scr_filemap_set_meta(map, id, state->lhs_rank_world, file_partner, recv_meta); /* free meta data for these files */ scr_meta_delete(&recv_meta); scr_meta_delete(&send_meta); } /* free cache directory string */ scr_free(&dir); /* write out the updated filemap */ scr_filemap_write(scr_map_file, map); /* free our list of files */ scr_free(&files); 
return rc; }
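/* A minimal standalone sketch (not part of SCR) of the pairing pattern used in
 * scr_reddesc_apply_partner above: exchange item counts with the left/right
 * partners first, then step through max(send_num, recv_num) rounds, passing
 * MPI_PROC_NULL for whichever side has nothing left so a single MPI_Sendrecv
 * call covers uneven counts.  The integer payload and the ring partners are
 * hypothetical stand-ins for the file data and the rhs/lhs ranks above. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* ring partners: send to the right, receive from the left */
    int rhs = (rank + 1) % size;
    int lhs = (rank - 1 + size) % size;

    /* pretend each rank owns a different number of items */
    int send_num = rank % 3;
    int recv_num = 0;

    /* phase 1: tell the right partner how many items to expect,
     * learn the same from the left partner */
    MPI_Sendrecv(&send_num, 1, MPI_INT, rhs, 0,
                 &recv_num, 1, MPI_INT, lhs, 0,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);

    /* phase 2: step through the items; MPI_PROC_NULL turns the unused
     * half of the Sendrecv into a no-op once one side runs out */
    while (send_num > 0 || recv_num > 0) {
        int send_rank = MPI_PROC_NULL;
        int recv_rank = MPI_PROC_NULL;
        int payload = rank, incoming = -1;

        if (send_num > 0) { send_rank = rhs; send_num--; }
        if (recv_num > 0) { recv_rank = lhs; recv_num--; }

        MPI_Sendrecv(&payload,  1, MPI_INT, send_rank, 1,
                     &incoming, 1, MPI_INT, recv_rank, 1,
                     MPI_COMM_WORLD, MPI_STATUS_IGNORE);

        if (recv_rank != MPI_PROC_NULL)
            printf("rank %d received %d from %d\n", rank, incoming, lhs);
    }

    MPI_Finalize();
    return 0;
}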
/* ********************************************************************* */
int AL_Exchange(void *vbuf, int sz_ptr)
/*!
 * Fill the ghost boundaries
 *
 * \param [in] vbuf   pointer to buffer
 * \param [in] sz_ptr integer pointer to the distributed array descriptor
 *********************************************************************** */
{
  char *buf;
  register int nd;
  int myrank, nproc;
  int ndim, gp, nleft, nright, tag1, tag2;
  int sendb, recvb;
  MPI_Datatype itype;
  MPI_Comm comm;
  MPI_Status status;
  SZ *s;

  buf = (char *) vbuf;

  /* DIAGNOSTICS: check that sz_ptr points to an allocated SZ */
  if( stack_ptr[sz_ptr] == AL_STACK_FREE){
    printf("AL_Exchange: wrong SZ pointer\n");
  }

  s = sz_stack[sz_ptr];
  myrank = s->rank;
  nproc = s->size;
  comm = s->comm;
  ndim = s->ndim;

  for(nd=0;nd<ndim;nd++){
    gp = s->bg[nd];
    /* If gp=0, do nothing */
    if( gp > 0 ){
      nleft = s->left[nd];
      nright = s->right[nd];
      itype = s->type_rl[nd];
      tag1 = s->tag1[nd];
      sendb = s->sendb1[nd];
      recvb = s->recvb1[nd];
      MPI_Sendrecv(&buf[sendb], 1, itype, nleft, tag1,
                   &buf[recvb], 1, itype, nright, tag1,
                   comm, &status);

      nleft = s->left[nd];
      nright = s->right[nd];
      itype = s->type_lr[nd];
      tag2 = s->tag2[nd];
      sendb = s->sendb2[nd];
      recvb = s->recvb2[nd];
      MPI_Sendrecv(&buf[sendb], 1, itype, nright, tag2,
                   &buf[recvb], 1, itype, nleft, tag2,
                   comm, &status);
    }
  }

  /* DIAGNOSTICS */
#ifdef DEBUG
  if(myrank==0) printf("AL_Exchange: filled ghost regions\n");
#endif

  return (int) AL_SUCCESS;
}
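/* A minimal standalone sketch of the kind of ghost-boundary exchange AL_Exchange
 * performs: the descriptor's committed datatypes (s->type_rl / s->type_lr) are
 * replaced here by a hand-built MPI_Type_vector describing one strided column of
 * a local 2-D block with one ghost column per side.  The array sizes, tags, and
 * the periodic ring of neighbours are assumptions made for this sketch, not part
 * of the AL library. */
#include <mpi.h>
#include <stdio.h>

#define NX 8          /* interior rows    */
#define NY 6          /* interior columns */

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* local array with one ghost column on each side: NX rows, NY+2 columns */
    double u[NX][NY + 2];
    for (int i = 0; i < NX; i++)
        for (int j = 0; j < NY + 2; j++)
            u[i][j] = rank;

    /* a column is NX elements separated by a stride of NY+2 doubles */
    MPI_Datatype column;
    MPI_Type_vector(NX, 1, NY + 2, MPI_DOUBLE, &column);
    MPI_Type_commit(&column);

    int left  = (rank - 1 + size) % size;
    int right = (rank + 1) % size;

    /* send last interior column right, receive into the left ghost column */
    MPI_Sendrecv(&u[0][NY], 1, column, right, 10,
                 &u[0][0],  1, column, left,  10,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    /* send first interior column left, receive into the right ghost column */
    MPI_Sendrecv(&u[0][1],      1, column, left,  11,
                 &u[0][NY + 1], 1, column, right, 11,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);

    if (rank == 0)
        printf("ghost columns now hold %g and %g\n", u[0][0], u[0][NY + 1]);

    MPI_Type_free(&column);
    MPI_Finalize();
    return 0;
}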
/* This function computes the gravitational potential for ALL the particles. * It expects that the particles are predicted to the current time. */ void compute_potential(void) { int i; #ifndef NOGRAVITY long long ntot, ntotleft; int j, k, level, sendTask, recvTask; int ndone; int maxfill, ngrp, place, nexport; int *nsend, *noffset, *nsend_local, *nbuffer, *ndonelist, *numlist; double fac; double t0, t1, tstart, tend; MPI_Status status; double r2; t0 = second(); if(All.ComovingIntegrationOn) set_softenings(); if(ThisTask == 0) { printf("Start computation of potential for all particles...\n"); fflush(stdout); } #ifdef ISOTHERM for(i = 0; i < NumPart; i++) { for(k = 0, r2 = 0; k < 3; k++) r2 += P[i].Pos[k] * P[i].Pos[k]; P[i].Potential = -2 * ISOTHERM * ISOTHERM * (1 + log(ISOTHERM / sqrt(r2))); } return; #endif tstart = second(); if(TreeReconstructFlag) { if(ThisTask == 0) printf("Tree construction.\n"); #if defined(SFR) || defined(BLACK_HOLES) rearrange_particle_sequence(); #endif force_treebuild(); TreeReconstructFlag = 0; if(ThisTask == 0) printf("Tree construction done.\n"); } tend = second(); All.CPU_TreeConstruction += timediff(tstart, tend); numlist = malloc(NTask * sizeof(int) * NTask); MPI_Allgather(&NumPart, 1, MPI_INT, numlist, 1, MPI_INT, MPI_COMM_WORLD); for(i = 0, ntot = 0; i < NTask; i++) ntot += numlist[i]; free(numlist); noffset = malloc(sizeof(int) * NTask); /* offsets of bunches in common list */ nbuffer = malloc(sizeof(int) * NTask); nsend_local = malloc(sizeof(int) * NTask); nsend = malloc(sizeof(int) * NTask * NTask); ndonelist = malloc(sizeof(int) * NTask); i = 0; /* beginn with this index */ ntotleft = ntot; /* particles left for all tasks together */ while(ntotleft > 0) { for(j = 0; j < NTask; j++) nsend_local[j] = 0; /* do local particles and prepare export list */ for(nexport = 0, ndone = 0; i < NumPart && nexport < All.BunchSizeForce - NTask; i++) { ndone++; for(j = 0; j < NTask; j++) Exportflag[j] = 0; #ifndef PMGRID force_treeevaluate_potential(i, 0); #else force_treeevaluate_potential_shortrange(i, 0); #endif for(j = 0; j < NTask; j++) { if(Exportflag[j]) { for(k = 0; k < 3; k++) GravDataGet[nexport].u.Pos[k] = P[i].Pos[k]; #ifdef UNEQUALSOFTENINGS GravDataGet[nexport].v.Type = P[i].Type; #endif GravDataGet[nexport].w.OldAcc = P[i].OldAcc; GravDataIndexTable[nexport].Task = j; GravDataIndexTable[nexport].Index = i; GravDataIndexTable[nexport].SortIndex = nexport; nexport++; nsend_local[j]++; } } } qsort(GravDataIndexTable, nexport, sizeof(struct gravdata_index), grav_tree_compare_key); for(j = 0; j < nexport; j++) GravDataIn[j] = GravDataGet[GravDataIndexTable[j].SortIndex]; for(j = 1, noffset[0] = 0; j < NTask; j++) noffset[j] = noffset[j - 1] + nsend_local[j - 1]; MPI_Allgather(nsend_local, NTask, MPI_INT, nsend, NTask, MPI_INT, MPI_COMM_WORLD); /* now do the particles that need to be exported */ for(level = 1; level < (1 << PTask); level++) { for(j = 0; j < NTask; j++) nbuffer[j] = 0; for(ngrp = level; ngrp < (1 << PTask); ngrp++) { maxfill = 0; for(j = 0; j < NTask; j++) { if((j ^ ngrp) < NTask) if(maxfill < nbuffer[j] + nsend[(j ^ ngrp) * NTask + j]) maxfill = nbuffer[j] + nsend[(j ^ ngrp) * NTask + j]; } if(maxfill >= All.BunchSizeForce) break; sendTask = ThisTask; recvTask = ThisTask ^ ngrp; if(recvTask < NTask) { if(nsend[ThisTask * NTask + recvTask] > 0 || nsend[recvTask * NTask + ThisTask] > 0) { /* get the particles */ MPI_Sendrecv(&GravDataIn[noffset[recvTask]], nsend_local[recvTask] * sizeof(struct gravdata_in), MPI_BYTE, recvTask, TAG_POTENTIAL_A, 
&GravDataGet[nbuffer[ThisTask]], nsend[recvTask * NTask + ThisTask] * sizeof(struct gravdata_in), MPI_BYTE, recvTask, TAG_POTENTIAL_A, MPI_COMM_WORLD, &status); } } for(j = 0; j < NTask; j++) if((j ^ ngrp) < NTask) nbuffer[j] += nsend[(j ^ ngrp) * NTask + j]; } for(j = 0; j < nbuffer[ThisTask]; j++) { #ifndef PMGRID force_treeevaluate_potential(j, 1); #else force_treeevaluate_potential_shortrange(j, 1); #endif } /* get the result */ for(j = 0; j < NTask; j++) nbuffer[j] = 0; for(ngrp = level; ngrp < (1 << PTask); ngrp++) { maxfill = 0; for(j = 0; j < NTask; j++) { if((j ^ ngrp) < NTask) if(maxfill < nbuffer[j] + nsend[(j ^ ngrp) * NTask + j]) maxfill = nbuffer[j] + nsend[(j ^ ngrp) * NTask + j]; } if(maxfill >= All.BunchSizeForce) break; sendTask = ThisTask; recvTask = ThisTask ^ ngrp; if(recvTask < NTask) { if(nsend[ThisTask * NTask + recvTask] > 0 || nsend[recvTask * NTask + ThisTask] > 0) { /* send the results */ MPI_Sendrecv(&GravDataResult[nbuffer[ThisTask]], nsend[recvTask * NTask + ThisTask] * sizeof(struct gravdata_in), MPI_BYTE, recvTask, TAG_POTENTIAL_B, &GravDataOut[noffset[recvTask]], nsend_local[recvTask] * sizeof(struct gravdata_in), MPI_BYTE, recvTask, TAG_POTENTIAL_B, MPI_COMM_WORLD, &status); /* add the result to the particles */ for(j = 0; j < nsend_local[recvTask]; j++) { place = GravDataIndexTable[noffset[recvTask] + j].Index; P[place].Potential += GravDataOut[j + noffset[recvTask]].v.Potential; } } } for(j = 0; j < NTask; j++) if((j ^ ngrp) < NTask) nbuffer[j] += nsend[(j ^ ngrp) * NTask + j]; } level = ngrp - 1; } MPI_Allgather(&ndone, 1, MPI_INT, ndonelist, 1, MPI_INT, MPI_COMM_WORLD); for(j = 0; j < NTask; j++) ntotleft -= ndonelist[j]; } free(ndonelist); free(nsend); free(nsend_local); free(nbuffer); free(noffset); /* add correction to exclude self-potential */ for(i = 0; i < NumPart; i++) { /* remove self-potential */ P[i].Potential += P[i].Mass / All.SofteningTable[P[i].Type]; if(All.ComovingIntegrationOn) if(All.PeriodicBoundariesOn) P[i].Potential -= 2.8372975 * pow(P[i].Mass, 2.0 / 3) * pow(All.Omega0 * 3 * All.Hubble * All.Hubble / (8 * M_PI * All.G), 1.0 / 3); } /* multiply with the gravitational constant */ for(i = 0; i < NumPart; i++) P[i].Potential *= All.G; #ifdef PMGRID #ifdef PERIODIC pmpotential_periodic(); #ifdef PLACEHIGHRESREGION pmpotential_nonperiodic(1); #endif #else pmpotential_nonperiodic(0); #ifdef PLACEHIGHRESREGION pmpotential_nonperiodic(1); #endif #endif #endif if(All.ComovingIntegrationOn) { #ifndef PERIODIC fac = -0.5 * All.Omega0 * All.Hubble * All.Hubble; for(i = 0; i < NumPart; i++) { for(k = 0, r2 = 0; k < 3; k++) r2 += P[i].Pos[k] * P[i].Pos[k]; P[i].Potential += fac * r2; } #endif } else { fac = -0.5 * All.OmegaLambda * All.Hubble * All.Hubble; if(fac != 0) { for(i = 0; i < NumPart; i++) { for(k = 0, r2 = 0; k < 3; k++) r2 += P[i].Pos[k] * P[i].Pos[k]; P[i].Potential += fac * r2; } } } if(ThisTask == 0) { printf("potential done.\n"); fflush(stdout); } t1 = second(); All.CPU_Potential += timediff(t0, t1); #else for(i = 0; i < NumPart; i++) P[i].Potential = 0; #endif }
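/* A minimal standalone sketch of the pairwise communication schedule used in
 * compute_potential above: looping ngrp over 1 .. 2^PTask-1 and pairing each
 * rank with partner = rank ^ ngrp lets every rank exchange data with every
 * other rank exactly once per sweep, and the (partner < size) guard skips the
 * padding ranks when the task count is not a power of two.  The integer
 * payload is a hypothetical stand-in for the GravDataIn/GravDataGet buffers. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* smallest power of two >= size (the role PTask plays above) */
    int ptask = 0;
    while ((1 << ptask) < size)
        ptask++;

    int mydata = 100 + rank;

    for (int ngrp = 1; ngrp < (1 << ptask); ngrp++) {
        int partner = rank ^ ngrp;
        if (partner < size) {
            int theirs = -1;
            MPI_Sendrecv(&mydata, 1, MPI_INT, partner, ngrp,
                         &theirs, 1, MPI_INT, partner, ngrp,
                         MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            printf("rank %d <-> rank %d: got %d\n", rank, partner, theirs);
        }
    }

    MPI_Finalize();
    return 0;
}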
/*! Calculates the long-range periodic force given the particle positions * using the PM method. The force is Gaussian filtered with Asmth, given in * mesh-cell units. We carry out a CIC charge assignment, and compute the * potenial by Fourier transform methods. The potential is finite differenced * using a 4-point finite differencing formula, and the forces are * interpolated tri-linearly to the particle positions. The CIC kernel is * deconvolved. Note that the particle distribution is not in the slab * decomposition that is used for the FFT. Instead, overlapping patches * between local domains and FFT slabs are communicated as needed. */ void pmforce_periodic(void) { double k2, kx, ky, kz, smth; double dx, dy, dz; double fx, fy, fz, ff; double asmth2, fac, acc_dim; int i, j, slab, level, sendTask, recvTask; int x, y, z, xl, yl, zl, xr, yr, zr, xll, yll, zll, xrr, yrr, zrr, ip, dim; int slab_x, slab_y, slab_z; int slab_xx, slab_yy, slab_zz; int meshmin[3], meshmax[3], sendmin, sendmax, recvmin, recvmax; int rep, ncont, cont_sendmin[2], cont_sendmax[2], cont_recvmin[2], cont_recvmax[2]; int dimx, dimy, dimz, recv_dimx, recv_dimy, recv_dimz; MPI_Status status; if(ThisTask == 0) { printf("Starting periodic PM calculation.\n"); fflush(stdout); } #ifdef FFTW3 if(fftw_plan_exists) { /* macro defined in callgrind.h */ // CALLGRIND_START_INSTRUMENTATION; } #else // CALLGRIND_START_INSTRUMENTATION; #endif force_treefree(); asmth2 = (2 * M_PI) * All.Asmth[0] / All.BoxSize; asmth2 *= asmth2; fac = All.G / (M_PI * All.BoxSize); /* to get potential */ fac *= 1 / (2 * All.BoxSize / PMGRID); /* for finite differencing */ /* first, establish the extension of the local patch in the PMGRID */ for(j = 0; j < 3; j++) { meshmin[j] = PMGRID; meshmax[j] = 0; } for(i = 0; i < NumPart; i++) { for(j = 0; j < 3; j++) { slab = to_slab_fac * P[i].Pos[j]; if(slab >= PMGRID) slab = PMGRID - 1; if(slab < meshmin[j]) meshmin[j] = slab; if(slab > meshmax[j]) meshmax[j] = slab; } } MPI_Allgather(meshmin, 3, MPI_INT, meshmin_list, 3, MPI_INT, MPI_COMM_WORLD); MPI_Allgather(meshmax, 3, MPI_INT, meshmax_list, 3, MPI_INT, MPI_COMM_WORLD); dimx = meshmax[0] - meshmin[0] + 2; dimy = meshmax[1] - meshmin[1] + 2; dimz = meshmax[2] - meshmin[2] + 2; pm_init_periodic_allocate((dimx + 4) * (dimy + 4) * (dimz + 4)); #ifdef FFTW3 if(!fftw_plan_exists) { /* Create plan for in-place r2c DFT */ fft_forward_plan = fftw_mpi_plan_dft_r2c_3d(PMGRID, PMGRID, PMGRID, rhogrid, fft_of_rhogrid, MPI_COMM_WORLD, FFTW_PATIENT | FFTW_MPI_TRANSPOSED_OUT); fft_inverse_plan = fftw_mpi_plan_dft_c2r_3d(PMGRID, PMGRID, PMGRID, fft_of_rhogrid, rhogrid, MPI_COMM_WORLD, FFTW_PATIENT | FFTW_MPI_TRANSPOSED_IN); fftw_plan_exists = true; // use C99 bool type if(ThisTask == 0) printf("Created new FFTW3 plan.\n"); } else { /* do nothing, the plan has already been created by previous call to this function */ } #endif /* For FFTW3, there is a different convention for fftsize for real-to-complex transforms, i.e. fftsize is the size of the complex data (number of complex values), NOT the size of the real data! We attempt to take care of this by defining fftsize to be fftsize_real when using FFTW3. 
*/ for(i = 0; i < dimx * dimy * dimz; i++) workspace[i] = 0; for(i = 0; i < NumPart; i++) { slab_x = to_slab_fac * P[i].Pos[0]; if(slab_x >= PMGRID) slab_x = PMGRID - 1; dx = to_slab_fac * P[i].Pos[0] - slab_x; slab_x -= meshmin[0]; slab_xx = slab_x + 1; slab_y = to_slab_fac * P[i].Pos[1]; if(slab_y >= PMGRID) slab_y = PMGRID - 1; dy = to_slab_fac * P[i].Pos[1] - slab_y; slab_y -= meshmin[1]; slab_yy = slab_y + 1; slab_z = to_slab_fac * P[i].Pos[2]; if(slab_z >= PMGRID) slab_z = PMGRID - 1; dz = to_slab_fac * P[i].Pos[2] - slab_z; slab_z -= meshmin[2]; slab_zz = slab_z + 1; workspace[(slab_x * dimy + slab_y) * dimz + slab_z] += P[i].Mass * (1.0 - dx) * (1.0 - dy) * (1.0 - dz); workspace[(slab_x * dimy + slab_yy) * dimz + slab_z] += P[i].Mass * (1.0 - dx) * dy * (1.0 - dz); workspace[(slab_x * dimy + slab_y) * dimz + slab_zz] += P[i].Mass * (1.0 - dx) * (1.0 - dy) * dz; workspace[(slab_x * dimy + slab_yy) * dimz + slab_zz] += P[i].Mass * (1.0 - dx) * dy * dz; workspace[(slab_xx * dimy + slab_y) * dimz + slab_z] += P[i].Mass * (dx) * (1.0 - dy) * (1.0 - dz); workspace[(slab_xx * dimy + slab_yy) * dimz + slab_z] += P[i].Mass * (dx) * dy * (1.0 - dz); workspace[(slab_xx * dimy + slab_y) * dimz + slab_zz] += P[i].Mass * (dx) * (1.0 - dy) * dz; workspace[(slab_xx * dimy + slab_yy) * dimz + slab_zz] += P[i].Mass * (dx) * dy * dz; } for(i = 0; i < fftsize; i++) /* clear local density field */ rhogrid[i] = 0; for(level = 0; level < (1 << PTask); level++) /* note: for level=0, target is the same task */ { sendTask = ThisTask; recvTask = ThisTask ^ level; if(recvTask < NTask) { /* check how much we have to send */ sendmin = 2 * PMGRID; sendmax = -1; for(slab_x = meshmin[0]; slab_x < meshmax[0] + 2; slab_x++) if(slab_to_task[slab_x % PMGRID] == recvTask) { if(slab_x < sendmin) sendmin = slab_x; if(slab_x > sendmax) sendmax = slab_x; } if(sendmax == -1) sendmin = 0; /* check how much we have to receive */ recvmin = 2 * PMGRID; recvmax = -1; for(slab_x = meshmin_list[3 * recvTask]; slab_x < meshmax_list[3 * recvTask] + 2; slab_x++) if(slab_to_task[slab_x % PMGRID] == sendTask) { if(slab_x < recvmin) recvmin = slab_x; if(slab_x > recvmax) recvmax = slab_x; } if(recvmax == -1) recvmin = 0; if((recvmax - recvmin) >= 0 || (sendmax - sendmin) >= 0) /* ok, we have a contribution to the slab */ { recv_dimx = meshmax_list[3 * recvTask + 0] - meshmin_list[3 * recvTask + 0] + 2; recv_dimy = meshmax_list[3 * recvTask + 1] - meshmin_list[3 * recvTask + 1] + 2; recv_dimz = meshmax_list[3 * recvTask + 2] - meshmin_list[3 * recvTask + 2] + 2; if(level > 0) { MPI_Sendrecv(workspace + (sendmin - meshmin[0]) * dimy * dimz, (sendmax - sendmin + 1) * dimy * dimz * sizeof(fftw_real), MPI_BYTE, recvTask, TAG_PERIODIC_A, forcegrid, (recvmax - recvmin + 1) * recv_dimy * recv_dimz * sizeof(fftw_real), MPI_BYTE, recvTask, TAG_PERIODIC_A, MPI_COMM_WORLD, &status); } else { memcpy(forcegrid, workspace + (sendmin - meshmin[0]) * dimy * dimz, (sendmax - sendmin + 1) * dimy * dimz * sizeof(fftw_real)); } for(slab_x = recvmin; slab_x <= recvmax; slab_x++) { slab_xx = (slab_x % PMGRID) - first_slab_of_task[ThisTask]; if(slab_xx >= 0 && slab_xx < slabs_per_task[ThisTask]) { for(slab_y = meshmin_list[3 * recvTask + 1]; slab_y <= meshmax_list[3 * recvTask + 1] + 1; slab_y++) { slab_yy = slab_y; if(slab_yy >= PMGRID) slab_yy -= PMGRID; for(slab_z = meshmin_list[3 * recvTask + 2]; slab_z <= meshmax_list[3 * recvTask + 2] + 1; slab_z++) { slab_zz = slab_z; if(slab_zz >= PMGRID) slab_zz -= PMGRID; rhogrid[PMGRID * PMGRID2 * slab_xx + PMGRID2 
* slab_yy + slab_zz] += forcegrid[((slab_x - recvmin) * recv_dimy + (slab_y - meshmin_list[3 * recvTask + 1])) * recv_dimz + (slab_z - meshmin_list[3 * recvTask + 2])]; } } } } } } } #ifdef DEBUG_FFT double norm_density = 0.; for(i = 0; i < fftsize; i++) { norm_density += rhogrid[i]*rhogrid[i]; } /* Write out rhogrid to a 'fft-snapshot' file */ if (ThisTask == 0) { FILE *fp; /* Print the norm of the fft */ printf("L2-norm of density: %f\n", norm_density); printf("First five values of density: %f, %f, %f, %f, %f\n", rhogrid[0],rhogrid[1],rhogrid[2],rhogrid[3], rhogrid[4]); } #endif /* Do the FFT of the density field */ #ifdef FFTW3 fftw_execute_dft_r2c(fft_forward_plan, rhogrid, fft_of_rhogrid); #else rfftwnd_mpi(fft_forward_plan, 1, rhogrid, workspace, FFTW_TRANSPOSED_ORDER); #endif #ifdef DEBUG_FFT double norm_complex = 0.; for(i = 0; i < fftsize; i++) { norm_complex += rhogrid[i]*rhogrid[i]; } /* Write out rhogrid to a 'fft-snapshot' file */ if (ThisTask == 0) { FILE *fp; /* Print the norm of the fft */ printf("L2-norm of complex rhogrid: %f\n", norm_complex); printf("First two values of complex fft: %f + i*%f, %f + i*%f\n", rhogrid[0],rhogrid[1],rhogrid[2],rhogrid[3]); } #endif /* multiply with Green's function for the potential */ for(y = slabstart_y; y < slabstart_y + nslab_y; y++) for(x = 0; x < PMGRID; x++) for(z = 0; z < PMGRID / 2 + 1; z++) { if(x > PMGRID / 2) kx = x - PMGRID; else kx = x; if(y > PMGRID / 2) ky = y - PMGRID; else ky = y; if(z > PMGRID / 2) kz = z - PMGRID; else kz = z; k2 = kx * kx + ky * ky + kz * kz; if(k2 > 0) { smth = -exp(-k2 * asmth2) / k2; /* do deconvolution */ fx = fy = fz = 1; if(kx != 0) { fx = (M_PI * kx) / PMGRID; fx = sin(fx) / fx; } if(ky != 0) { fy = (M_PI * ky) / PMGRID; fy = sin(fy) / fy; } if(kz != 0) { fz = (M_PI * kz) / PMGRID; fz = sin(fz) / fz; } ff = 1 / (fx * fy * fz); smth *= ff * ff * ff * ff; /* end deconvolution */ ip = PMGRID * (PMGRID / 2 + 1) * (y - slabstart_y) + (PMGRID / 2 + 1) * x + z; c_re(fft_of_rhogrid[ip]) *= smth; c_im(fft_of_rhogrid[ip]) *= smth; } } if(slabstart_y == 0) c_re(fft_of_rhogrid[0]) = c_im(fft_of_rhogrid[0]) = 0.0; /* Do the FFT to get the potential */ #ifdef FFTW3 fftw_execute_dft_c2r(fft_inverse_plan, fft_of_rhogrid, rhogrid); /* Now normalize the output for(i = 0; i < fftsize; i++) rhogrid[i] = rhogrid[i] / (PMGRID*PMGRID*PMGRID); */ #else rfftwnd_mpi(fft_inverse_plan, 1, rhogrid, workspace, FFTW_TRANSPOSED_ORDER); #endif #ifdef DEBUG_FFT double norm = 0.; for(i = 0; i < fftsize; i++) { norm += rhogrid[i]*rhogrid[i]; } /* Write out rhogrid to a 'fft-snapshot' file */ if (ThisTask == 0) { FILE *fp; /* Print the norm of the fft */ printf("L2-norm of rhogrid: %f\n", norm); printf("First five values of fft: %f, %f, %f, %f, %f\n\n", rhogrid[0],rhogrid[1],rhogrid[2],rhogrid[3],rhogrid[4]); /* fp = fopen("rhogrid.0", "wb"); /* add suffix to indicate which node this is */ /* fwrite(rhogrid, sizeof(rhogrid[0]), fftsize, fp);*/ /* fclose(fp); */ } #endif /* Now rhogrid holds the potential */ /* construct the potential for the local patch */ dimx = meshmax[0] - meshmin[0] + 6; dimy = meshmax[1] - meshmin[1] + 6; dimz = meshmax[2] - meshmin[2] + 6; for(level = 0; level < (1 << PTask); level++) /* note: for level=0, target is the same task */ { sendTask = ThisTask; recvTask = ThisTask ^ level; if(recvTask < NTask) { /* check how much we have to send */ sendmin = 2 * PMGRID; sendmax = -PMGRID; for(slab_x = meshmin_list[3 * recvTask] - 2; slab_x < meshmax_list[3 * recvTask] + 4; slab_x++) if(slab_to_task[(slab_x + PMGRID) 
% PMGRID] == sendTask) { if(slab_x < sendmin) sendmin = slab_x; if(slab_x > sendmax) sendmax = slab_x; } if(sendmax == -PMGRID) sendmin = sendmax + 1; /* check how much we have to receive */ recvmin = 2 * PMGRID; recvmax = -PMGRID; for(slab_x = meshmin[0] - 2; slab_x < meshmax[0] + 4; slab_x++) if(slab_to_task[(slab_x + PMGRID) % PMGRID] == recvTask) { if(slab_x < recvmin) recvmin = slab_x; if(slab_x > recvmax) recvmax = slab_x; } if(recvmax == -PMGRID) recvmin = recvmax + 1; if((recvmax - recvmin) >= 0 || (sendmax - sendmin) >= 0) /* ok, we have a contribution to the slab */ { recv_dimx = meshmax_list[3 * recvTask + 0] - meshmin_list[3 * recvTask + 0] + 6; recv_dimy = meshmax_list[3 * recvTask + 1] - meshmin_list[3 * recvTask + 1] + 6; recv_dimz = meshmax_list[3 * recvTask + 2] - meshmin_list[3 * recvTask + 2] + 6; ncont = 1; cont_sendmin[0] = sendmin; cont_sendmax[0] = sendmax; cont_sendmin[1] = sendmax + 1; cont_sendmax[1] = sendmax; cont_recvmin[0] = recvmin; cont_recvmax[0] = recvmax; cont_recvmin[1] = recvmax + 1; cont_recvmax[1] = recvmax; for(slab_x = sendmin; slab_x <= sendmax; slab_x++) { if(slab_to_task[(slab_x + PMGRID) % PMGRID] != ThisTask) { /* non-contiguous */ cont_sendmax[0] = slab_x - 1; while(slab_to_task[(slab_x + PMGRID) % PMGRID] != ThisTask) slab_x++; cont_sendmin[1] = slab_x; ncont++; } } for(slab_x = recvmin; slab_x <= recvmax; slab_x++) { if(slab_to_task[(slab_x + PMGRID) % PMGRID] != recvTask) { /* non-contiguous */ cont_recvmax[0] = slab_x - 1; while(slab_to_task[(slab_x + PMGRID) % PMGRID] != recvTask) slab_x++; cont_recvmin[1] = slab_x; if(ncont == 1) ncont++; } } for(rep = 0; rep < ncont; rep++) { sendmin = cont_sendmin[rep]; sendmax = cont_sendmax[rep]; recvmin = cont_recvmin[rep]; recvmax = cont_recvmax[rep]; /* prepare what we want to send */ if(sendmax - sendmin >= 0) { for(slab_x = sendmin; slab_x <= sendmax; slab_x++) { slab_xx = ((slab_x + PMGRID) % PMGRID) - first_slab_of_task[ThisTask]; for(slab_y = meshmin_list[3 * recvTask + 1] - 2; slab_y < meshmax_list[3 * recvTask + 1] + 4; slab_y++) { slab_yy = (slab_y + PMGRID) % PMGRID; for(slab_z = meshmin_list[3 * recvTask + 2] - 2; slab_z < meshmax_list[3 * recvTask + 2] + 4; slab_z++) { slab_zz = (slab_z + PMGRID) % PMGRID; forcegrid[((slab_x - sendmin) * recv_dimy + (slab_y - (meshmin_list[3 * recvTask + 1] - 2))) * recv_dimz + slab_z - (meshmin_list[3 * recvTask + 2] - 2)] = rhogrid[PMGRID * PMGRID2 * slab_xx + PMGRID2 * slab_yy + slab_zz]; } } } } if(level > 0) { MPI_Sendrecv(forcegrid, (sendmax - sendmin + 1) * recv_dimy * recv_dimz * sizeof(fftw_real), MPI_BYTE, recvTask, TAG_PERIODIC_B, workspace + (recvmin - (meshmin[0] - 2)) * dimy * dimz, (recvmax - recvmin + 1) * dimy * dimz * sizeof(fftw_real), MPI_BYTE, recvTask, TAG_PERIODIC_B, MPI_COMM_WORLD, &status); } else { memcpy(workspace + (recvmin - (meshmin[0] - 2)) * dimy * dimz, forcegrid, (recvmax - recvmin + 1) * dimy * dimz * sizeof(fftw_real)); } } } } } dimx = meshmax[0] - meshmin[0] + 2; dimy = meshmax[1] - meshmin[1] + 2; dimz = meshmax[2] - meshmin[2] + 2; recv_dimx = meshmax[0] - meshmin[0] + 6; recv_dimy = meshmax[1] - meshmin[1] + 6; recv_dimz = meshmax[2] - meshmin[2] + 6; for(dim = 0; dim < 3; dim++) /* Calculate each component of the force. 
*/ { /* get the force component by finite differencing the potential */ /* note: "workspace" now contains the potential for the local patch, plus a suffiently large buffer region */ for(x = 0; x < meshmax[0] - meshmin[0] + 2; x++) for(y = 0; y < meshmax[1] - meshmin[1] + 2; y++) for(z = 0; z < meshmax[2] - meshmin[2] + 2; z++) { xrr = xll = xr = xl = x; yrr = yll = yr = yl = y; zrr = zll = zr = zl = z; switch (dim) { case 0: xr = x + 1; xrr = x + 2; xl = x - 1; xll = x - 2; break; case 1: yr = y + 1; yl = y - 1; yrr = y + 2; yll = y - 2; break; case 2: zr = z + 1; zl = z - 1; zrr = z + 2; zll = z - 2; break; } forcegrid[(x * dimy + y) * dimz + z] = fac * ((4.0 / 3) * (workspace[((xl + 2) * recv_dimy + (yl + 2)) * recv_dimz + (zl + 2)] - workspace[((xr + 2) * recv_dimy + (yr + 2)) * recv_dimz + (zr + 2)]) - (1.0 / 6) * (workspace[((xll + 2) * recv_dimy + (yll + 2)) * recv_dimz + (zll + 2)] - workspace[((xrr + 2) * recv_dimy + (yrr + 2)) * recv_dimz + (zrr + 2)])); } /* read out the forces */ for(i = 0; i < NumPart; i++) { slab_x = to_slab_fac * P[i].Pos[0]; if(slab_x >= PMGRID) slab_x = PMGRID - 1; dx = to_slab_fac * P[i].Pos[0] - slab_x; slab_x -= meshmin[0]; slab_xx = slab_x + 1; slab_y = to_slab_fac * P[i].Pos[1]; if(slab_y >= PMGRID) slab_y = PMGRID - 1; dy = to_slab_fac * P[i].Pos[1] - slab_y; slab_y -= meshmin[1]; slab_yy = slab_y + 1; slab_z = to_slab_fac * P[i].Pos[2]; if(slab_z >= PMGRID) slab_z = PMGRID - 1; dz = to_slab_fac * P[i].Pos[2] - slab_z; slab_z -= meshmin[2]; slab_zz = slab_z + 1; acc_dim = forcegrid[(slab_x * dimy + slab_y) * dimz + slab_z] * (1.0 - dx) * (1.0 - dy) * (1.0 - dz); acc_dim += forcegrid[(slab_x * dimy + slab_yy) * dimz + slab_z] * (1.0 - dx) * dy * (1.0 - dz); acc_dim += forcegrid[(slab_x * dimy + slab_y) * dimz + slab_zz] * (1.0 - dx) * (1.0 - dy) * dz; acc_dim += forcegrid[(slab_x * dimy + slab_yy) * dimz + slab_zz] * (1.0 - dx) * dy * dz; acc_dim += forcegrid[(slab_xx * dimy + slab_y) * dimz + slab_z] * (dx) * (1.0 - dy) * (1.0 - dz); acc_dim += forcegrid[(slab_xx * dimy + slab_yy) * dimz + slab_z] * (dx) * dy * (1.0 - dz); acc_dim += forcegrid[(slab_xx * dimy + slab_y) * dimz + slab_zz] * (dx) * (1.0 - dy) * dz; acc_dim += forcegrid[(slab_xx * dimy + slab_yy) * dimz + slab_zz] * (dx) * dy * dz; P[i].GravPM[dim] = acc_dim; } } pm_init_periodic_free(); force_treeallocate(All.TreeAllocFactor * All.MaxPart, All.MaxPart); All.NumForcesSinceLastDomainDecomp = 1 + All.TotNumPart * All.TreeDomainUpdateFrequency; if(ThisTask == 0) { printf("done PM.\n"); fflush(stdout); } #ifdef FFTW3 if(fftw_plan_exists) { /* macro defined in callgrind.h */ // CALLGRIND_STOP_INSTRUMENTATION; } #else // CALLGRIND_STOP_INSTRUMENTATION; #endif }
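/* A minimal sketch of the 4-point finite-difference stencil applied to the
 * potential in pmforce_periodic above, reduced to one dimension.  The (4/3, 1/6)
 * weights and the 1/(2h) prefactor match the force loop; differencing "left
 * minus right" yields -dphi/dx, i.e. the force component.  The test potential
 * and grid spacing are arbitrary choices for illustration only. */
#include <stdio.h>

/* fourth-order, 4-point finite difference returning -dphi/dx at grid index i */
static double force_1d(const double *phi, int i, double h)
{
    return (1.0 / (2.0 * h)) *
           ((4.0 / 3.0) * (phi[i - 1] - phi[i + 1]) -
            (1.0 / 6.0) * (phi[i - 2] - phi[i + 2]));
}

int main(void)
{
    /* phi(x) = x^2 sampled with spacing h; the exact force is -dphi/dx = -2x */
    double h = 0.1, phi[9];
    for (int i = 0; i < 9; i++) {
        double x = i * h;
        phi[i] = x * x;
    }

    /* at i = 4 (x = 0.4) the stencil reproduces -0.8 to round-off */
    printf("force at x=0.4: %g (expected -0.8)\n", force_1d(phi, 4, h));
    return 0;
}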
//removes the lowest energy vertical seam from the image void removeVerticalSeam() { double energies[3]; double min_energy; int prev_x; int prev_y; // split up work between processes double *my_path_costs; double *my_previous_x; double *my_previous_y; double *temp_path_costs; double *temp_previous_x; double *temp_previous_y; int my_cols = current_width / numprocs; int low_cols = my_cols; int extra_cols = current_width % numprocs; int start; int x_offset; int recv_cols; double left_end_cost, right_end_cost, temp_end_cost; if (rank < extra_cols) { my_cols++; start = rank * my_cols; } else { start = (extra_cols * (my_cols + 1)) + ((rank - extra_cols) * my_cols); } //printf("%d %d %d\n", rank, start, my_cols); my_path_costs = (double *) malloc(my_cols * current_height * sizeof(double)); my_previous_x = (double *) malloc(my_cols * current_height * sizeof(double)); my_previous_y = (double *) malloc(my_cols * current_height * sizeof(double)); //find the lowest cost seam by computing the lowest cost paths to each pixel for (int y = 0; y < current_height; y++) { //compute the path costs for my columns for (int x = start; x < start + my_cols; x++) { //printf("%d %d %d %d %d\n", rank, x, y, (x - start) * current_height + y, my_cols * current_height); if (y == 0) { path_costs[x * initial_height] = image_energy[x * initial_height]; my_path_costs[(x - start) * current_height + y] = path_costs[x * initial_height]; previous_x[x * initial_height] = -1; my_previous_x[(x - start) * current_height + y] = previous_x[x * initial_height]; previous_y[x * initial_height] = -1; my_previous_y[(x - start) * current_height + y] = previous_y[x * initial_height]; } else { //the pixel directly above energies[1] = path_costs[x * initial_height + y - 1]; //pixel above to the left if (x != 0) { energies[0] = path_costs[(x - 1) * initial_height + y - 1]; } else { energies[0] = DBL_MAX; } //pixel above to the right if (x != current_width - 1) { energies[2] = path_costs[(x + 1) * initial_height + y - 1]; } else { energies[2] = DBL_MAX; } //find the one with the least path cost min_energy = energies[0]; prev_x = x - 1; prev_y = y - 1; if (energies[1] < min_energy) { min_energy = energies[1]; prev_x = x; } if (energies[2] < min_energy) { min_energy = energies[2]; prev_x = x + 1; } //set the minimum path cost for this pixel path_costs[x * initial_height + y] = min_energy + image_energy[x * initial_height + y]; my_path_costs[(x - start) * current_height + y] = path_costs[x * initial_height + y]; //set the previous pixel on the minimum path's coordinates for this pixel previous_x[x * initial_height + y] = prev_x; my_previous_x[(x - start) * current_height + y] = previous_x[x * initial_height + y]; previous_y[x * initial_height + y] = prev_y; my_previous_y[(x - start) * current_height + y] = previous_y[x * initial_height + y]; } } //send path cost needed to neighboring processes if (numprocs > 1) { if (rank != numprocs - 1) { //send rightmost cost to following process right_end_cost = path_costs[(start + my_cols - 1) * initial_height + y]; MPI_Send(&right_end_cost, 1, MPI_DOUBLE, rank + 1, 0, MPI_COMM_WORLD); //receive following process's leftmost cost MPI_Recv(&temp_end_cost, 1, MPI_DOUBLE, rank + 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); path_costs[(start + my_cols) * initial_height + y] = temp_end_cost; } if (rank != 0) { //send leftmost cost to preceding process left_end_cost = path_costs[start * initial_height + y]; MPI_Send(&left_end_cost, 1, MPI_DOUBLE, rank - 1, 0, MPI_COMM_WORLD); //receive preceding process's rightmost 
cost MPI_Recv(&temp_end_cost, 1, MPI_DOUBLE, rank - 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); path_costs[(start - 1) * initial_height + y] = temp_end_cost; } } } //update path costs and previous for all processes for (int i = 0; i < numprocs; i++) { if (rank == i) { continue; } if (i < extra_cols) { x_offset = i * (low_cols + 1); recv_cols = (low_cols + 1); } else { x_offset = (extra_cols * (low_cols + 1)) + ((i - extra_cols) * low_cols); recv_cols = low_cols; } //printf("%d %d\n", low_cols, extra_cols); //printf("%d %d %d\n", rank, x_offset, recv_cols); temp_path_costs = (double *) malloc(recv_cols * current_height * sizeof(double)); temp_previous_x = (double *) malloc(recv_cols * current_height * sizeof(double)); temp_previous_y = (double *) malloc(recv_cols * current_height * sizeof(double)); MPI_Sendrecv(my_path_costs, my_cols * current_height, MPI_DOUBLE, i, 0, temp_path_costs, recv_cols * current_height, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); MPI_Sendrecv(my_previous_x, my_cols * current_height, MPI_DOUBLE, i, 1, temp_previous_x, recv_cols * current_height, MPI_DOUBLE, i, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE); MPI_Sendrecv(my_previous_y, my_cols * current_height, MPI_DOUBLE, i, 2, temp_previous_y, recv_cols * current_height, MPI_DOUBLE, i, 2, MPI_COMM_WORLD, MPI_STATUS_IGNORE); /* problem is here */ for (int j = 0; j < recv_cols * current_height; j++) { int x = x_offset + (j % recv_cols); int y = j / recv_cols; //printf("%d %d %d %d %d\n", rank, x, y, x * initial_height + y, recv_cols * current_height); //printf("%d\n", initial_height * initial_width); path_costs[x * initial_height + y] = temp_path_costs[(x - x_offset) * current_height + y]; previous_x[x * initial_height + y] = temp_previous_x[(x - x_offset) * current_height + y]; previous_y[x * initial_height + y] = temp_previous_y[(x - x_offset) * current_height + y]; } free(temp_path_costs); free(temp_previous_x); free(temp_previous_y); } free(my_path_costs); free(my_previous_x); free(my_previous_y); //printf("here\n"); //find the xcoord the lowest cost seam starts at the bottom of the current image int x_coord = 0; for (int x = 0; x < current_width; x++) { if (path_costs[x * initial_height + current_height - 1] < path_costs[x_coord * initial_height + current_height - 1]) { x_coord = x; } } //printf("here\n"); //delete the seam from the bottom up for (int y = current_height - 1; y >= 0; y--) { //delete this pixel by copying over it and all those following to the right for (int x = x_coord; x < current_width - 1; x++) { image[x * initial_height + y] = image[(x + 1) * initial_height + y]; } //next pixel //printf("%d\n", x_coord * initial_height + y); x_coord = previous_x[x_coord * initial_height + y]; //printf("%d %d\n", rank, x_coord); } //decrease the current width of the image current_width--; }
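/* A serial, single-process sketch of the dynamic-programming recurrence that the
 * parallel loops in removeVerticalSeam compute:
 *   cost[x][0] = energy[x][0]
 *   cost[x][y] = energy[x][y] + min(cost[x-1][y-1], cost[x][y-1], cost[x+1][y-1])
 * using the same column-major x * height + y indexing as the code above.  The
 * helper name and the toy energy field are hypothetical. */
#include <float.h>
#include <stdio.h>

static void seam_costs(const double *energy, double *cost, int width, int height)
{
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            if (y == 0) {
                cost[x * height] = energy[x * height];
                continue;
            }
            double up    = cost[x * height + y - 1];
            double left  = (x > 0)         ? cost[(x - 1) * height + y - 1] : DBL_MAX;
            double right = (x < width - 1) ? cost[(x + 1) * height + y - 1] : DBL_MAX;

            double best = up;
            if (left  < best) best = left;
            if (right < best) best = right;

            cost[x * height + y] = energy[x * height + y] + best;
        }
    }
}

int main(void)
{
    enum { W = 4, H = 3 };
    double energy[W * H], cost[W * H];

    /* toy energy field with a zero-cost column at x = 1 */
    for (int x = 0; x < W; x++)
        for (int y = 0; y < H; y++)
            energy[x * H + y] = (x == 1) ? 0.0 : 1.0;

    seam_costs(energy, cost, W, H);

    /* the cheapest path cost in the bottom row should be 0 */
    double best = cost[0 * H + (H - 1)];
    for (int x = 1; x < W; x++)
        if (cost[x * H + (H - 1)] < best)
            best = cost[x * H + (H - 1)];
    printf("cheapest seam cost: %g (expected 0)\n", best);
    return 0;
}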
void star_density(void) { int j; #ifdef EDDINGTON_TENSOR_STARS int i, dummy; int ngrp, sendTask, recvTask, place, nexport, nimport, ndone, ndone_flag; #endif /* clear Je in all gas particles */ for(j = 0; j < N_gas; j++) { if(P[j].Type == 0) SphP[j].Je = 0; #ifdef SFR if(P[j].Type == 0) { SphP[j].Je += SphP[j].Sfr * All.IonizingLumPerSFR * (PROTONMASS / (P[j].Mass * All.UnitMass_in_g / All.HubbleParam)) * All.UnitTime_in_s / All.HubbleParam; } #endif } #ifdef EDDINGTON_TENSOR_STARS /* allocate buffers to arrange communication */ Ngblist = (int *) mymalloc(NumPart * sizeof(int)); All.BunchSize = (int) ((All.BufferSize * 1024 * 1024) / (sizeof(struct data_index) + sizeof(struct data_nodelist) + 2 * sizeof(struct stardata_in))); DataIndexTable = (struct data_index *) mymalloc(All.BunchSize * sizeof(struct data_index)); DataNodeList = (struct data_nodelist *) mymalloc(All.BunchSize * sizeof(struct data_nodelist)); i = FirstActiveParticle; /* beginn with this index */ do { for(j = 0; j < NTask; j++) { Send_count[j] = 0; Exportflag[j] = -1; } /* do local particles and prepare export list */ for(nexport = 0; i >= 0; i = NextActiveParticle[i]) { if(P[i].Type == 4) { if(star_density_evaluate(i, 0, &nexport, Send_count) < 0) break; } } #ifdef MYSORT mysort_dataindex(DataIndexTable, nexport, sizeof(struct data_index), data_index_compare); #else qsort(DataIndexTable, nexport, sizeof(struct data_index), data_index_compare); #endif MPI_Allgather(Send_count, NTask, MPI_INT, Sendcount_matrix, NTask, MPI_INT, MPI_COMM_WORLD); for(j = 0, nimport = 0, Recv_offset[0] = 0, Send_offset[0] = 0; j < NTask; j++) { Recv_count[j] = Sendcount_matrix[j * NTask + ThisTask]; nimport += Recv_count[j]; if(j > 0) { Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1]; Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1]; } } StarDataGet = (struct stardata_in *) mymalloc(nimport * sizeof(struct stardata_in)); StarDataIn = (struct stardata_in *) mymalloc(nexport * sizeof(struct stardata_in)); /* prepare particle data for export */ for(j = 0; j < nexport; j++) { place = DataIndexTable[j].Index; StarDataIn[j].Pos[0] = P[place].Pos[0]; StarDataIn[j].Pos[1] = P[place].Pos[1]; StarDataIn[j].Pos[2] = P[place].Pos[2]; StarDataIn[j].Hsml = PPP[place].Hsml; StarDataIn[j].Density = P[place].DensAroundStar; StarDataIn[j].Mass = P[place].Mass; memcpy(StarDataIn[j].NodeList, DataNodeList[DataIndexTable[j].IndexGet].NodeList, NODELISTLENGTH * sizeof(int)); } /* exchange particle data */ for(ngrp = 1; ngrp < (1 << PTask); ngrp++) { sendTask = ThisTask; recvTask = ThisTask ^ ngrp; if(recvTask < NTask) { if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0) { /* get the particles */ MPI_Sendrecv(&StarDataIn[Send_offset[recvTask]], Send_count[recvTask] * sizeof(struct stardata_in), MPI_BYTE, recvTask, TAG_DENS_A, &StarDataGet[Recv_offset[recvTask]], Recv_count[recvTask] * sizeof(struct stardata_in), MPI_BYTE, recvTask, TAG_DENS_A, MPI_COMM_WORLD, MPI_STATUS_IGNORE); } } } myfree(StarDataIn); /* now do the particles that were sent to us */ for(j = 0; j < nimport; j++) star_density_evaluate(j, 1, &dummy, &dummy); /* check whether this is the last iteration */ if(i < 0) ndone_flag = 1; else ndone_flag = 0; MPI_Allreduce(&ndone_flag, &ndone, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); myfree(StarDataGet); } while(ndone < NTask); myfree(DataNodeList); myfree(DataIndexTable); myfree(Ngblist); #endif //for EDDINGTON_TENSOR_STARS }
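/* A minimal standalone sketch of the termination pattern in star_density's
 * do/while loop above: every rank keeps entering the communication round even
 * after its own work list is exhausted, and an MPI_Allreduce(SUM) of the
 * per-rank "done" flags tells all ranks when every task has finished, so they
 * leave the loop together.  The per-rank work amounts and chunk size here are
 * hypothetical stand-ins for the export-buffer bookkeeping. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int rank, ntask;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &ntask);

    /* pretend each rank has a different amount of local work, processed
     * in fixed-size chunks per iteration */
    int remaining = 3 + 2 * rank;
    const int chunk = 4;
    int ndone = 0;

    do {
        /* process at most one chunk of local work this iteration */
        int doing = (remaining < chunk) ? remaining : chunk;
        remaining -= doing;

        /* ... exports/imports would be exchanged here ... */

        /* sum the flags so every rank learns how many have finished */
        int done_flag = (remaining == 0) ? 1 : 0;
        MPI_Allreduce(&done_flag, &ndone, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

        if (rank == 0)
            printf("%d of %d ranks finished\n", ndone, ntask);
    } while (ndone < ntask);

    MPI_Finalize();
    return 0;
}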
int main( int argc, char **argv ) { MPI_Datatype *types; void **inbufs, **outbufs; char **names; int *counts, *bytesize, ntype; MPI_Comm comms[20]; int ncomm = 20, rank, np, partner, tag; int i, j, k, err, toterr, world_rank, errloc; MPI_Status status, statuses[2]; int flag, index; char *obuf; MPI_Request requests[2]; MPI_Init( &argc, &argv ); AllocateForData( &types, &inbufs, &outbufs, &counts, &bytesize, &names, &ntype ); GenerateData( types, inbufs, outbufs, counts, bytesize, names, &ntype ); MPI_Comm_rank( MPI_COMM_WORLD, &world_rank ); MakeComms( comms, 20, &ncomm, 0 ); /* Test over a wide range of datatypes and communicators */ err = 0; for (i=0; i<ncomm; i++) { MPI_Comm_rank( comms[i], &rank ); MPI_Comm_size( comms[i], &np ); if (np < 2) continue; tag = i; for (j=0; j<ntype; j++) { if (world_rank == 0){ /* SI make size of outputindependent of number of processes */ if (i<2) fprintf( stdout, "Testing type %s\n",names[j] ); } /* This test does an irsend between both partners, with a sendrecv after the irecv used to guarentee that the irsend has a matching receive */ if (rank == 0) { partner = np - 1; #if 0 MPIR_PrintDatatypePack( stdout, counts[j], types[j], 0, 0 ); #endif obuf = outbufs[j]; for (k=0; k<bytesize[j]; k++) obuf[k] = 0; MPI_Irecv(outbufs[j], counts[j], types[j], partner, tag, comms[i], &requests[0] ); MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, ncomm+i, MPI_BOTTOM, 0, MPI_INT, partner, ncomm+i, comms[i], &status ); MPI_Irsend( inbufs[j], counts[j], types[j], partner, tag, comms[i], &requests[1] ); do { MPI_Waitany( 2, requests, &index, &status ); } while (index != 0); /* Always the possiblity that the Irsend is still waiting */ MPI_Waitall( 2, requests, statuses ); if ((errloc = CheckData( inbufs[j], outbufs[j], bytesize[j] ))) { char *p1, *p2; fprintf( stderr, "Error in data with type %s (type %d on %d) at byte %d\n", names[j], j, world_rank, errloc - 1 ); p1 = (char *)inbufs[j]; p2 = (char *)outbufs[j]; fprintf( stderr, "Got %x expected %x\n", p1[errloc-1], p2[errloc-1] ); err++; #if 0 MPIR_PrintDatatypeUnpack( stderr, counts[j], types[j], 0, 0 ); #endif } } else if (rank == np - 1) { partner = 0; obuf = outbufs[j]; for (k=0; k<bytesize[j]; k++) obuf[k] = 0; MPI_Irecv(outbufs[j], counts[j], types[j], partner, tag, comms[i], &requests[0] ); MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, ncomm+i, MPI_BOTTOM, 0, MPI_INT, partner, ncomm+i, comms[i], &status ); /* Wait for irecv to complete */ do { MPI_Test( &requests[0], &flag, &status ); } while (!flag); if ((errloc = CheckData( inbufs[j], outbufs[j], bytesize[j] ))) { char *p1, *p2; fprintf( stderr, "Error in data with type %s (type %d on %d) at byte %d\n", names[j], j, world_rank, errloc - 1 ); p1 = (char *)inbufs[j]; p2 = (char *)outbufs[j]; fprintf( stderr, "Got %x expected %x\n", p1[errloc-1], p2[errloc-1] ); err++; #if 0 MPIR_PrintDatatypeUnpack( stderr, counts[j], types[j], 0, 0 ); #endif } MPI_Irsend( inbufs[j], counts[j], types[j], partner, tag, comms[i], &requests[1] ); MPI_Waitall(1, &requests[1], &status ); } } } if (err > 0) { fprintf( stderr, "%d errors on %d\n", err, rank ); } MPI_Allreduce( &err, &toterr, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD ); if (world_rank == 0) { if (toterr == 0) { printf( " No Errors\n" ); } else { printf (" Found %d errors\n", toterr ); } } FreeDatatypes( types, inbufs, outbufs, counts, bytesize, names, ntype ); FreeComms( comms, ncomm ); MPI_Finalize(); return err; }
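/* A two-rank sketch of the handshake described in the comment above: the
 * receiver posts MPI_Irecv first, both sides then complete a zero-byte
 * MPI_Sendrecv, and only after that does the sender issue MPI_Irsend, so the
 * ready-send is guaranteed to find its matching receive already posted.
 * Tags and the payload value are arbitrary choices for this sketch. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    const int tag = 7, synctag = 8;
    int data = 0;
    MPI_Request req;

    if (size >= 2) {
        if (rank == 1) {
            /* 1. post the receive before anything else */
            MPI_Irecv(&data, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &req);
            /* 2. zero-byte handshake: the sender completing its half proves
             *    the Irecv above has already been posted */
            MPI_Sendrecv(MPI_BOTTOM, 0, MPI_INT, 0, synctag,
                         MPI_BOTTOM, 0, MPI_INT, 0, synctag,
                         MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            MPI_Wait(&req, MPI_STATUS_IGNORE);
            printf("rank 1 received %d via ready-send\n", data);
        } else if (rank == 0) {
            int payload = 42;
            /* 2. matching half of the handshake */
            MPI_Sendrecv(MPI_BOTTOM, 0, MPI_INT, 1, synctag,
                         MPI_BOTTOM, 0, MPI_INT, 1, synctag,
                         MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            /* 3. ready-send: legal only because the receive is known to be posted */
            MPI_Irsend(&payload, 1, MPI_INT, 1, tag, MPI_COMM_WORLD, &req);
            MPI_Wait(&req, MPI_STATUS_IGNORE);
        }
    }

    MPI_Finalize();
    return 0;
}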
int main (int argc, char *argv[]) { int my_rank, size; int right, left; int int_send_buf, int_recv_buf, int_sum, i; float float_send_buf, float_recv_buf, float_sum; int array_of_blocklengths[COUNT]; MPI_Aint array_of_displacements[COUNT], first_var_address, second_var_address; MPI_Datatype array_of_types[COUNT], sendtype, recvtype; MPI_Status status; /* Get process and neighbour info. */ MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); MPI_Comm_size(MPI_COMM_WORLD, &size); right = (my_rank+1) % size; left = (my_rank-1+size) % size; /* ... this SPMD-style neighbor computation with modulo has the same meaning as: */ /* right = my_rank + 1; */ /* if (right == size) right = 0; */ /* left = my_rank - 1; */ /* if (left == -1) left = size-1;*/ /* Set MPI datatypes for sending and receiving partial sums. */ array_of_blocklengths[0] = 1; array_of_blocklengths[1] = 1; MPI_Address(&int_send_buf, &first_var_address); MPI_Address(&float_send_buf, &second_var_address); array_of_displacements[0] = (MPI_Aint) 0; array_of_displacements[1] = second_var_address - first_var_address; array_of_types[0] = MPI_INT; array_of_types[1] = MPI_FLOAT; MPI_Type_struct(COUNT, array_of_blocklengths, array_of_displacements, array_of_types, &sendtype); MPI_Type_commit(&sendtype); MPI_Address(&int_recv_buf, &first_var_address); MPI_Address(&float_recv_buf, &second_var_address); array_of_displacements[0] = (MPI_Aint) 0; array_of_displacements[1] = second_var_address - first_var_address; MPI_Type_struct(COUNT, array_of_blocklengths, array_of_displacements, array_of_types, &recvtype); MPI_Type_commit(&recvtype); /* Compute global sum. */ int_sum = 0; float_sum = 0; int_send_buf = my_rank; float_send_buf = (float) my_rank; for( i = 0; i < size; i++) { MPI_Sendrecv(&int_send_buf, 1, sendtype, right, to_right, &int_recv_buf, 1, recvtype, left, to_right, MPI_COMM_WORLD, &status); int_send_buf = int_recv_buf; float_send_buf = float_recv_buf; int_sum += int_recv_buf; float_sum += float_recv_buf; } printf ("PE%i:\tSum = %i\t%f\n", my_rank, int_sum, float_sum); MPI_Finalize(); }
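/* For comparison with the ring above, which shifts the partial sums size times:
 * a sketch of the same global sums expressed with MPI_Allreduce, one collective
 * call per value.  This is a simplified alternative, not part of the original
 * exercise, and it drops the struct datatypes since each value can be reduced
 * directly. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int my_rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    int   int_val   = my_rank,         int_sum   = 0;
    float float_val = (float) my_rank, float_sum = 0.0f;

    /* one collective call replaces the size-step ring shift above */
    MPI_Allreduce(&int_val,   &int_sum,   1, MPI_INT,   MPI_SUM, MPI_COMM_WORLD);
    MPI_Allreduce(&float_val, &float_sum, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);

    printf("PE%i:\tSum = %i\t%f\n", my_rank, int_sum, float_sum);

    MPI_Finalize();
    return 0;
}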
static int kmr_alltoall_bruck(KMR *mr, void *sbuf, void *rbuf, int cnt) { #define DUMP_(X0,X1,X2,X3,X4) if (tracing) kmr_atoa_dump_(X0,X1,X2,X3,X4) MPI_Comm comm = mr->comm; int nprocs = mr->nprocs; int rank = mr->rank; int tag = KMR_TAG_ATOA; _Bool tracing = mr->trace_alltoall; assert((nprocs & 3) == 0); int nprocs4th = (nprocs / 4); int cc; int lognprocs = 0; while ((1 << lognprocs) < nprocs) { lognprocs++; } assert((1 << lognprocs) == nprocs); char *buf0 = kmr_malloc((size_t)(cnt * nprocs)); char *buf1 = kmr_malloc((size_t)(cnt * nprocs)); memcpy(buf0, sbuf, (size_t)(cnt * nprocs)); MPI_Request rqs[6]; for (int stage = 0; stage < lognprocs; stage += 2) { DUMP_(mr, buf0, cnt, "step", stage); for (int j = 0; j < nprocs4th; j++) { for (int i = 0; i < 4; i++) { void *s = &buf0[cnt * (i + (j * 4))]; void *r = &buf1[cnt * (nprocs4th * i + j)]; memcpy(r, s, (size_t)cnt); } } DUMP_(mr, buf1, cnt, "pack", stage); for (int k = 0; k < 4; k++) { int flip = (k << stage); int peer = (rank ^ flip); int baserank = ((rank >> stage) & 3); int basepeer = ((peer >> stage) & 3); if (k == 0) { void *s = &buf1[cnt * (baserank * nprocs4th)]; void *r = &buf0[cnt * (baserank * nprocs4th)]; memcpy(r, s, (size_t)(cnt * nprocs4th)); } else { void *s = &buf1[cnt * (basepeer * nprocs4th)]; void *r = &buf0[cnt * (basepeer * nprocs4th)]; #if 0 cc = MPI_Sendrecv(s, (cnt * nprocs4th), MPI_BYTE, peer, tag, r, (cnt * nprocs4th), MPI_BYTE, peer, tag, comm, MPI_STATUS_IGNORE); assert(cc == MPI_SUCCESS); #else cc = MPI_Isend(s, (cnt * nprocs4th), MPI_BYTE, peer, tag, comm, &rqs[(k - 1) * 2 + 1]); assert(cc == MPI_SUCCESS); cc = MPI_Irecv(r, (cnt * nprocs4th), MPI_BYTE, peer, tag, comm, &rqs[(k - 1) * 2]); assert(cc == MPI_SUCCESS); #endif } } cc = MPI_Waitall(6, rqs, MPI_STATUSES_IGNORE); assert(cc == MPI_SUCCESS); DUMP_(mr, buf0, cnt, "exchange", stage); } memcpy(rbuf, buf0, (size_t)(cnt * nprocs)); kmr_free(buf0, (size_t)(cnt * nprocs)); kmr_free(buf1, (size_t)(cnt * nprocs)); return MPI_SUCCESS; }
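/* kmr_alltoall_bruck above hand-codes a radix-4 all-to-all on cnt-byte blocks.
 * As a reference point (an assumption of this sketch, not code from KMR), the
 * same data movement can be written with the library collective MPI_Alltoall:
 * block i of the send buffer goes to rank i, and block i of the receive buffer
 * ends up holding the bytes rank i addressed to us. */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int rank, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    const int cnt = 8;                       /* bytes per destination rank */
    char *sbuf = malloc((size_t)cnt * nprocs);
    char *rbuf = malloc((size_t)cnt * nprocs);

    /* block i of the send buffer is destined for rank i */
    for (int i = 0; i < nprocs; i++)
        memset(sbuf + (size_t)i * cnt, rank, (size_t)cnt);

    MPI_Alltoall(sbuf, cnt, MPI_BYTE, rbuf, cnt, MPI_BYTE, MPI_COMM_WORLD);

    printf("rank %d: block from rank 0 starts with byte %d\n", rank, rbuf[0]);

    free(sbuf);
    free(rbuf);
    MPI_Finalize();
    return 0;
}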
int main(int argc, char** argv)
{
    int iter_max = 1000;

    const float pi  = 2.0 * asinf(1.0f);
    const float tol = 1.0e-5f;

    int rank = 0;
    int size = 1;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    memset(A,    0, N * M * sizeof(float));
    memset(Aref, 0, N * M * sizeof(float));

    // set boundary conditions
    for (int j = 0; j < N; j++)
    {
        float y0 = sinf( 2.0 * pi * j / (N-1));
        A[j][0]      = y0;
        A[j][M-1]    = y0;
        Aref[j][0]   = y0;
        Aref[j][M-1] = y0;
    }

#if _OPENACC
    int ngpus = acc_get_num_devices(acc_device_nvidia);
    int devicenum = rank % ngpus;
    acc_set_device_num(devicenum, acc_device_nvidia);
    // Call acc_init after acc_set_device_num to avoid multiple contexts on device 0 in multi GPU systems
    acc_init(acc_device_nvidia);
#endif /*_OPENACC*/

    // Ensure correctness if N%size != 0
    int chunk_size = ceil( (1.0*N)/size );

    int jstart = rank * chunk_size;
    int jend   = jstart + chunk_size;

    // Do not process boundaries
    jstart = max( jstart, 1 );
    jend   = min( jend, N - 1 );

    if ( rank == 0) printf("Jacobi relaxation Calculation: %d x %d mesh\n", N, M);

    if ( rank == 0) printf("Calculate reference solution and time serial execution.\n");
    StartTimer();
    laplace2d_serial( rank, iter_max, tol );
    double runtime_serial = GetTimer();

    //Wait for all processes to ensure correct timing of the parallel version
    MPI_Barrier( MPI_COMM_WORLD );
    if ( rank == 0) printf("Parallel execution.\n");
    StartTimer();

    int iter = 0;
    float error = 1.0f;

    #pragma acc data copy(A) create(Anew)
    while ( error > tol && iter < iter_max )
    {
        error = 0.f;

        #pragma acc kernels
        for (int j = jstart; j < jend; j++)
        {
            for( int i = 1; i < M-1; i++ )
            {
                Anew[j][i] = 0.25f * ( A[j][i+1] + A[j][i-1]
                                     + A[j-1][i] + A[j+1][i]);
                error = fmaxf( error, fabsf(Anew[j][i]-A[j][i]));
            }
        }

        float globalerror = 0.0f;
        MPI_Allreduce( &error, &globalerror, 1, MPI_FLOAT, MPI_MAX, MPI_COMM_WORLD );
        error = globalerror;

        #pragma acc kernels
        for (int j = jstart; j < jend; j++)
        {
            for( int i = 1; i < M-1; i++ )
            {
                A[j][i] = Anew[j][i];
            }
        }

        //Periodic boundary conditions
        int top    = (rank == 0) ? (size-1) : rank-1;
        int bottom = (rank == (size-1)) ? 0 : rank+1;
        #pragma acc host_data use_device( A )
        {
            //1. Send row jstart (first modified row) to top, receive lower boundary (jend) from bottom
            MPI_Sendrecv( A[jstart], M, MPI_FLOAT, top   , 0,
                          A[jend],   M, MPI_FLOAT, bottom, 0,
                          MPI_COMM_WORLD, MPI_STATUS_IGNORE );

            //2. Send row (jend-1) (last modified row) to bottom, receive upper boundary (jstart-1) from top
            MPI_Sendrecv( A[(jend-1)],   M, MPI_FLOAT, bottom, 0,
                          A[(jstart-1)], M, MPI_FLOAT, top   , 0,
                          MPI_COMM_WORLD, MPI_STATUS_IGNORE );
        }

        if(rank == 0 && (iter % 100) == 0) printf("%5d, %0.6f\n", iter, error);

        iter++;
    }
    MPI_Barrier( MPI_COMM_WORLD );
    double runtime = GetTimer();

    if (check_results( rank, jstart, jend, tol ) && rank == 0)
    {
        printf( "Num GPUs: %d\n", size );
        printf( "%dx%d: 1 GPU: %8.4f s, %d GPUs: %8.4f s, speedup: %8.2f, efficiency: %8.2f%%\n",
                N, M, runtime_serial/ 1000.f, size, runtime/ 1000.f,
                runtime_serial/runtime, runtime_serial/(size*runtime)*100 );
    }

    MPI_Finalize();
    return 0;
}
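/* A small serial sketch of the row decomposition used above: chunk = ceil(N/size),
 * then clamp so the physical boundary rows 0 and N-1 are never updated.  When
 * N % size != 0 the last ranks simply get shorter (possibly empty) ranges, which
 * is what the "Ensure correctness if N%size != 0" comment relies on.  The helper
 * name and the example values of N and size are hypothetical. */
#include <math.h>
#include <stdio.h>

static void row_range(int rank, int size, int n, int *jstart, int *jend)
{
    int chunk = (int) ceil((1.0 * n) / size);
    *jstart = rank * chunk;
    *jend   = *jstart + chunk;
    if (*jstart < 1)   *jstart = 1;       /* do not touch boundary row 0   */
    if (*jend > n - 1) *jend   = n - 1;   /* do not touch boundary row N-1 */
}

int main(void)
{
    const int n = 10, size = 4;           /* deliberately n % size != 0 */
    for (int rank = 0; rank < size; rank++) {
        int js, je;
        row_range(rank, size, n, &js, &je);
        printf("rank %d updates rows [%d, %d)\n", rank, js, je);
    }
    return 0;
}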
int main(int argc, char *argv[]) { MPI_Status status; /* MPI status */ int mpierr; /* MPI function return code */ int rank; /* Process rank within MPI_COMM_WORLD */ int size; int dest, src; int tag0=41; /* MPI message tag */ int inject; int report; int iterations; int n_bytes; unsigned char* send_buff; unsigned char* recv_buff; char* tmp; int i, j, count; float fraction, randval; struct timeval tp; if (1 < argc) { if (0 == strncmp(argv[1], "-h", 2) || 0 == strncmp(argv[1], "--h", 3)) { printf("Usage: mpirun --options-- ./sendrecv_blaster <options> where options are:\n" "\tpattern=[self | pair | ring] where\n" "\t\tself => sendrecv with self\n" "\t\tpair => sendrecv with a complementary partner [0 <-> N-1, 1 <-> N-2...]\n" "\t\tring [default] => sendrecv around a ring [0 recvs from N-1 and sends to 1]\n" "\tsize=[value < 0 => max message size in kbytes, value > 0 => max message size in Mbytes (default=1MByte)]\n" "\tinject=[value = #iterations before injecting MPI_Sendrecv to self (default: never)]\n" "\treport=[value = #iterations/reporting point (default: 1000)\n" "\titerations=[value = #iterations before stopping (default: 1000000)\n"); return 0; } } mpierr = MPI_Init(&argc, &argv); if (mpierr != MPI_SUCCESS) { fprintf(stderr, "MPI Error %d (MPI_Init)\n",mpierr); fflush(stderr); MPI_Abort(MPI_COMM_WORLD, -1); } MPI_Errhandler_set(MPI_COMM_WORLD, MPI_ERRORS_RETURN); mpierr = MPI_Comm_rank(MPI_COMM_WORLD, &rank); if (mpierr != MPI_SUCCESS || rank < 0) { fprintf(stderr, "MPI Error %d (MPI_Comm_rank)\n",mpierr); fflush(stderr); MPI_Abort(MPI_COMM_WORLD, -1); } mpierr = MPI_Comm_size(MPI_COMM_WORLD, &size); if (mpierr != MPI_SUCCESS || size < 0) { fprintf(stderr, "MPI Error %d (MPI_Comm_size)\n",mpierr); fflush(stderr); MPI_Abort(MPI_COMM_WORLD, -1); } /* setup defaults in lieu of args */ n_bytes = 1024*1024; inject = -1; report = 1000; iterations = 1000000; /* do a ring */ src = rank - 1; if (src < 0) { src = size - 1; } dest = rank + 1; if (dest > size-1) { dest = 0; } for (i=1; i < argc; i++) { fprintf(stderr, "got %s\n", argv[i]); if (0 == strncmp(argv[i], "pattern", strlen("pattern"))) { tmp = strchr(argv[i], '='); tmp++; if (0 == strcmp(tmp, "self")) { /* just do it with myself */ src = rank; dest = rank; } else if (0 == strcmp(tmp, "pair")) { /* do it pair-wise */ src = (size-1) - rank; dest = src; } else { /* do a ring */ src = rank - 1; if (src < 0) { src = size - 1; } dest = rank + 1; if (dest > size-1) { dest = 0; } } } else if (0 == strncmp(argv[i], "size", strlen("size"))) { tmp = strchr(argv[i], '='); tmp++; n_bytes = atoi(tmp); if (n_bytes < 0) { n_bytes = -1 * n_bytes * 1024; } else { n_bytes = n_bytes * 1024*1024; } } else if (0 == strncmp(argv[i], "inject", strlen("inject"))) { tmp = strchr(argv[i], '='); tmp++; inject = atoi(tmp); } else if (0 == strncmp(argv[i], "report", strlen("report"))) { tmp = strchr(argv[i], '='); tmp++; report = atoi(tmp); } else if (0 == strncmp(argv[i], "iter", strlen("iter"))) { tmp = strchr(argv[i], '='); tmp++; iterations = atoi(tmp); } } send_buff = (unsigned char *) valloc(n_bytes); recv_buff = (unsigned char *) valloc(n_bytes); /* seed the random number generator */ gettimeofday (&tp, NULL); srand (tp.tv_usec); for ( i=0; i<n_bytes; i++ ) { send_buff[i] = i%128; } fprintf(stderr, "Rank %d: recving from src %d sending to dest %d with max buff size %dKbytes\n", rank, src, dest, n_bytes/1024); i=0; while (i < iterations) { randval = rand(); fraction = randval/RAND_MAX; count = fraction * n_bytes; mpierr = MPI_Sendrecv(send_buff, count, 
MPI_CHAR, dest, tag0, recv_buff, n_bytes, MPI_CHAR, src, tag0, MPI_COMM_WORLD, &status); if (mpierr != MPI_SUCCESS) { fprintf(stderr,"MPI Error %d (MPI_Sendrecv) [%d,%d] at iteration %d\n",mpierr,src,dest,i); fflush(stderr); MPI_Abort(MPI_COMM_WORLD, -1); } i++; if (0 == (i % report)) { fprintf(stderr, "Rank %d has completed %dk iterations\n", rank, i/1000); } if (0 < inject && 0 == (i % inject)) { mpierr = MPI_Sendrecv(send_buff, count, MPI_CHAR, rank, tag0, recv_buff, n_bytes, MPI_CHAR, rank, tag0, MPI_COMM_WORLD, &status); if (mpierr != MPI_SUCCESS) { fprintf(stderr,"MPI Error %d (MPI_Sendrecv) [%d,%d] at iteration %d\n",mpierr,rank,rank,i); fflush(stderr); MPI_Abort(MPI_COMM_WORLD, -1); } else { fprintf(stderr, "Rank %d has completed MPI_Sendrecv with myself\n", rank); } } } fprintf(stderr, "Rank %d completed test\n", rank); MPI_Finalize(); }
/*! This function computes the local density for each active SPH particle, * the number of neighbours in the current smoothing radius, and the * divergence and curl of the velocity field. The pressure is updated as * well. If a particle with its smoothing region is fully inside the * local domain, it is not exported to the other processors. The function * also detects particles that have a number of neighbours outside the * allowed tolerance range. For these particles, the smoothing length is * adjusted accordingly, and the density computation is executed again. * Note that the smoothing length is not allowed to fall below the lower * bound set by MinGasHsml. */ void density(void) { long long ntot, ntotleft; int *noffset, *nbuffer, *nsend, *nsend_local, *numlist, *ndonelist; int i, j, n, ndone, npleft, maxfill, source, iter = 0; int level, ngrp, sendTask, recvTask, place, nexport; double dt_entr, tstart, tend, tstart_ngb = 0, tend_ngb = 0; double sumt, sumcomm, timengb, sumtimengb; double timecomp = 0, timeimbalance = 0, timecommsumm = 0, sumimbalance; MPI_Status status; double a3; #ifdef METALS_TG int metal_disperse; long long ntotsave; double a, hubble_param, hubble_a, dt, exp_func, old_met, old_met_tot, new_met, new_met_tot, M_metals_local, M_metals_tot; if(All.ComovingIntegrationOn) { a = All.Time; hubble_param = All.HubbleParam; hubble_a = All.Omega0 / (All.Time * All.Time * All.Time) + (1 - All.Omega0 - All.OmegaLambda) / (All.Time * All.Time) + All.OmegaLambda; hubble_a = All.Hubble * sqrt(hubble_a); } else a = hubble_a = hubble_param = 1.0; #endif if(All.ComovingIntegrationOn) { a3 = All.Time * All.Time * All.Time; } else a3 = 1.0; #ifdef PERIODIC boxSize = All.BoxSize; boxHalf = 0.5 * All.BoxSize; #ifdef LONG_X boxHalf_X = boxHalf * LONG_X; boxSize_X = boxSize * LONG_X; #endif #ifdef LONG_Y boxHalf_Y = boxHalf * LONG_Y; boxSize_Y = boxSize * LONG_Y; #endif #ifdef LONG_Z boxHalf_Z = boxHalf * LONG_Z; boxSize_Z = boxSize * LONG_Z; #endif #endif noffset = malloc(sizeof(int) * NTask); /* offsets of bunches in common list */ nbuffer = malloc(sizeof(int) * NTask); nsend_local = malloc(sizeof(int) * NTask); nsend = malloc(sizeof(int) * NTask * NTask); ndonelist = malloc(sizeof(int) * NTask); for(n = 0, NumSphUpdate = 0; n < N_gas; n++) { if(P[n].ID < 0 || SphP[n].sink > 0.5) /*SINK*/ continue; SphP[n].Left = SphP[n].Right = 0; if(P[n].Ti_endstep == All.Ti_Current) NumSphUpdate++; } numlist = malloc(NTask * sizeof(int) * NTask); MPI_Allgather(&NumSphUpdate, 1, MPI_INT, numlist, 1, MPI_INT, MPI_COMM_WORLD); for(i = 0, ntot = 0; i < NTask; i++) ntot += numlist[i]; free(numlist); #ifdef METALS_TG ntotsave = ntot; M_metals_tot = M_metals_local = 0.0; for(metal_disperse = 0; metal_disperse < 2; metal_disperse++) { ntot = ntotsave; #endif /* we will repeat the whole thing for those particles where we didn't * find enough neighbours */ do { i = 0; /* begin with this index */ ntotleft = ntot; /* particles left for all tasks together */ while(ntotleft > 0) { for(j = 0; j < NTask; j++) nsend_local[j] = 0; /* do local particles and prepare export list */ tstart = second(); for(nexport = 0, ndone = 0; i < N_gas && nexport < All.BunchSizeDensity - NTask; i++) { if(P[i].ID < 0 || SphP[i].sink > 0.5) /*SINK*/ continue; if(P[i].Ti_endstep == All.Ti_Current) { ndone++; for(j = 0; j < NTask; j++) Exportflag[j] = 0; #ifdef METALS_TG density_evaluate(i, 0, metal_disperse); #else density_evaluate(i, 0); #endif for(j = 0; j < NTask; j++) { if(Exportflag[j]) { DensDataIn[nexport].Pos[0] = P[i].Pos[0]; 
DensDataIn[nexport].Pos[1] = P[i].Pos[1]; DensDataIn[nexport].Pos[2] = P[i].Pos[2]; DensDataIn[nexport].Vel[0] = SphP[i].VelPred[0]; DensDataIn[nexport].Vel[1] = SphP[i].VelPred[1]; DensDataIn[nexport].Vel[2] = SphP[i].VelPred[2]; DensDataIn[nexport].bfield[0] = SphP[i].bfield[0]; DensDataIn[nexport].bfield[1] = SphP[i].bfield[1]; DensDataIn[nexport].bfield[2] = SphP[i].bfield[2]; DensDataIn[nexport].Hsml = SphP[i].Hsml; DensDataIn[nexport].Sci = SphP[i].Sci; #ifdef METALS_TG DensDataIn[nexport].Sigma = SphP[i].Sigma; #endif DensDataIn[nexport].Index = i; DensDataIn[nexport].Task = j; nexport++; nsend_local[j]++; } } } } tend = second(); timecomp += timediff(tstart, tend); qsort(DensDataIn, nexport, sizeof(struct densdata_in), dens_compare_key); for(j = 1, noffset[0] = 0; j < NTask; j++) noffset[j] = noffset[j - 1] + nsend_local[j - 1]; tstart = second(); MPI_Allgather(nsend_local, NTask, MPI_INT, nsend, NTask, MPI_INT, MPI_COMM_WORLD); tend = second(); timeimbalance += timediff(tstart, tend); /* now do the particles that need to be exported */ for(level = 1; level < (1 << PTask); level++) { tstart = second(); for(j = 0; j < NTask; j++) nbuffer[j] = 0; for(ngrp = level; ngrp < (1 << PTask); ngrp++) { maxfill = 0; for(j = 0; j < NTask; j++) { if((j ^ ngrp) < NTask) if(maxfill < nbuffer[j] + nsend[(j ^ ngrp) * NTask + j]) maxfill = nbuffer[j] + nsend[(j ^ ngrp) * NTask + j]; } if(maxfill >= All.BunchSizeDensity) break; sendTask = ThisTask; recvTask = ThisTask ^ ngrp; if(recvTask < NTask) { if(nsend[ThisTask * NTask + recvTask] > 0 || nsend[recvTask * NTask + ThisTask] > 0) { /* get the particles */ MPI_Sendrecv(&DensDataIn[noffset[recvTask]], nsend_local[recvTask] * sizeof(struct densdata_in), MPI_BYTE, recvTask, TAG_DENS_A, &DensDataGet[nbuffer[ThisTask]], nsend[recvTask * NTask + ThisTask] * sizeof(struct densdata_in), MPI_BYTE, recvTask, TAG_DENS_A, MPI_COMM_WORLD, &status); } } for(j = 0; j < NTask; j++) if((j ^ ngrp) < NTask) nbuffer[j] += nsend[(j ^ ngrp) * NTask + j]; } tend = second(); timecommsumm += timediff(tstart, tend); tstart = second(); for(j = 0; j < nbuffer[ThisTask]; j++) #ifdef METALS_TG density_evaluate(j, 1, metal_disperse); #else density_evaluate(j, 1); #endif tend = second(); timecomp += timediff(tstart, tend); /* do a block to explicitly measure imbalance */ tstart = second(); MPI_Barrier(MPI_COMM_WORLD); tend = second(); timeimbalance += timediff(tstart, tend); /* get the result */ tstart = second(); for(j = 0; j < NTask; j++) nbuffer[j] = 0; for(ngrp = level; ngrp < (1 << PTask); ngrp++) { maxfill = 0; for(j = 0; j < NTask; j++) { if((j ^ ngrp) < NTask) if(maxfill < nbuffer[j] + nsend[(j ^ ngrp) * NTask + j]) maxfill = nbuffer[j] + nsend[(j ^ ngrp) * NTask + j]; } if(maxfill >= All.BunchSizeDensity) break; sendTask = ThisTask; recvTask = ThisTask ^ ngrp; if(recvTask < NTask) { if(nsend[ThisTask * NTask + recvTask] > 0 || nsend[recvTask * NTask + ThisTask] > 0) { /* send the results */ MPI_Sendrecv(&DensDataResult[nbuffer[ThisTask]], nsend[recvTask * NTask + ThisTask] * sizeof(struct densdata_out), MPI_BYTE, recvTask, TAG_DENS_B, &DensDataPartialResult[noffset[recvTask]], nsend_local[recvTask] * sizeof(struct densdata_out), MPI_BYTE, recvTask, TAG_DENS_B, MPI_COMM_WORLD, &status); /* add the result to the particles */ for(j = 0; j < nsend_local[recvTask]; j++) { source = j + noffset[recvTask]; place = DensDataIn[source].Index; #ifdef METALS_TG if(metal_disperse == 1) { SphP[place].const_A += DensDataPartialResult[source].const_A; SphP[place].const_B += 
DensDataPartialResult[source].const_B; } else { #endif SphP[place].NumNgb += DensDataPartialResult[source].Ngb; SphP[place].Density += DensDataPartialResult[source].Rho; SphP[place].DivVel += DensDataPartialResult[source].Div; SphP[place].DivB += DensDataPartialResult[source].Div_B; SphP[place].DhsmlDensityFactor += DensDataPartialResult[source].DhsmlDensity; SphP[place].Rot[0] += DensDataPartialResult[source].Rot[0]; SphP[place].Rot[1] += DensDataPartialResult[source].Rot[1]; SphP[place].Rot[2] += DensDataPartialResult[source].Rot[2]; SphP[place].GradSci[0] += DensDataPartialResult[source].GSci[0]; SphP[place].GradSci[1] += DensDataPartialResult[source].GSci[1]; SphP[place].GradSci[2] += DensDataPartialResult[source].GSci[2]; SphP[place].VelRel[0] += DensDataPartialResult[source].VRel[0]; SphP[place].VelRel[1] += DensDataPartialResult[source].VRel[1]; SphP[place].VelRel[2] += DensDataPartialResult[source].VRel[2]; #ifdef METALS_TG SphP[place].Sigma += DensDataPartialResult[source].Sigma; } #endif } } } for(j = 0; j < NTask; j++) if((j ^ ngrp) < NTask) nbuffer[j] += nsend[(j ^ ngrp) * NTask + j]; } tend = second(); timecommsumm += timediff(tstart, tend); level = ngrp - 1; } MPI_Allgather(&ndone, 1, MPI_INT, ndonelist, 1, MPI_INT, MPI_COMM_WORLD); for(j = 0; j < NTask; j++) ntotleft -= ndonelist[j]; } /* do final operations on results */ tstart = second(); #ifdef METALS_TG if(metal_disperse == 0) { #endif for(i = 0, npleft = 0; i < N_gas; i++) { if(P[i].ID < 0 || SphP[i].sink > 0.5) /*SINK*/ continue; if(P[i].Ti_endstep == All.Ti_Current) { { SphP[i].DhsmlDensityFactor = 1.0 / (1.0 + SphP[i].Hsml * SphP[i].DhsmlDensityFactor / ((double)(NUMDIMS) * SphP[i].Density)); SphP[i].CurlVel = sqrt(SphP[i].Rot[0] * SphP[i].Rot[0] + SphP[i].Rot[1] * SphP[i].Rot[1] + SphP[i].Rot[2] * SphP[i].Rot[2]) / SphP[i].Density; SphP[i].DivVel /= SphP[i].Density; SphP[i].DivB /= SphP[i].Density; SphP[i].GradSci[0] /= SphP[i].Density; SphP[i].GradSci[1] /= SphP[i].Density; SphP[i].GradSci[2] /= SphP[i].Density; SphP[i].VelRel[0] /= SphP[i].Density; SphP[i].VelRel[1] /= SphP[i].Density; SphP[i].VelRel[2] /= SphP[i].Density; #ifdef POLYTROPE SphP[i].Pressure = get_pressure(SphP[i].Density); #else /* POLYTROPE */ dt_entr = (double)((All.Ti_Current - (P[i].Ti_begstep + P[i].Ti_endstep) / 2.0)) * All.Timebase_interval; #ifdef CHEMCOOL SphP[i].Pressure = (SphP[i].Entropy + SphP[i].DtEntropy * dt_entr) * pow(SphP[i].Density, SphP[i].Gamma); #else /* CHEMCOOL */ SphP[i].Pressure = (SphP[i].Entropy + SphP[i].DtEntropy * dt_entr) * pow(SphP[i].Density, GAMMA); #endif /* CHEMCOOL */ #endif /* POLYTROPE */ } /* now check whether we had enough neighbours */ if(SphP[i].NumNgb < (All.DesNumNgb - All.MaxNumNgbDeviation) || (SphP[i].NumNgb > (All.DesNumNgb + All.MaxNumNgbDeviation) && SphP[i].Hsml > (1.01 * All.MinGasHsml))) { /* need to redo this particle */ npleft++; if(SphP[i].Left > 0 && SphP[i].Right > 0) if((SphP[i].Right - SphP[i].Left) < 1.0e-3 * SphP[i].Left) { /* this one should be ok */ npleft--; P[i].Ti_endstep = -P[i].Ti_endstep - 1; /* Mark as inactive */ continue; } if(SphP[i].NumNgb < (All.DesNumNgb - All.MaxNumNgbDeviation)) SphP[i].Left = dmax(SphP[i].Hsml, SphP[i].Left); else { if(SphP[i].Right != 0) { if(SphP[i].Hsml < SphP[i].Right) SphP[i].Right = SphP[i].Hsml; } else SphP[i].Right = SphP[i].Hsml; } if(iter >= MAXITER - 10) { printf ("i=%d task=%d ID=%d Hsml=%g Left=%g Right=%g Ngbs=%g Right-Left=%g\n pos=(%g|%g|%g)\n", i, ThisTask, (int) P[i].ID, SphP[i].Hsml, SphP[i].Left, SphP[i].Right, (float) 
SphP[i].NumNgb, SphP[i].Right - SphP[i].Left, P[i].Pos[0], P[i].Pos[1], P[i].Pos[2]); fflush(stdout); } if(SphP[i].Right > 0 && SphP[i].Left > 0) SphP[i].Hsml = pow(0.5 * (pow(SphP[i].Left, 3) + pow(SphP[i].Right, 3)), 1.0 / 3.0); else { if(SphP[i].Right == 0 && SphP[i].Left == 0) endrun(8188); /* can't occur */ if(SphP[i].Right == 0 && SphP[i].Left > 0) { if(P[i].Type == 0 && fabs(SphP[i].NumNgb - All.DesNumNgb) < 0.5 * All.DesNumNgb) { SphP[i].Hsml *= 1.0 - (SphP[i].NumNgb - All.DesNumNgb) / ((double)(NUMDIMS) * SphP[i].NumNgb) * SphP[i].DhsmlDensityFactor; } else SphP[i].Hsml *= 1.26; } if(SphP[i].Right > 0 && SphP[i].Left == 0) { if(P[i].Type == 0 && fabs(SphP[i].NumNgb - All.DesNumNgb) < 0.5 * All.DesNumNgb) { SphP[i].Hsml *= 1.0 - (SphP[i].NumNgb - All.DesNumNgb) / ((double)(NUMDIMS) * SphP[i].NumNgb) * SphP[i].DhsmlDensityFactor; } else SphP[i].Hsml /= 1.26; } } if(SphP[i].Hsml < All.MinGasHsml) SphP[i].Hsml = All.MinGasHsml; } else P[i].Ti_endstep = -P[i].Ti_endstep - 1; /* Mark as inactive */ } } #ifdef METALS_TG } else { ntot = 0; } #endif tend = second(); timecomp += timediff(tstart, tend); #ifdef METALS_TG if(metal_disperse == 0) { #endif numlist = malloc(NTask * sizeof(int) * NTask); MPI_Allgather(&npleft, 1, MPI_INT, numlist, 1, MPI_INT, MPI_COMM_WORLD); for(i = 0, ntot = 0; i < NTask; i++) ntot += numlist[i]; free(numlist); if(ntot > 0) { if(iter == 0) tstart_ngb = second(); iter++; if(iter > 0 && ThisTask == 0) { printf("ngb iteration %d: need to repeat for %d%09d particles.\n", iter, (int) (ntot / 1000000000), (int) (ntot % 1000000000)); fflush(stdout); } if(iter > MAXITER) { printf("failed to converge in neighbour iteration in density()\n"); fflush(stdout); endrun(1155); } } else tend_ngb = second(); #ifdef METALS_TG } #endif } while(ntot > 0); /* mark as active again */ for(i = 0; i < NumPart; i++) { if(P[i].Type == 0 && P[i].ID < 0) /*ACCRETED*/ continue; if(P[i].Ti_endstep < 0) P[i].Ti_endstep = -P[i].Ti_endstep - 1; } #ifdef METALS_TG if(metal_disperse == 0) { for(n = 0; n < N_gas; n++) { if(P[n].ID < 0 || SphP[n].sink > 0.5) /*SINK*/ continue; if(P[n].Ti_endstep == All.Ti_Current) { SphP[n].Sigma = sqrt(a*SphP[n].Sigma/SphP[n].NumNgb); } } } } old_met = old_met_tot = new_met = new_met_tot = 0.0; for(n = 0; n < N_gas; n++) { if(P[n].ID < 0 || SphP[n].sink > 0.5) /*SINK*/ continue; if(P[n].Ti_endstep == All.Ti_Current) { dt = (All.Ti_Current-P[n].Ti_begstep)*All.Timebase_interval/hubble_a; old_met += SphP[n].Metallicity*P[n].Mass; exp_func = exp(-SphP[n].const_A*dt); SphP[n].Metallicity *= exp_func; SphP[n].Metallicity += SphP[n].const_B/SphP[n].const_A*(1.0-exp_func); new_met += SphP[n].Metallicity*P[n].Mass; } } MPI_Allreduce(&old_met, &old_met_tot, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); MPI_Allreduce(&new_met, &new_met_tot, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); for(n = 0; n < N_gas; n++) { if(P[n].ID < 0 || SphP[n].sink > 0.5) /*SINK*/ continue; if(P[n].Ti_endstep == All.Ti_Current && new_met_tot > 0.0) { SphP[n].Metallicity *= old_met_tot/new_met_tot; } M_metals_local += SphP[n].Metallicity*P[n].Mass; } MPI_Allreduce(&M_metals_local, &M_metals_tot, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); if(ThisTask == 0) printf("Metal mass = %g\n", M_metals_tot*All.UnitMass_in_g/SOLAR_MASS*Z_SOLAR/hubble_param); #endif free(ndonelist); free(nsend); free(nsend_local); free(nbuffer); free(noffset); /* collect some timing information */ if(iter > 0) timengb = timediff(tstart_ngb, tend_ngb); else timengb = 0; MPI_Reduce(&timengb, &sumtimengb, 1, MPI_DOUBLE, MPI_SUM, 0, 
MPI_COMM_WORLD); MPI_Reduce(&timecomp, &sumt, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); MPI_Reduce(&timecommsumm, &sumcomm, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); MPI_Reduce(&timeimbalance, &sumimbalance, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); if(ThisTask == 0) { All.CPU_HydCompWalk += sumt / NTask; All.CPU_HydCommSumm += sumcomm / NTask; All.CPU_HydImbalance += sumimbalance / NTask; All.CPU_EnsureNgb += sumtimengb / NTask; } }
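The neighbour iteration above brackets the smoothing length between a lower bound (Left) and an upper bound (Right) until the neighbour count falls inside the tolerance band. The following is a minimal, self-contained sketch of that bracketing step, not taken from the code above; all names (adjust_hsml, desngb, hsml_min) are hypothetical.

/* Illustrative sketch (not part of the original source): a stripped-down
 * version of the Left/Right bracketing used above to drive the neighbour
 * count toward a target value. */
#include <math.h>

double adjust_hsml(double hsml, double numngb, double desngb,
                   double *left, double *right, double hsml_min)
{
    if (numngb < desngb)
        *left = fmax(*left, hsml);                            /* too few neighbours: raise lower bound */
    else
        *right = (*right > 0.0) ? fmin(*right, hsml) : hsml;  /* too many: lower upper bound */

    if (*left > 0.0 && *right > 0.0)         /* both bounds known: take the midpoint in volume */
        hsml = pow(0.5 * (pow(*left, 3) + pow(*right, 3)), 1.0 / 3.0);
    else if (*right == 0.0)                  /* no upper bound yet: expand */
        hsml *= 1.26;
    else                                     /* no lower bound yet: shrink */
        hsml /= 1.26;

    return (hsml < hsml_min) ? hsml_min : hsml;   /* enforce the MinGasHsml-style floor */
}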
int main(int argc, char *argv[] ) { double time1, time2; time1 = MPI_Wtime(); int rank, processors; int j; // number of iterations int k; // number of iterations to perform before creating a checkpoint int l; // number of random samples per grid point int checkpoint_resume = 0; // 1 = resume from last checkpoint int c; // used to hold a character int i=0, row = 0, col = 0, pln = 0; // array iterators char ***local_array; char **local_array_2nd; char *local_array_pointer; char ***local_array_copy; char **local_array_copy_2nd; char *local_array_copy_pointer; char ***temp, *temp_pointer; int file_open_error; int command_line_incomplete = 0; int grid_size[3] = {0,0,0}; int proc_size[3] = {0,0,0}; int local_size[3] = {0,0,0}; int remainder_size[3] = {0,0,0}; int coords[3] = {0,0,0}; int start_indices[3] = {0,0,0}; int periods[3] = {0,0,0}; int mem_size[3] = {0,0,0}; MPI_Status status; MPI_Datatype filetype, memtype; MPI_File fh; MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &processors); MPI_Comm_rank(MPI_COMM_WORLD, &rank); // Interpret the command line arguments -------------------------------- if (rank == 0) { if (argc < 7 || argc > 8) { /* x, y, z, j, k and l are required; r is optional */ fputs("usage: x y z j k l r\n", stderr); fputs("where: x,y,z = x, y and z dimensions\n", stderr); fputs(" j = how many times the game of life is played\n", stderr); fputs(" k = checkpoint every k iterations\n", stderr); fputs(" l = number of random samples per grid point\n", stderr); fputs(" r = resume from the last checkpoint\n", stderr); fputs(INITIAL, stderr); fputs(" must be present.\n", stderr); fputs(CHECKPOINT, stderr); fputs(" must be present if resuming from the last checkpoint.\n", stderr); exit(EXIT_FAILURE); } } j = (int) strtol(argv[4], NULL, 10); k = (int) strtol(argv[5], NULL, 10); l = (int) strtol(argv[6], NULL, 10); if ( argc == 8 ) if ( argv[7][0] == 'r' ) checkpoint_resume = 1; /* the optional resume flag is the 7th argument */ if (rank == 0) printf("%d iterations \ncheckpoint every %d iterations \n%d samples per grid point \ncheckpoint resume = %d\n", j,k,l,checkpoint_resume); grid_size[0] = (int) strtol(argv[1], NULL, 10); grid_size[1] = (int) strtol(argv[2], NULL, 10); grid_size[2] = (int) strtol(argv[3], NULL, 10); if (rank==0) printf("grid_size: %d, %d, %d\n", grid_size[0], grid_size[1], grid_size[2]); MPI_Dims_create(processors, 3, proc_size); if (rank==0) printf("proc_size: %d, %d, %d\n", proc_size[0], proc_size[1], proc_size[2]); local_size[0] = grid_size[0] / proc_size[0]; local_size[1] = grid_size[1] / proc_size[1]; local_size[2] = grid_size[2] / proc_size[2]; if (rank==0) printf("local_size: %d, %d, %d\n", local_size[0], local_size[1], local_size[2]); remainder_size[0] = grid_size[0] % proc_size[0]; remainder_size[1] = grid_size[1] % proc_size[1]; remainder_size[2] = grid_size[2] % proc_size[2]; if (rank==0) printf("remainder_size: %d, %d, %d\n", remainder_size[0], remainder_size[1], remainder_size[2]); if (remainder_size[0] != 0 || remainder_size[1] != 0 || remainder_size[2] != 0) { fputs("remainder size != 0, check your dimensions", stderr); MPI_Finalize(); exit(EXIT_FAILURE); } MPI_Comm comm; MPI_Cart_create(MPI_COMM_WORLD, 3, proc_size, periods, 0, &comm); MPI_Comm_rank(comm, &rank); MPI_Cart_coords(comm, rank, 3, coords); start_indices[0] = coords[0] * local_size[0]; start_indices[1] = coords[1] * local_size[1]; start_indices[2] = coords[2] * local_size[2]; /* printf("A coords R%d: (%d, %d, %d) (%d, %d, %d)\n", rank, coords[0], coords[1], coords[2], start_indices[0], start_indices[1], start_indices[2]);*/ fflush(stdout); // create the file type 
--------------------------------------------------- MPI_Type_create_subarray(3, grid_size, local_size, start_indices, MPI_ORDER_C, MPI_CHAR, &filetype); MPI_Type_commit(&filetype); // create a local memory type with ghost rows ----------------------------- mem_size[0] = local_size[0] + 2; mem_size[1] = local_size[1] + 2; mem_size[2] = local_size[2] + 2; start_indices[0] = start_indices[1] = start_indices[2] = 1; MPI_Type_create_subarray(3, mem_size, local_size, start_indices, MPI_ORDER_C, MPI_CHAR, &memtype); MPI_Type_commit(&memtype); // find my neighbors ------------------------------------------------------ int nxminus, nxplus, nyminus, nyplus, nzminus, nzplus, tag = 333, *neighbors; // Neighbors Array: row- col- col+ row+ plane- plane+ neighbors = (int *) malloc(6 * sizeof(int)); for(i=0; i<6; i++) neighbors[i] = rank; MPI_Cart_shift(comm, 0, 1, &nxminus, &nxplus); MPI_Cart_shift(comm, 1, 1, &nyminus, &nyplus); MPI_Cart_shift(comm, 2, 1, &nzminus, &nzplus); // printf(" %d sending south to %d receiving from %d \n",rank,nxplus,nxminus); // fflush(stdout); MPI_Sendrecv(&rank, 1, MPI_INT, nxplus, tag, &(neighbors[0]), 1, MPI_INT, nxminus, tag, comm, &status); // printf(" %d sending North to %d receiving from %d \n",rank,nxminus,nxplus); // fflush(stdout); MPI_Sendrecv(&rank, 1, MPI_INT, nxminus, tag, &(neighbors[3]), 1, MPI_INT, nxplus, tag, comm, &status); // printf(" %d sending East to %d receiving from %d \n",rank,nyplus,nyminus); // fflush(stdout); MPI_Sendrecv(&rank, 1, MPI_INT, nyplus, tag, &neighbors[1], 1, MPI_INT, nyminus, tag, comm, &status); // printf(" %d sending West to %d receiving from %d \n",rank,nyminus,nyplus); // fflush(stdout); MPI_Sendrecv(&rank, 1, MPI_INT, nyminus, tag, &neighbors[2], 1, MPI_INT, nyplus, tag, comm, &status); // printf(" %d sending backwards to %d receiving from %d \n",rank,nzplus,nzminus); // fflush(stdout); MPI_Sendrecv(&rank, 1, MPI_INT, nzplus, tag, &(neighbors[4]), 1, MPI_INT, nzminus, tag, comm, &status); // printf(" %d sending forward to %d receiving from %d \n",rank,nzminus,nzplus); // fflush(stdout); MPI_Sendrecv(&rank, 1, MPI_INT, nzminus, tag, &(neighbors[5]), 1, MPI_INT, nzplus, tag, comm, &status); /* printf("neighboors R%d : (row-) %d (col-) %d (col+) %d (row+) %d (plane-) %d (plane+) %d\n",rank,neighbors[0],neighbors[1],neighbors[2],neighbors[3],neighbors[4],neighbors[5]);*/ fflush(stdout); //init_sprng(1,time(0),SPRNG_DEFAULT); srand((unsigned int)time(NULL)); // Open the initial condition (checkpoint or not) ---------------------- if ( checkpoint_resume ) { file_open_error = MPI_File_open(MPI_COMM_WORLD, CHECKPOINT, MPI_MODE_CREATE | MPI_MODE_RDWR, MPI_INFO_NULL, &fh); MPI_File_set_view(fh,0, MPI_CHAR, filetype, "native", MPI_INFO_NULL); } else { file_open_error = MPI_File_open(MPI_COMM_WORLD, INITIAL, MPI_MODE_CREATE | MPI_MODE_RDWR, MPI_INFO_NULL, &fh); MPI_File_set_view(fh,0, MPI_CHAR, filetype, "native", MPI_INFO_NULL); } if (file_open_error != MPI_SUCCESS) { if (checkpoint_resume) fputs(CHECKPOINT, stderr); else fputs(INITIAL, stderr); fputs(" could not be opened.\n", stderr); exit(EXIT_FAILURE); } // Allocate and Populate the local array ---------------------------------- local_array_copy_pointer = (char *) malloc(mem_size[0] * mem_size[1] * mem_size[2] * sizeof(char)); local_array_copy_2nd = (char **) malloc(mem_size[0] * mem_size[1] * sizeof(char*)); local_array_copy = (char ***) malloc(mem_size[0] * sizeof(char*)); for(i = 0; i < mem_size[0] * mem_size[1]; i++) local_array_copy_2nd[i] = &local_array_copy_pointer[i * 
mem_size[2]]; for(i = 0; i < mem_size[0]; i++) local_array_copy[i] = &local_array_copy_2nd[i * mem_size[1]]; local_array_pointer = (char *) malloc(mem_size[0] * mem_size[1] * mem_size[2] * sizeof(char)); local_array_2nd = (char **) malloc(mem_size[0] * mem_size[1] * sizeof(char*)); local_array = (char ***) malloc(mem_size[0] * sizeof(char*)); for(i = 0; i < mem_size[0] * mem_size[1]; i++) local_array_2nd[i] = &local_array_pointer[i * mem_size[2]]; for(i = 0; i < mem_size[0]; i++) local_array[i] = &local_array_2nd[i * mem_size[1]]; // if (rank==0) printf("Malloc complete\n"); for(row=0; row<mem_size[0]; row++) { for(col=0; col<mem_size[1]; col++) { for(pln=0; pln<mem_size[2]; pln++) { local_array[row][col][pln] = local_array_copy[row][col][pln] = '0'; } } } // if (rank==0) printf("Setup complete\n"); MPI_File_read_all(fh, local_array_pointer, 1, memtype, &status); if (rank==0) printf("File Read\n"); // if (rank==0) { // for(row=0; row<mem_size[0]; row++) { // for(col=0; col<mem_size[1]; col++) { // for(pln=0; pln<mem_size[2]; pln++) { // printf("%c", local_array[row][col][pln]); // } // printf("\n"); // } // printf("-----------------------\n"); // } // } MPI_File_close(&fh); // Construct the plane data types MPI_Datatype yzplane; MPI_Type_vector(local_size[1], local_size[2], local_size[2]+2, MPI_CHAR, &yzplane); MPI_Type_commit(&yzplane); MPI_Datatype xzplane; MPI_Type_vector(local_size[0], local_size[2], ((local_size[2]+2)*local_size[1])+((local_size[2]+2)*2), MPI_CHAR, &xzplane); MPI_Type_commit(&xzplane); // this type will also copy the corner x columns, can't skip blocks intermittently // since we aren't worrying about the corner data, it's ok MPI_Datatype xyplane; MPI_Type_vector((local_size[0]*local_size[1])+((local_size[0]*2)-2), 1, local_size[2]+2, MPI_CHAR, &xyplane); MPI_Type_commit(&xyplane); MPI_Barrier(comm); // start the iteration loop int iterations; int kCounter = k; for (iterations = 0; iterations < j; iterations++) { // send updated planes // Neighbors Array: // 0 1 2 3 4 5 // row- col- col+ row+ plane- plane+ // Note: corners are not handled // send top yzplane if (rank != neighbors[0]) MPI_Send(&local_array[1][1][1], 1, yzplane, neighbors[0], 0, comm); // recv bottom yzplane if (rank != neighbors[3]) MPI_Recv(&local_array[local_size[0]+1][1][1], 1, yzplane, neighbors[3], 0, comm, &status); // send bottom yzplane if (rank != neighbors[3]) MPI_Send(&local_array[local_size[0]][1][1], 1, yzplane, neighbors[3], 0, comm); // recv top yzplane if (rank != neighbors[0]) MPI_Recv(&local_array[0][1][1], 1, yzplane, neighbors[0], 0, comm, &status); // send left xzplane if (rank != neighbors[1]) MPI_Send(&local_array[1][1][1], 1, xzplane, neighbors[1], 0, comm); // recv right xzplane if (rank != neighbors[2]) MPI_Recv(&local_array[1][local_size[1]+1][1], 1, xzplane, neighbors[2], 0, comm, &status); // send right xzplane if (rank != neighbors[2]) MPI_Send(&local_array[1][local_size[1]][1], 1, xzplane, neighbors[2], 0, comm); // recv left xzplane if (rank != neighbors[1]) MPI_Recv(&local_array[1][0][1], 1, xzplane, neighbors[1], 0, comm, &status); // send front xyplane if (rank != neighbors[4]) MPI_Send(&local_array[1][1][1], 1, xyplane, neighbors[4], 0, comm); // recv back xyplane if (rank != neighbors[5]) MPI_Recv(&local_array[1][1][local_size[2]+1], 1, xyplane, neighbors[5], 0, comm, &status); // send back xyplane if (rank != neighbors[5]) MPI_Send(&local_array[1][1][local_size[2]], 1, xyplane, neighbors[5], 0, comm); // recv front xyplane if (rank != neighbors[4]) 
MPI_Recv(&local_array[1][1][0], 1, xyplane, neighbors[4], 0, comm, &status); // if (rank==0) { // for(row=0; row<mem_size[0]; row++) { // for(col=0; col<mem_size[1]; col++) { // for(pln=0; pln<mem_size[2]; pln++) { // printf("%c", local_array[row][col][pln]); // } // printf("\n"); // } // printf("-----------------------\n"); // } // } // run the game of life // gameOfLife(local_array, local_array_copy, local_size[0], local_size[1], l, rank); // swap the arrays // temp1 = local_array; // local_array = local_array_copy; // local_array_copy = temp1; // // temp2 = local_array_pointer; // local_array_pointer = local_array_copy_pointer; // local_array_copy_pointer = temp2; // check to see if this iteration needs a checkpoint kCounter--; if (kCounter == 0) { kCounter = k; // checkpoint code MPI_File_open(MPI_COMM_WORLD, CHECKPOINT, MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &fh); MPI_File_set_view(fh, 0, MPI_CHAR, filetype, "native", MPI_INFO_NULL); MPI_File_write_all(fh, local_array_pointer, 1, memtype, &status); MPI_File_close(&fh); if (rank == 0) printf("Checkpoint made: Iteration %d\n", iterations+1); } // end if kCounter == 0 } // end iteration loop iterations--; // all done! repeat the checkpoint process MPI_File_open(MPI_COMM_WORLD, FINAL_RESULTS, MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &fh); MPI_File_set_view(fh, 0, MPI_CHAR, filetype, "native", MPI_INFO_NULL); MPI_File_write_all(fh, local_array_pointer, 1, memtype, &status); MPI_File_close(&fh); if (rank == 0) printf("Final Results made: Iteration %d\n", iterations+1); time2 = MPI_Wtime(); if (rank == 0) printf("Elapsed Seconds: %f\n", time2-time1);fflush(stdout); MPI_Finalize(); return EXIT_SUCCESS; }
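The program above checkpoints its distributed grid with a subarray file view and a collective write. The snippet below is a hedged, stand-alone sketch of that MPI-IO pattern for a simple 1-D decomposition; the file name checkpoint.example and the block size of 64 chars per rank are made up for illustration.

/* Minimal sketch: each rank writes its local block into the right slice of
 * one shared file via a subarray file view and a collective write. */
#include <mpi.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    int rank, nprocs;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    int gsize[1] = { 64 * nprocs };   /* global extent */
    int lsize[1] = { 64 };            /* local block */
    int start[1] = { 64 * rank };     /* offset of my block in the global array */

    MPI_Datatype filetype;
    MPI_Type_create_subarray(1, gsize, lsize, start, MPI_ORDER_C, MPI_CHAR, &filetype);
    MPI_Type_commit(&filetype);

    char *block = malloc(lsize[0]);
    for (int i = 0; i < lsize[0]; i++)
        block[i] = '0' + rank % 10;   /* dummy payload */

    MPI_File fh;
    MPI_File_open(MPI_COMM_WORLD, "checkpoint.example",
                  MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &fh);
    MPI_File_set_view(fh, 0, MPI_CHAR, filetype, "native", MPI_INFO_NULL);
    MPI_File_write_all(fh, block, lsize[0], MPI_CHAR, MPI_STATUS_IGNORE);
    MPI_File_close(&fh);

    free(block);
    MPI_Type_free(&filetype);
    MPI_Finalize();
    return 0;
}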
void subfind_contamination(void) { int i, j, ndone, ndone_flag, dummy, count; int ngrp, sendTask, recvTask, place, nexport, nimport; struct unbind_data *d; d = (struct unbind_data *) mymalloc(NumPart * sizeof(struct unbind_data)); for(i = 0, count = 0; i < NumPart; i++) #ifdef DENSITY_SPLIT_BY_TYPE if(!((1 << P[i].Type) & (DENSITY_SPLIT_BY_TYPE))) #else if(!((1 << P[i].Type) & (FOF_PRIMARY_LINK_TYPES))) #endif d[count++].index = i; force_treebuild(count, d); /* construct tree only with boundary particles */ myfree(d); /* allocate buffers to arrange communication */ All.BunchSize = (int) ((All.BufferSize * 1024 * 1024) / (sizeof(struct data_index) + sizeof(struct data_nodelist) + sizeof(struct contamdata_in) + sizeof(struct contamdata_out) + sizemax(sizeof(struct contamdata_in), sizeof(struct contamdata_out)))); DataIndexTable = (struct data_index *) mymalloc(All.BunchSize * sizeof(struct data_index)); DataNodeList = (struct data_nodelist *) mymalloc(All.BunchSize * sizeof(struct data_nodelist)); /* we will repeat the whole thing for those groups where we didn't converge to a SO radius yet */ i = 0; /* begin with this index */ do { for(j = 0; j < NTask; j++) { Send_count[j] = 0; Exportflag[j] = -1; } /* do local particles and prepare export list */ for(nexport = 0; i < Ngroups; i++) { if(Group[i].R_Mean200 > 0) { if(subfind_contamination_evaluate(i, 0, &nexport, Send_count) < 0) break; } else { Group[i].ContaminationLen = 0; Group[i].ContaminationMass = 0; } } qsort(DataIndexTable, nexport, sizeof(struct data_index), data_index_compare); MPI_Allgather(Send_count, NTask, MPI_INT, Sendcount_matrix, NTask, MPI_INT, MPI_COMM_WORLD); for(j = 0, nimport = 0, Recv_offset[0] = 0, Send_offset[0] = 0; j < NTask; j++) { Recv_count[j] = Sendcount_matrix[j * NTask + ThisTask]; nimport += Recv_count[j]; if(j > 0) { Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1]; Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1]; } } ContamGet = (struct contamdata_in *) mymalloc(nimport * sizeof(struct contamdata_in)); ContamIn = (struct contamdata_in *) mymalloc(nexport * sizeof(struct contamdata_in)); /* prepare particle data for export */ for(j = 0; j < nexport; j++) { place = DataIndexTable[j].Index; ContamIn[j].Pos[0] = Group[place].Pos[0]; ContamIn[j].Pos[1] = Group[place].Pos[1]; ContamIn[j].Pos[2] = Group[place].Pos[2]; ContamIn[j].R200 = Group[place].R_Mean200; memcpy(ContamIn[j].NodeList, DataNodeList[DataIndexTable[j].IndexGet].NodeList, NODELISTLENGTH * sizeof(int)); } /* exchange data */ for(ngrp = 1; ngrp < (1 << PTask); ngrp++) { sendTask = ThisTask; recvTask = ThisTask ^ ngrp; if(recvTask < NTask) { if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0) { /* get the data */ MPI_Sendrecv(&ContamIn[Send_offset[recvTask]], Send_count[recvTask] * sizeof(struct contamdata_in), MPI_BYTE, recvTask, TAG_DENS_A, &ContamGet[Recv_offset[recvTask]], Recv_count[recvTask] * sizeof(struct contamdata_in), MPI_BYTE, recvTask, TAG_DENS_A, MPI_COMM_WORLD, MPI_STATUS_IGNORE); } } } myfree(ContamIn); ContamResult = (struct contamdata_out *) mymalloc(nimport * sizeof(struct contamdata_out)); ContamOut = (struct contamdata_out *) mymalloc(nexport * sizeof(struct contamdata_out)); /* now do the locations that were sent to us */ for(j = 0; j < nimport; j++) subfind_contamination_evaluate(j, 1, &dummy, &dummy); if(i >= Ngroups) ndone_flag = 1; else ndone_flag = 0; MPI_Allreduce(&ndone_flag, &ndone, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); /* get the result */ for(ngrp = 1; ngrp < (1 << PTask); ngrp++) { sendTask = 
ThisTask; recvTask = ThisTask ^ ngrp; if(recvTask < NTask) { if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0) { /* send the results */ MPI_Sendrecv(&ContamResult[Recv_offset[recvTask]], Recv_count[recvTask] * sizeof(struct contamdata_out), MPI_BYTE, recvTask, TAG_DENS_B, &ContamOut[Send_offset[recvTask]], Send_count[recvTask] * sizeof(struct contamdata_out), MPI_BYTE, recvTask, TAG_DENS_B, MPI_COMM_WORLD, MPI_STATUS_IGNORE); } } } /* add the result to the local particles */ for(j = 0; j < nexport; j++) { place = DataIndexTable[j].Index; Group[place].ContaminationLen += ContamOut[j].ContaminationLen; Group[place].ContaminationMass += ContamOut[j].ContaminationMass; } myfree(ContamOut); myfree(ContamResult); myfree(ContamGet); } while(ndone < NTask); myfree(DataNodeList); myfree(DataIndexTable); }
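Both the export and the result phases above pair tasks with partner = ThisTask ^ ngrp, so each round of the loop is a disjoint set of pairwise exchanges. A minimal sketch of that schedule, with dummy payloads and hypothetical names, might look as follows.

/* Illustrative sketch: the rank ^ ngrp pairing schedule, exchanging only a
 * single int per round. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rank, ntask;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &ntask);

    int ptask = 0;                       /* smallest power of two >= ntask */
    while ((1 << ptask) < ntask)
        ptask++;

    for (int ngrp = 1; ngrp < (1 << ptask); ngrp++) {
        int partner = rank ^ ngrp;       /* every task has exactly one partner per round */
        if (partner < ntask) {
            int sendval = rank, recvval = -1;
            MPI_Sendrecv(&sendval, 1, MPI_INT, partner, 0,
                         &recvval, 1, MPI_INT, partner, 0,
                         MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            printf("round %d: rank %d <-> rank %d\n", ngrp, rank, recvval);
        }
    }

    MPI_Finalize();
    return 0;
}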
double runBicomTest(int procs, int bufsize, int iters, int rank, int wsize, int procsPerNode, char allocPattern, int useBarrier, int useNearestRank, MPI_Comm *activeComm) { int i, currtarg; double start, diff, max; char *sendbuf, *recvbuf; double totalops; MPI_Status stat; currtarg = getTargetRank(rank, wsize, procsPerNode, useNearestRank); diff = 0; if ( isActiveProc(rank, wsize, procsPerNode, procs, allocPattern, useNearestRank) ) { sendbuf = (char*)malloc(bufsize); recvbuf = (char*)malloc(bufsize); memset(sendbuf, 0, bufsize); memset(recvbuf, 0, bufsize); /* Ensure communication paths have been initialized */ MPI_Sendrecv(sendbuf, bufsize, MPI_CHAR, currtarg, 0, recvbuf, bufsize, MPI_CHAR, currtarg, 0, MPI_COMM_WORLD, &stat); //generic_barrier(*activeComm); MPI_Barrier(*activeComm); //generic_barrier(*activeComm); MPI_Barrier(*activeComm); /* Time operation loop */ start = MPI_Wtime(); for ( i = 0; i < iters; i++ ) { MPI_Sendrecv(sendbuf, bufsize, MPI_CHAR, currtarg, 0, recvbuf, bufsize, MPI_CHAR, currtarg, 0, MPI_COMM_WORLD, &stat); } if ( useBarrier ) MPI_Barrier(*activeComm); //generic_barrier(*activeComm); diff = MPI_Wtime() - start; free(sendbuf); free(recvbuf); } MPI_Barrier(MPI_COMM_WORLD); /* Get maximum sample length */ MPI_Reduce(&diff, &max, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); if ( rank == 0 ) { totalops = (double)(iters * procs); /* Bandwidth is calculated as : (# of processes * operations per time sample * message size) ------------------------------------------------------------ maximum sample length of all processes */ if ( max > 0 ) { printf(outputFormat, "Bidirectional", procs, bufsize, ((double)totalops*(double)bufsize/max)/1000000, max/iters*1000000); } else { printf("Invalid measurement. Increase the number of operations per measurement.\n"); } } return max; }
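The bandwidth printed above follows the formula in the comment: total operations times message size, divided by the longest sample across ranks. A small helper with a hypothetical name that performs the same arithmetic (reporting MB/s with MB = 10^6 bytes, as in the printf) could look like this.

/* Sketch of the bandwidth arithmetic described in the comment above, under
 * the same assumptions (maximum sample length across ranks, one message per
 * operation). */
double bidirectional_bandwidth_mbs(int procs, int iters, int bufsize, double max_time)
{
    double totalops = (double) procs * (double) iters;
    if (max_time <= 0.0)
        return 0.0;
    /* bytes moved per second, reported in MB/s as in the printf above */
    return (totalops * (double) bufsize / max_time) / 1.0e6;
}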
void CSysMatrix::SendReceive_Solution(CSysVector & x, CGeometry *geometry, CConfig *config) { unsigned short iVar, iMarker, MarkerS, MarkerR; unsigned long iVertex, iPoint, nVertexS, nVertexR, nBufferS_Vector, nBufferR_Vector; double *Buffer_Receive = NULL, *Buffer_Send = NULL; int send_to, receive_from; #ifdef HAVE_MPI MPI_Status status; #endif for (iMarker = 0; iMarker < config->GetnMarker_All(); iMarker++) { if ((config->GetMarker_All_KindBC(iMarker) == SEND_RECEIVE) && (config->GetMarker_All_SendRecv(iMarker) > 0)) { MarkerS = iMarker; MarkerR = iMarker+1; send_to = config->GetMarker_All_SendRecv(MarkerS)-1; receive_from = abs(config->GetMarker_All_SendRecv(MarkerR))-1; nVertexS = geometry->nVertex[MarkerS]; nVertexR = geometry->nVertex[MarkerR]; nBufferS_Vector = nVertexS*nVar; nBufferR_Vector = nVertexR*nVar; /*--- Allocate Receive and send buffers ---*/ Buffer_Receive = new double [nBufferR_Vector]; Buffer_Send = new double[nBufferS_Vector]; /*--- Copy the solution that should be sent ---*/ for (iVertex = 0; iVertex < nVertexS; iVertex++) { iPoint = geometry->vertex[MarkerS][iVertex]->GetNode(); for (iVar = 0; iVar < nVar; iVar++) Buffer_Send[iVertex*nVar+iVar] = x[iPoint*nVar+iVar]; } #ifdef HAVE_MPI /*--- Send/Receive information using Sendrecv ---*/ MPI_Sendrecv(Buffer_Send, nBufferS_Vector, MPI_DOUBLE, send_to, 0, Buffer_Receive, nBufferR_Vector, MPI_DOUBLE, receive_from, 0, MPI_COMM_WORLD, &status); #else /*--- Receive information without MPI: copy with the same (vertex-major) packing used above ---*/ for (iVertex = 0; iVertex < nVertexR; iVertex++) { iPoint = geometry->vertex[MarkerR][iVertex]->GetNode(); for (iVar = 0; iVar < nVar; iVar++) Buffer_Receive[iVertex*nVar+iVar] = Buffer_Send[iVertex*nVar+iVar]; } #endif /*--- Deallocate send buffer ---*/ delete [] Buffer_Send; /*--- Do the coordinate transformation ---*/ for (iVertex = 0; iVertex < nVertexR; iVertex++) { /*--- Find point and its type of transformation ---*/ iPoint = geometry->vertex[MarkerR][iVertex]->GetNode(); /*--- Copy transformed conserved variables back into buffer. ---*/ for (iVar = 0; iVar < nVar; iVar++) x[iPoint*nVar+iVar] = Buffer_Receive[iVertex*nVar+iVar]; } /*--- Deallocate receive buffer ---*/ delete [] Buffer_Receive; } } }
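The routine above packs donor-point values vertex by vertex, exchanges the buffers with a single MPI_Sendrecv, and unpacks into the halo points. A stripped-down C sketch of that pack/exchange/unpack pattern, with hypothetical names and a flat solution array, is shown below; it is an illustration, not the SU2 implementation.

/* Hedged sketch: pack / MPI_Sendrecv / unpack for nvar doubles per point. */
#include <mpi.h>
#include <stdlib.h>

void exchange_halo(double *x, const int *send_ids, int nsend,
                   const int *recv_ids, int nrecv, int nvar,
                   int send_to, int recv_from, MPI_Comm comm)
{
    double *sendbuf = malloc((size_t) nsend * nvar * sizeof(double));
    double *recvbuf = malloc((size_t) nrecv * nvar * sizeof(double));

    /* pack the donor points, vertex-major */
    for (int i = 0; i < nsend; i++)
        for (int v = 0; v < nvar; v++)
            sendbuf[i * nvar + v] = x[send_ids[i] * nvar + v];

    /* one combined send/receive avoids a manual ordering of Send and Recv */
    MPI_Sendrecv(sendbuf, nsend * nvar, MPI_DOUBLE, send_to, 0,
                 recvbuf, nrecv * nvar, MPI_DOUBLE, recv_from, 0,
                 comm, MPI_STATUS_IGNORE);

    /* unpack into the halo points */
    for (int i = 0; i < nrecv; i++)
        for (int v = 0; v < nvar; v++)
            x[recv_ids[i] * nvar + v] = recvbuf[i * nvar + v];

    free(sendbuf);
    free(recvbuf);
}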
void dd_sendrecv2_rvec(const gmx_domdec_t gmx_unused *dd, int gmx_unused ddimind, rvec gmx_unused *buf_s_fw, int gmx_unused n_s_fw, rvec gmx_unused *buf_r_fw, int gmx_unused n_r_fw, rvec gmx_unused *buf_s_bw, int gmx_unused n_s_bw, rvec gmx_unused *buf_r_bw, int gmx_unused n_r_bw) { #ifdef GMX_MPI int rank_fw, rank_bw, nreq; MPI_Request req[4]; MPI_Status stat[4]; rank_fw = dd->neighbor[ddimind][0]; rank_bw = dd->neighbor[ddimind][1]; if (!dd->bSendRecv2) { /* Try to send and receive in two directions simultaneously. * Should be faster, especially on machines * with full 3D communication networks. * However, it could be that communication libraries are * optimized for MPI_Sendrecv and non-blocking MPI calls * are slower. * SendRecv2 can be turned on with the env.var. GMX_DD_SENDRECV2 */ nreq = 0; if (n_r_fw) { MPI_Irecv(buf_r_fw[0], n_r_fw*sizeof(rvec), MPI_BYTE, rank_bw, 0, dd->mpi_comm_all, &req[nreq++]); } if (n_r_bw) { MPI_Irecv(buf_r_bw[0], n_r_bw*sizeof(rvec), MPI_BYTE, rank_fw, 1, dd->mpi_comm_all, &req[nreq++]); } if (n_s_fw) { MPI_Isend(buf_s_fw[0], n_s_fw*sizeof(rvec), MPI_BYTE, rank_fw, 0, dd->mpi_comm_all, &req[nreq++]); } if (n_s_bw) { MPI_Isend(buf_s_bw[0], n_s_bw*sizeof(rvec), MPI_BYTE, rank_bw, 1, dd->mpi_comm_all, &req[nreq++]); } if (nreq) { MPI_Waitall(nreq, req, stat); } } else { /* Communicate in two ordered phases. * This is slower, even on a dual-core Opteron cluster * with a single full-duplex network connection per machine. */ /* Forward */ MPI_Sendrecv(buf_s_fw[0], n_s_fw*sizeof(rvec), MPI_BYTE, rank_fw, 0, buf_r_fw[0], n_r_fw*sizeof(rvec), MPI_BYTE, rank_bw, 0, dd->mpi_comm_all, &stat[0]); /* Backward */ MPI_Sendrecv(buf_s_bw[0], n_s_bw*sizeof(rvec), MPI_BYTE, rank_bw, 0, buf_r_bw[0], n_r_bw*sizeof(rvec), MPI_BYTE, rank_fw, 0, dd->mpi_comm_all, &stat[0]); } #endif }
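The comment above contrasts posting both directions with non-blocking calls against two ordered MPI_Sendrecv phases. The following hedged sketch reduces each direction to a single int on a ring; rank_fw and rank_bw are the forward and backward neighbours, and the function name is hypothetical.

/* Illustrative sketch of the two strategies described in the comment above. */
#include <mpi.h>

void exchange_both_directions(int use_sendrecv2, int rank_fw, int rank_bw,
                              int send_fw, int send_bw,
                              int *recv_fw, int *recv_bw, MPI_Comm comm)
{
    if (!use_sendrecv2) {
        /* post both receives and both sends, then wait: lets the library
         * overlap the two directions if it can */
        MPI_Request req[4];
        MPI_Irecv(recv_fw, 1, MPI_INT, rank_bw, 0, comm, &req[0]);
        MPI_Irecv(recv_bw, 1, MPI_INT, rank_fw, 1, comm, &req[1]);
        MPI_Isend(&send_fw, 1, MPI_INT, rank_fw, 0, comm, &req[2]);
        MPI_Isend(&send_bw, 1, MPI_INT, rank_bw, 1, comm, &req[3]);
        MPI_Waitall(4, req, MPI_STATUSES_IGNORE);
    } else {
        /* two ordered phases with MPI_Sendrecv: simpler, possibly slower */
        MPI_Sendrecv(&send_fw, 1, MPI_INT, rank_fw, 0,
                     recv_fw, 1, MPI_INT, rank_bw, 0, comm, MPI_STATUS_IGNORE);
        MPI_Sendrecv(&send_bw, 1, MPI_INT, rank_bw, 0,
                     recv_bw, 1, MPI_INT, rank_fw, 0, comm, MPI_STATUS_IGNORE);
    }
}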
int main( int argc, char **argv ) { MPI_Request r1; int size, rank; int err = 0; int partner, buf[10], flag, idx, index; MPI_Status status; MPI_Init( &argc, &argv ); MPI_Comm_size( MPI_COMM_WORLD, &size ); MPI_Comm_rank( MPI_COMM_WORLD, &rank ); if (size < 2) { printf( "Cancel test requires at least 2 processes\n" ); MPI_Abort( MPI_COMM_WORLD, 1 ); } /* * Here is the test. First, we ensure an unsatisfied Irecv: * process 0 process size-1 * Sendrecv Sendrecv * Irecv ---- * Cancel ---- * Sendrecv Sendrecv * Next, we confirm receipt before canceling * Irecv Send * Sendrecv Sendrecv * Cancel */ if (rank == 0) { partner = size - 1; /* Cancel succeeds for wait/waitall */ MPI_Recv_init( buf, 10, MPI_INT, partner, 0, MPI_COMM_WORLD, &r1 ); MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, 1, MPI_BOTTOM, 0, MPI_INT, partner, 1, MPI_COMM_WORLD, &status ); MPI_Start( &r1 ); MPI_Cancel( &r1 ); MPI_Wait( &r1, &status ); MPI_Test_cancelled( &status, &flag ); MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, 1, MPI_BOTTOM, 0, MPI_INT, partner, 1, MPI_COMM_WORLD, &status ); if (!flag) { err++; printf( "Cancel of a receive failed where it should succeed (Wait).\n" ); } MPI_Request_free( &r1 ); /* Cancel fails for test/testall */ buf[0] = -1; MPI_Recv_init( buf, 10, MPI_INT, partner, 2, MPI_COMM_WORLD, &r1 ); MPI_Start( &r1 ); MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, 1, MPI_BOTTOM, 0, MPI_INT, partner, 1, MPI_COMM_WORLD, &status ); MPI_Cancel( &r1 ); MPI_Test( &r1, &flag, &status ); MPI_Test_cancelled( &status, &flag ); if (flag) { err++; printf( "Cancel of a receive succeeded where it shouldn't (Test).\n" ); if (buf[0] != -1) { printf( "Receive buffer changed even though cancel suceeded! (Test).\n" ); } } MPI_Request_free( &r1 ); /* Cancel succeeds for waitany */ MPI_Recv_init( buf, 10, MPI_INT, partner, 0, MPI_COMM_WORLD, &r1 ); MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, 1, MPI_BOTTOM, 0, MPI_INT, partner, 1, MPI_COMM_WORLD, &status ); MPI_Start( &r1 ); MPI_Cancel( &r1 ); MPI_Waitany( 1, &r1, &idx, &status ); MPI_Test_cancelled( &status, &flag ); MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, 1, MPI_BOTTOM, 0, MPI_INT, partner, 1, MPI_COMM_WORLD, &status ); if (!flag) { err++; printf( "Cancel of a receive failed where it should succeed (Waitany).\n" ); } MPI_Request_free( &r1 ); /* Cancel fails for testany */ buf[0] = -1; MPI_Recv_init( buf, 10, MPI_INT, partner, 2, MPI_COMM_WORLD, &r1 ); MPI_Start( &r1 ); MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, 1, MPI_BOTTOM, 0, MPI_INT, partner, 1, MPI_COMM_WORLD, &status ); MPI_Cancel( &r1 ); MPI_Testany( 1, &r1, &idx, &flag, &status ); MPI_Test_cancelled( &status, &flag ); if (flag) { err++; printf( "Cancel of a receive succeeded where it shouldn't (Testany).\n" ); if (buf[0] != -1) { printf( "Receive buffer changed even though cancel suceeded! 
(Test).\n" ); } } MPI_Request_free( &r1 ); /* Cancel succeeds for waitsome */ MPI_Recv_init( buf, 10, MPI_INT, partner, 0, MPI_COMM_WORLD, &r1 ); MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, 1, MPI_BOTTOM, 0, MPI_INT, partner, 1, MPI_COMM_WORLD, &status ); MPI_Start( &r1 ); MPI_Cancel( &r1 ); MPI_Waitsome( 1, &r1, &idx, &index, &status ); MPI_Test_cancelled( &status, &flag ); MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, 1, MPI_BOTTOM, 0, MPI_INT, partner, 1, MPI_COMM_WORLD, &status ); if (!flag) { err++; printf( "Cancel of a receive failed where it should succeed (Waitsome).\n" ); } MPI_Request_free( &r1 ); /* Cancel fails for testsome*/ buf[0] = -1; MPI_Recv_init( buf, 10, MPI_INT, partner, 2, MPI_COMM_WORLD, &r1 ); MPI_Start( &r1 ); MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, 1, MPI_BOTTOM, 0, MPI_INT, partner, 1, MPI_COMM_WORLD, &status ); MPI_Cancel( &r1 ); MPI_Testsome( 1, &r1, &idx, &index, &status ); MPI_Test_cancelled( &status, &flag ); if (flag) { err++; printf( "Cancel of a receive succeeded where it shouldn't (Testsome).\n" ); if (buf[0] != -1) { printf( "Receive buffer changed even though cancel suceeded! (Testsome).\n" ); } } MPI_Request_free( &r1 ); if (err) { printf( "Test failed with %d errors.\n", err ); } else { printf( " No Errors\n" ); } } else if (rank == size - 1) { partner = 0; /* Cancel succeeds for wait/waitall */ MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, 1, MPI_BOTTOM, 0, MPI_INT, partner, 1, MPI_COMM_WORLD, &status ); MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, 1, MPI_BOTTOM, 0, MPI_INT, partner, 1, MPI_COMM_WORLD, &status ); /* Cancel fails for test/testall */ buf[0] = 3; MPI_Send( buf, 3, MPI_INT, partner, 2, MPI_COMM_WORLD ); MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, 1, MPI_BOTTOM, 0, MPI_INT, partner, 1, MPI_COMM_WORLD, &status ); /* Cancel succeeds for waitany */ MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, 1, MPI_BOTTOM, 0, MPI_INT, partner, 1, MPI_COMM_WORLD, &status ); MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, 1, MPI_BOTTOM, 0, MPI_INT, partner, 1, MPI_COMM_WORLD, &status ); /* Cancel fails for testany */ MPI_Send( buf, 3, MPI_INT, partner, 2, MPI_COMM_WORLD ); MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, 1, MPI_BOTTOM, 0, MPI_INT, partner, 1, MPI_COMM_WORLD, &status ); /* Cancel succeeds for waitsome */ MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, 1, MPI_BOTTOM, 0, MPI_INT, partner, 1, MPI_COMM_WORLD, &status ); MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, 1, MPI_BOTTOM, 0, MPI_INT, partner, 1, MPI_COMM_WORLD, &status ); /* Cancel fails for waitsome */ MPI_Send( buf, 3, MPI_INT, partner, 2, MPI_COMM_WORLD ); MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, partner, 1, MPI_BOTTOM, 0, MPI_INT, partner, 1, MPI_COMM_WORLD, &status ); /* Next test - check that a cancel for a request receive from MPI_PROC_NULL succeeds (there is some suspicion that some systems can't handle this - also, MPI_REQUEST_NULL */ /* A null request is an error. (null objects are errors unless otherwise allowed) r1 = MPI_REQUEST_NULL; MPI_Cancel( &r1 ); */ MPI_Recv_init( buf, 10, MPI_INT, MPI_PROC_NULL, 0, MPI_COMM_WORLD, &r1 ); MPI_Start( &r1 ); MPI_Cancel( &r1 ); MPI_Request_free( &r1 ); /* Must complete cancel. We know that it won't complete, so we don't need to do anything else */ } MPI_Finalize(); return 0; }
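The test above exercises MPI_Cancel against receives completed with the various wait/test calls. As a minimal stand-alone illustration (not part of the test suite), the program below posts a receive on a tag nobody sends on, cancels it, completes it, and checks MPI_Test_cancelled.

/* Minimal cancel example: the receive can never be matched, so the cancel
 * should succeed; a cancelled request must still be completed. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int buf[10], flag;
    MPI_Request req;
    MPI_Status status;

    MPI_Init(&argc, &argv);

    /* no rank ever sends with tag 999, so this receive stays unmatched */
    MPI_Irecv(buf, 10, MPI_INT, MPI_ANY_SOURCE, 999, MPI_COMM_WORLD, &req);
    MPI_Cancel(&req);
    MPI_Wait(&req, &status);              /* complete the cancelled request */
    MPI_Test_cancelled(&status, &flag);
    printf(flag ? "receive was cancelled\n" : "receive completed normally\n");

    MPI_Finalize();
    return 0;
}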
/* ************************************************************************ */ static void calculate_jacobi (struct calculation_arguments const* arguments, struct calculation_results *results, struct options const* options) { int i, j; /* local variables for loops */ int m1, m2; /* used as indices for old and new matrices */ double star; /* average of the four neighbouring values (plus optional source term) */ double residuum; /* residuum of current iteration */ double maxresiduum; /* maximum residuum value of this process in the current iteration */ const int nproc = arguments->nproc; const int rank = arguments->rank; int const N = arguments->N; int const N_global = arguments->N_global; double const h = arguments->h; int term_iteration = options->term_iteration; /* initialize m1 and m2 depending on algorithm */ if (options->method == METH_JACOBI) { m1 = 0; m2 = 1; } else { m1 = 0; m2 = 0; } while (term_iteration > 0) { double** Matrix_Out = arguments->Matrix[m1]; double** Matrix_In = arguments->Matrix[m2]; maxresiduum = 0; /* over all rows */ for (i = 1; i < N; i++) { /* over all columns */ for (j = 1; j < N_global; j++) { star = 0.25 * (Matrix_In[i-1][j] + Matrix_In[i][j-1] + Matrix_In[i][j+1] + Matrix_In[i+1][j]); if (options->inf_func == FUNC_FPISIN) { star += (0.25 * TWO_PI_SQUARE * h * h) * sin((PI * h) * ((double)i + arguments->offset)) * sin((PI * h) * (double)j); } if (options->termination == TERM_PREC || term_iteration == 1) { residuum = Matrix_In[i][j] - star; residuum = (residuum < 0) ? -residuum : residuum; maxresiduum = (residuum < maxresiduum) ? maxresiduum : residuum; } Matrix_Out[i][j] = star; } } // Exchange boundary rows with the neighbouring ranks, into each other's extra allocated (ghost) row if(rank > 0) { MPI_Sendrecv(Matrix_Out[1], N_global, MPI_DOUBLE, rank - 1, rank, Matrix_Out[0], N_global, MPI_DOUBLE, rank - 1, rank - 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE); } // Last rank can't communicate with higher ranks because there aren't any if(rank != nproc - 1) { MPI_Sendrecv(Matrix_Out[N - 1], N_global, MPI_DOUBLE, rank + 1, rank, Matrix_Out[N], N_global, MPI_DOUBLE, rank + 1, rank + 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE); } /* exchange m1 and m2 */ i = m1; m1 = m2; m2 = i; // Find the largest maxresiduum across all processes MPI_Allreduce(MPI_IN_PLACE, &maxresiduum, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); results->stat_iteration++; results->stat_precision = maxresiduum; /* check for stopping calculation, depending on termination method */ if (options->termination == TERM_PREC) { if (maxresiduum < options->term_precision) { term_iteration = 0; } } else if (options->termination == TERM_ITER) { term_iteration--; } } results->m = m2; }
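The two MPI_Sendrecv calls above exchange boundary rows with the neighbouring ranks. A compact sketch of the same ghost-row exchange, written with MPI_PROC_NULL so the first and last rank need no special-case branches, is given below; the layout (rows[0] and rows[nlocal+1] as ghost rows) and all names are assumptions for illustration.

/* Hedged sketch: ghost-row exchange for a 1-D row decomposition. */
#include <mpi.h>

void exchange_ghost_rows(double **rows, int nlocal, int ncols,
                         int rank, int nproc, MPI_Comm comm)
{
    /* MPI_PROC_NULL makes sends/receives at the domain edges no-ops */
    int up   = (rank > 0)         ? rank - 1 : MPI_PROC_NULL;
    int down = (rank < nproc - 1) ? rank + 1 : MPI_PROC_NULL;

    /* send my first real row up, receive my lower ghost row from below */
    MPI_Sendrecv(rows[1],          ncols, MPI_DOUBLE, up,   0,
                 rows[nlocal + 1], ncols, MPI_DOUBLE, down, 0,
                 comm, MPI_STATUS_IGNORE);

    /* send my last real row down, receive my upper ghost row from above */
    MPI_Sendrecv(rows[nlocal],     ncols, MPI_DOUBLE, down, 1,
                 rows[0],          ncols, MPI_DOUBLE, up,   1,
                 comm, MPI_STATUS_IGNORE);
}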
void transpose_start_exchange_step(transpose_mpi_plan p, int el_size, TRANSPOSE_EL_TYPE *local_data, TRANSPOSE_EL_TYPE *send_buf, int step, transpose_sync_type sync_type) { if (p->local_nx > 0 || p->local_ny > 0) { transpose_mpi_exchange *exchange = p->exchange; int block = exchange[step].block_num; int send_block_size = p->send_block_size; int recv_block_size = p->recv_block_size; if (exchange[step].dest_pe != p->my_pe) { /* first, copy to send buffer: */ if (exchange[step].send_size > 0) memcpy(send_buf, local_data + el_size*send_block_size*block, el_size * exchange[step].send_size * sizeof(TRANSPOSE_EL_TYPE)); #define DO_ISEND \ if (exchange[step].send_size > 0) { \ ISEND(send_buf, \ exchange[step].send_size * el_size, \ p->el_type, \ exchange[step].dest_pe, 0, \ p->comm, \ &p->request[0]); \ } p->request[0] = MPI_REQUEST_NULL; p->request[1] = MPI_REQUEST_NULL; if (sync_type == TRANSPOSE_ASYNC) { /* Note that we impose an ordering on the sends and receives (lower pe sends first) so that we won't have deadlock if Isend & Irecv are blocking in some MPI implementation: */ if (p->my_pe < exchange[step].dest_pe) DO_ISEND; if (exchange[step].recv_size > 0) { MPI_Irecv(local_data + el_size*recv_block_size*block, exchange[step].recv_size * el_size, p->el_type, exchange[step].dest_pe, MPI_ANY_TAG, p->comm, &p->request[1]); } if (p->my_pe > exchange[step].dest_pe) DO_ISEND; } else /* (sync_type == TRANSPOSE_SYNC) */ { MPI_Status status; MPI_Sendrecv(send_buf, exchange[step].send_size * el_size, p->el_type, exchange[step].dest_pe, 0, local_data + el_size*recv_block_size*block, exchange[step].recv_size * el_size, p->el_type, exchange[step].dest_pe, MPI_ANY_TAG, p->comm, &status); } } else if (exchange[step].recv_size > 0 && recv_block_size != send_block_size) memmove(local_data + el_size*recv_block_size*block, local_data + el_size*send_block_size*block, exchange[step].recv_size * el_size * sizeof(TRANSPOSE_EL_TYPE)); } }
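The comment above explains why the non-blocking sends and receives are posted in rank order: if Isend/Irecv were effectively blocking, an unordered pairwise exchange could deadlock. A minimal sketch of that ordering rule for a single pair, with hypothetical buffer names, follows.

/* Sketch of the ordering trick in the comment above: the lower rank posts
 * its send first, the higher rank posts its receive first, so the exchange
 * completes even if the nonblocking calls behave like blocking ones. */
#include <mpi.h>

void ordered_pair_exchange(double *sendbuf, double *recvbuf, int count,
                           int partner, int my_rank, MPI_Comm comm)
{
    MPI_Request req[2];
    if (my_rank < partner) {
        MPI_Isend(sendbuf, count, MPI_DOUBLE, partner, 0, comm, &req[0]);
        MPI_Irecv(recvbuf, count, MPI_DOUBLE, partner, 0, comm, &req[1]);
    } else {
        MPI_Irecv(recvbuf, count, MPI_DOUBLE, partner, 0, comm, &req[0]);
        MPI_Isend(sendbuf, count, MPI_DOUBLE, partner, 0, comm, &req[1]);
    }
    MPI_Waitall(2, req, MPI_STATUSES_IGNORE);
}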
int main( int argc, char *argv[] ) { int errs = 0; int rank, size, source, dest; unsigned char *buf, *bufp; int minsize = 2; int i, msgsize, bufsize, outsize; unsigned char *msg1, *msg2, *msg3; MPI_Comm comm; MPI_Status status1, status2, status3; MTest_Init( &argc, &argv ); /* The following illustrates the use of the routines to run through a selection of communicators and datatypes. Use subsets of these for tests that do not involve combinations of communicators, datatypes, and counts of datatypes */ msgsize = 128 * 1024; msg1 = (unsigned char *)malloc( 3 * msgsize ); msg2 = msg1 + msgsize; msg3 = msg2 + msgsize; while (MTestGetIntracommGeneral( &comm, minsize, 1 )) { if (comm == MPI_COMM_NULL) continue; /* Determine the sender and receiver */ MPI_Comm_rank( comm, &rank ); MPI_Comm_size( comm, &size ); source = 0; dest = size - 1; /* Here is the test: The sender */ if (rank == source) { /* Get a bsend buffer. Make it large enough that the Bsend internals will (probably) not use an eager send for the data. Have three such messages */ bufsize = 3 * (MPI_BSEND_OVERHEAD + msgsize); buf = (unsigned char *)malloc( bufsize ); if (!buf) { fprintf( stderr, "Unable to allocate a buffer of %d bytes\n", bufsize ); MPI_Abort( MPI_COMM_WORLD, 1 ); } MPI_Buffer_attach( buf, bufsize ); /* Initialize the buffers */ for (i=0; i<msgsize; i++) { msg1[i] = 0xff ^ (i & 0xff); msg2[i] = 0xff ^ (3*i & 0xff); msg3[i] = 0xff ^ (5*i & 0xff); } /* Initiate the bsends */ MPI_Bsend( msg1, msgsize, MPI_CHAR, dest, 0, comm ); MPI_Bsend( msg2, msgsize, MPI_CHAR, dest, 0, comm ); MPI_Bsend( msg3, msgsize, MPI_CHAR, dest, 0, comm ); /* Synchronize with our partner */ MPI_Sendrecv( 0, 0, MPI_CHAR, dest, 10, 0, 0, MPI_CHAR, dest, 10, comm, MPI_STATUS_IGNORE ); /* Detach the buffers. There should be pending operations */ MPI_Buffer_detach ( &bufp, &outsize ); if (bufp != buf) { fprintf( stderr, "Wrong buffer returned\n" ); errs++; } if (outsize != bufsize) { fprintf( stderr, "Wrong buffer size returned\n" ); errs++; } } else if (rank == dest) { double tstart; /* Clear the message buffers */ for (i=0; i<msgsize; i++) { msg1[i] = 0; msg2[i] = 0; msg3[i] = 0; } /* Wait for the synchronize */ MPI_Sendrecv( 0, 0, MPI_CHAR, source, 10, 0, 0, MPI_CHAR, source, 10, comm, MPI_STATUS_IGNORE ); /* Wait 2 seconds */ tstart = MPI_Wtime(); while (MPI_Wtime() - tstart < 2.0) ; /* Now receive the messages */ MPI_Recv( msg1, msgsize, MPI_CHAR, source, 0, comm, &status1 ); MPI_Recv( msg2, msgsize, MPI_CHAR, source, 0, comm, &status2 ); MPI_Recv( msg3, msgsize, MPI_CHAR, source, 0, comm, &status3 ); /* Check that we have the correct data */ for (i=0; i<msgsize; i++) { if (msg1[i] != (0xff ^ (i & 0xff))) { if (errs < 10) { fprintf( stderr, "msg1[%d] = %d\n", i, msg1[i] ); } errs++; } if (msg2[i] != (0xff ^ (3*i & 0xff))) { if (errs < 10) { fprintf( stderr, "msg2[%d] = %d\n", i, msg2[i] ); } errs++; } if (msg3[i] != (0xff ^ (5*i & 0xff))) { if (errs < 10) { fprintf( stderr, "msg3[%d] = %d\n", i, msg3[i] ); } errs++; } } } MTestFreeComm( &comm ); } free( msg1 ); MTest_Finalize( errs ); MPI_Finalize(); return 0; }
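The sender side above attaches a buffer sized for three messages plus MPI_BSEND_OVERHEAD each, and relies on MPI_Buffer_detach blocking until the buffered data has been delivered. The sketch below isolates that bookkeeping; the sizes are illustrative and the destination rank is assumed to post matching receives.

/* Minimal sketch of buffered-send bookkeeping: each MPI_Bsend needs room for
 * the message plus MPI_BSEND_OVERHEAD, and the detach waits for delivery.
 * The caller on rank `dest` is assumed to receive all nmsg messages. */
#include <mpi.h>
#include <stdlib.h>

void bsend_example(int dest, MPI_Comm comm)
{
    const int msgsize = 128 * 1024;
    const int nmsg = 3;
    int bufsize = nmsg * (msgsize + MPI_BSEND_OVERHEAD);

    char *buf = malloc(bufsize);
    char *msg = malloc(msgsize);
    MPI_Buffer_attach(buf, bufsize);

    for (int i = 0; i < nmsg; i++)
        MPI_Bsend(msg, msgsize, MPI_CHAR, dest, 0, comm);

    /* returns only once the buffered data is safely out of the buffer */
    int outsize;
    char *outbuf;
    MPI_Buffer_detach(&outbuf, &outsize);

    free(msg);
    free(buf);
}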
void bsmooth(void) { long long ntot, ntotleft; int *noffset, *nbuffer, *nsend, *nsend_local, *ndonelist, *numlist; int i, j, n; int ndone; int maxfill, source; int level, ngrp, sendTask, recvTask; int place, nexport; double tstart, tend; double maxt, sumt, sumcomm; double timecomp = 0, timeimbalance = 0, timecommsumm = 0; double timengb, sumtimengb; MPI_Status status; int Smooth_Flag = 0; double dB; if(ThisTask == 0) printf("Flag_FullStep = %d, Main TimestepCounts = %d\n", Flag_FullStep, All.MainTimestepCounts); if(Flag_FullStep == 1) { if((All.MainTimestepCounts % All.BSmoothInt == 0) && (All.BSmoothInt >= 0)) { Smooth_Flag = 1; if(ThisTask == 0) printf("Smoothing B %d, %f\n", All.BSmoothInt, All.BSmoothFrac); } All.MainTimestepCounts++; } noffset = mymalloc(sizeof(int) * NTask); /* offsets of bunches in common list */ nbuffer = mymalloc(sizeof(int) * NTask); nsend_local = mymalloc(sizeof(int) * NTask); nsend = mymalloc(sizeof(int) * NTask * NTask); ndonelist = mymalloc(sizeof(int) * NTask); for(n = 0, NumSphUpdate = 0; n < N_gas; n++) { #ifdef SFR if(P[n].Type == 0) #endif { SphP[n].Left = SphP[n].Right = 0; if(P[n].Ti_endstep == All.Ti_Current) NumSphUpdate++; } } numlist = mymalloc(NTask * sizeof(int) * NTask); MPI_Allgather(&NumSphUpdate, 1, MPI_INT, numlist, 1, MPI_INT, MPI_COMM_WORLD); for(i = 0, ntot = 0; i < NTask; i++) ntot += numlist[i]; myfree(numlist); i = 0; /* beginn with this index */ ntotleft = ntot; /* particles left for all tasks together */ while(ntotleft > 0) { for(j = 0; j < NTask; j++) nsend_local[j] = 0; /* do local particles and prepare export list */ tstart = second(); for(nexport = 0, ndone = 0; i < N_gas && nexport < All.BunchSizeDensity - NTask; i++) #ifdef SFR if(P[i].Type == 0) #endif if(P[i].Ti_endstep == All.Ti_Current) { ndone++; for(j = 0; j < NTask; j++) Exportflag[j] = 0; bsmooth_evaluate(i, 0); for(j = 0; j < NTask; j++) { if(Exportflag[j]) { DensDataIn[nexport].Pos[0] = P[i].Pos[0]; DensDataIn[nexport].Pos[1] = P[i].Pos[1]; DensDataIn[nexport].Pos[2] = P[i].Pos[2]; DensDataIn[nexport].Hsml = PPP[i].Hsml; DensDataIn[nexport].Index = i; DensDataIn[nexport].Task = j; nexport++; nsend_local[j]++; } } } tend = second(); timecomp += timediff(tstart, tend); qsort(DensDataIn, nexport, sizeof(struct densdata_in), dens_compare_key); for(j = 1, noffset[0] = 0; j < NTask; j++) noffset[j] = noffset[j - 1] + nsend_local[j - 1]; tstart = second(); MPI_Allgather(nsend_local, NTask, MPI_INT, nsend, NTask, MPI_INT, MPI_COMM_WORLD); tend = second(); timeimbalance += timediff(tstart, tend); /* now do the particles that need to be exported */ for(level = 1; level < (1 << PTask); level++) { tstart = second(); for(j = 0; j < NTask; j++) nbuffer[j] = 0; for(ngrp = level; ngrp < (1 << PTask); ngrp++) { maxfill = 0; for(j = 0; j < NTask; j++) { if((j ^ ngrp) < NTask) if(maxfill < nbuffer[j] + nsend[(j ^ ngrp) * NTask + j]) maxfill = nbuffer[j] + nsend[(j ^ ngrp) * NTask + j]; } if(maxfill >= All.BunchSizeForce) break; sendTask = ThisTask; recvTask = ThisTask ^ ngrp; if(recvTask < NTask) { if(nsend[ThisTask * NTask + recvTask] > 0 || nsend[recvTask * NTask + ThisTask] > 0) { /* get the particles */ MPI_Sendrecv(&DensDataIn[noffset[recvTask]], nsend_local[recvTask] * sizeof(struct densdata_in), MPI_BYTE, recvTask, TAG_BSMTH_A, &DensDataGet[nbuffer[ThisTask]], nsend[recvTask * NTask + ThisTask] * sizeof(struct densdata_in), MPI_BYTE, recvTask, TAG_BSMTH_A, MPI_COMM_WORLD, &status); } } for(j = 0; j < NTask; j++) if((j ^ ngrp) < NTask) nbuffer[j] += nsend[(j ^ ngrp) * NTask + j]; 
} tend = second(); timecommsumm += timediff(tstart, tend); tstart = second(); for(j = 0; j < nbuffer[ThisTask]; j++) { bsmooth_evaluate(j, 1); } tend = second(); timecomp += timediff(tstart, tend); /* do a block to measure imbalance */ tstart = second(); MPI_Barrier(MPI_COMM_WORLD); tend = second(); timeimbalance += timediff(tstart, tend); /* get the result */ tstart = second(); for(j = 0; j < NTask; j++) nbuffer[j] = 0; for(ngrp = level; ngrp < (1 << PTask); ngrp++) { maxfill = 0; for(j = 0; j < NTask; j++) { if((j ^ ngrp) < NTask) if(maxfill < nbuffer[j] + nsend[(j ^ ngrp) * NTask + j]) maxfill = nbuffer[j] + nsend[(j ^ ngrp) * NTask + j]; } if(maxfill >= All.BunchSizeForce) break; sendTask = ThisTask; recvTask = ThisTask ^ ngrp; if(recvTask < NTask) { if(nsend[ThisTask * NTask + recvTask] > 0 || nsend[recvTask * NTask + ThisTask] > 0) { /* send the results */ MPI_Sendrecv(&DensDataResult[nbuffer[ThisTask]], nsend[recvTask * NTask + ThisTask] * sizeof(struct densdata_out), MPI_BYTE, recvTask, TAG_BSMTH_B, &DensDataPartialResult[noffset[recvTask]], nsend_local[recvTask] * sizeof(struct densdata_out), MPI_BYTE, recvTask, TAG_BSMTH_B, MPI_COMM_WORLD, &status); /* add the result to the particles */ for(j = 0; j < nsend_local[recvTask]; j++) { source = j + noffset[recvTask]; place = DensDataIn[source].Index; SphP[place].BSmooth[0] += DensDataPartialResult[source].BSmooth[0]; SphP[place].BSmooth[1] += DensDataPartialResult[source].BSmooth[1]; SphP[place].BSmooth[2] += DensDataPartialResult[source].BSmooth[2]; SphP[place].DensityNorm += DensDataPartialResult[source].DensityNorm; } } } for(j = 0; j < NTask; j++) if((j ^ ngrp) < NTask) nbuffer[j] += nsend[(j ^ ngrp) * NTask + j]; } tend = second(); timecommsumm += timediff(tstart, tend); level = ngrp - 1; } MPI_Allgather(&ndone, 1, MPI_INT, ndonelist, 1, MPI_INT, MPI_COMM_WORLD); for(j = 0; j < NTask; j++) ntotleft -= ndonelist[j]; } /* do final operations on results */ tstart = second(); for(i = 0; i < N_gas; i++) { #ifdef SFR if(P[i].Type == 0) #endif if(P[i].Ti_endstep == All.Ti_Current) { SphP[i].BSmooth[0] /= SphP[i].DensityNorm; SphP[i].BSmooth[1] /= SphP[i].DensityNorm; SphP[i].BSmooth[2] /= SphP[i].DensityNorm; if(Smooth_Flag == 1) { dB = All.BSmoothFrac * (SphP[i].BSmooth[0] - SphP[i].BPred[0]); SphP[i].BPred[0] += dB; SphP[i].B[0] += dB; dB = All.BSmoothFrac * (SphP[i].BSmooth[1] - SphP[i].BPred[1]); SphP[i].BPred[1] += dB; SphP[i].B[1] += dB; dB = All.BSmoothFrac * (SphP[i].BSmooth[2] - SphP[i].BPred[2]); SphP[i].BPred[2] += dB; SphP[i].B[2] += dB; } } } tend = second(); timecomp += timediff(tstart, tend); myfree(ndonelist); myfree(nsend); myfree(nsend_local); myfree(nbuffer); myfree(noffset); /* collect some timing information */ timengb = 0; MPI_Reduce(&timengb, &sumtimengb, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); MPI_Reduce(&timecomp, &sumt, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); MPI_Reduce(&timecommsumm, &sumcomm, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); timeimbalance += timecomp + timecommsumm; MPI_Reduce(&timeimbalance, &maxt, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); if(ThisTask == 0) { All.CPU_HydCompWalk += sumt / NTask; All.CPU_HydCommSumm += sumcomm / NTask; All.CPU_HydImbalance += maxt - (sumt + sumcomm) / NTask; All.CPU_EnsureNgb += sumtimengb / NTask; } }
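When Smooth_Flag is set, each component of the field is nudged a fraction All.BSmoothFrac of the way toward its kernel-smoothed value. The helper below, with hypothetical names, restates that relaxation step in isolation.

/* Illustrative sketch: move each component a fraction `frac` of the way
 * toward the smoothed value, as done for B and BPred above. */
static void relax_toward_smoothed(double b[3], const double b_smoothed[3], double frac)
{
    for (int k = 0; k < 3; k++) {
        double db = frac * (b_smoothed[k] - b[k]);
        b[k] += db;
    }
}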
/*! Calculates the long-range potential using the PM method. The potential is * Gaussian filtered with Asmth, given in mesh-cell units. We carry out a CIC * charge assignment, and compute the potenial by Fourier transform * methods. The CIC kernel is deconvolved. */ void pmpotential_periodic(void) { double k2, kx, ky, kz, smth; double dx, dy, dz; double fx, fy, fz, ff; double asmth2, fac; int i, j, slab, level, sendTask, recvTask; int x, y, z, ip; int slab_x, slab_y, slab_z; int slab_xx, slab_yy, slab_zz; int meshmin[3], meshmax[3], sendmin, sendmax, recvmin, recvmax; int rep, ncont, cont_sendmin[2], cont_sendmax[2], cont_recvmin[2], cont_recvmax[2]; int dimx, dimy, dimz, recv_dimx, recv_dimy, recv_dimz; MPI_Status status; if(ThisTask == 0) { printf("Starting periodic PM calculation.\n"); fflush(stdout); } asmth2 = (2 * M_PI) * All.Asmth[0] / All.BoxSize; asmth2 *= asmth2; fac = All.G / (M_PI * All.BoxSize); /* to get potential */ force_treefree(); /* first, establish the extension of the local patch in the PMGRID */ for(j = 0; j < 3; j++) { meshmin[j] = PMGRID; meshmax[j] = 0; } for(i = 0; i < NumPart; i++) { for(j = 0; j < 3; j++) { slab = to_slab_fac * P[i].Pos[j]; if(slab >= PMGRID) slab = PMGRID - 1; if(slab < meshmin[j]) meshmin[j] = slab; if(slab > meshmax[j]) meshmax[j] = slab; } } MPI_Allgather(meshmin, 3, MPI_INT, meshmin_list, 3, MPI_INT, MPI_COMM_WORLD); MPI_Allgather(meshmax, 3, MPI_INT, meshmax_list, 3, MPI_INT, MPI_COMM_WORLD); dimx = meshmax[0] - meshmin[0] + 2; dimy = meshmax[1] - meshmin[1] + 2; dimz = meshmax[2] - meshmin[2] + 2; pm_init_periodic_allocate((dimx + 4) * (dimy + 4) * (dimz + 4)); #ifdef FFTW3 /* Create plan for in-place r2c DFT */ fft_forward_plan = fftw_mpi_plan_dft_r2c_3d(PMGRID, PMGRID, PMGRID, rhogrid, fft_of_rhogrid, MPI_COMM_WORLD, FFTW_ESTIMATE | FFTW_MPI_TRANSPOSED_OUT); fft_inverse_plan = fftw_mpi_plan_dft_c2r_3d(PMGRID, PMGRID, PMGRID, fft_of_rhogrid, rhogrid, MPI_COMM_WORLD, FFTW_ESTIMATE | FFTW_MPI_TRANSPOSED_IN); #endif for(i = 0; i < dimx * dimy * dimz; i++) workspace[i] = 0; for(i = 0; i < NumPart; i++) { slab_x = to_slab_fac * P[i].Pos[0]; if(slab_x >= PMGRID) slab_x = PMGRID - 1; dx = to_slab_fac * P[i].Pos[0] - slab_x; slab_x -= meshmin[0]; slab_xx = slab_x + 1; slab_y = to_slab_fac * P[i].Pos[1]; if(slab_y >= PMGRID) slab_y = PMGRID - 1; dy = to_slab_fac * P[i].Pos[1] - slab_y; slab_y -= meshmin[1]; slab_yy = slab_y + 1; slab_z = to_slab_fac * P[i].Pos[2]; if(slab_z >= PMGRID) slab_z = PMGRID - 1; dz = to_slab_fac * P[i].Pos[2] - slab_z; slab_z -= meshmin[2]; slab_zz = slab_z + 1; workspace[(slab_x * dimy + slab_y) * dimz + slab_z] += P[i].Mass * (1.0 - dx) * (1.0 - dy) * (1.0 - dz); workspace[(slab_x * dimy + slab_yy) * dimz + slab_z] += P[i].Mass * (1.0 - dx) * dy * (1.0 - dz); workspace[(slab_x * dimy + slab_y) * dimz + slab_zz] += P[i].Mass * (1.0 - dx) * (1.0 - dy) * dz; workspace[(slab_x * dimy + slab_yy) * dimz + slab_zz] += P[i].Mass * (1.0 - dx) * dy * dz; workspace[(slab_xx * dimy + slab_y) * dimz + slab_z] += P[i].Mass * (dx) * (1.0 - dy) * (1.0 - dz); workspace[(slab_xx * dimy + slab_yy) * dimz + slab_z] += P[i].Mass * (dx) * dy * (1.0 - dz); workspace[(slab_xx * dimy + slab_y) * dimz + slab_zz] += P[i].Mass * (dx) * (1.0 - dy) * dz; workspace[(slab_xx * dimy + slab_yy) * dimz + slab_zz] += P[i].Mass * (dx) * dy * dz; } for(i = 0; i < fftsize; i++) /* clear local density field */ rhogrid[i] = 0; for(level = 0; level < (1 << PTask); level++) /* note: for level=0, target is the same task */ { sendTask = ThisTask; recvTask = 
ThisTask ^ level; if(recvTask < NTask) { /* check how much we have to send */ sendmin = 2 * PMGRID; sendmax = -1; for(slab_x = meshmin[0]; slab_x < meshmax[0] + 2; slab_x++) if(slab_to_task[slab_x % PMGRID] == recvTask) { if(slab_x < sendmin) sendmin = slab_x; if(slab_x > sendmax) sendmax = slab_x; } if(sendmax == -1) sendmin = 0; /* check how much we have to receive */ recvmin = 2 * PMGRID; recvmax = -1; for(slab_x = meshmin_list[3 * recvTask]; slab_x < meshmax_list[3 * recvTask] + 2; slab_x++) if(slab_to_task[slab_x % PMGRID] == sendTask) { if(slab_x < recvmin) recvmin = slab_x; if(slab_x > recvmax) recvmax = slab_x; } if(recvmax == -1) recvmin = 0; if((recvmax - recvmin) >= 0 || (sendmax - sendmin) >= 0) /* ok, we have a contribution to the slab */ { recv_dimx = meshmax_list[3 * recvTask + 0] - meshmin_list[3 * recvTask + 0] + 2; recv_dimy = meshmax_list[3 * recvTask + 1] - meshmin_list[3 * recvTask + 1] + 2; recv_dimz = meshmax_list[3 * recvTask + 2] - meshmin_list[3 * recvTask + 2] + 2; if(level > 0) { MPI_Sendrecv(workspace + (sendmin - meshmin[0]) * dimy * dimz, (sendmax - sendmin + 1) * dimy * dimz * sizeof(fftw_real), MPI_BYTE, recvTask, TAG_PERIODIC_C, forcegrid, (recvmax - recvmin + 1) * recv_dimy * recv_dimz * sizeof(fftw_real), MPI_BYTE, recvTask, TAG_PERIODIC_C, MPI_COMM_WORLD, &status); } else { memcpy(forcegrid, workspace + (sendmin - meshmin[0]) * dimy * dimz, (sendmax - sendmin + 1) * dimy * dimz * sizeof(fftw_real)); } for(slab_x = recvmin; slab_x <= recvmax; slab_x++) { slab_xx = (slab_x % PMGRID) - first_slab_of_task[ThisTask]; if(slab_xx >= 0 && slab_xx < slabs_per_task[ThisTask]) { for(slab_y = meshmin_list[3 * recvTask + 1]; slab_y <= meshmax_list[3 * recvTask + 1] + 1; slab_y++) { slab_yy = slab_y; if(slab_yy >= PMGRID) slab_yy -= PMGRID; for(slab_z = meshmin_list[3 * recvTask + 2]; slab_z <= meshmax_list[3 * recvTask + 2] + 1; slab_z++) { slab_zz = slab_z; if(slab_zz >= PMGRID) slab_zz -= PMGRID; rhogrid[PMGRID * PMGRID2 * slab_xx + PMGRID2 * slab_yy + slab_zz] += forcegrid[((slab_x - recvmin) * recv_dimy + (slab_y - meshmin_list[3 * recvTask + 1])) * recv_dimz + (slab_z - meshmin_list[3 * recvTask + 2])]; } } } } } } } /* Do the FFT of the density field */ #ifdef FFTW3 fftw_execute(fft_forward_plan); #else rfftwnd_mpi(fft_forward_plan, 1, rhogrid, workspace, FFTW_TRANSPOSED_ORDER); #endif /* multiply with Green's function for the potential */ for(y = slabstart_y; y < slabstart_y + nslab_y; y++) for(x = 0; x < PMGRID; x++) for(z = 0; z < PMGRID / 2 + 1; z++) { if(x > PMGRID / 2) kx = x - PMGRID; else kx = x; if(y > PMGRID / 2) ky = y - PMGRID; else ky = y; if(z > PMGRID / 2) kz = z - PMGRID; else kz = z; k2 = kx * kx + ky * ky + kz * kz; if(k2 > 0) { smth = -exp(-k2 * asmth2) / k2 * fac; /* do deconvolution */ fx = fy = fz = 1; if(kx != 0) { fx = (M_PI * kx) / PMGRID; fx = sin(fx) / fx; } if(ky != 0) { fy = (M_PI * ky) / PMGRID; fy = sin(fy) / fy; } if(kz != 0) { fz = (M_PI * kz) / PMGRID; fz = sin(fz) / fz; } ff = 1 / (fx * fy * fz); smth *= ff * ff * ff * ff; /* end deconvolution */ ip = PMGRID * (PMGRID / 2 + 1) * (y - slabstart_y) + (PMGRID / 2 + 1) * x + z; c_re(fft_of_rhogrid[ip]) *= smth; c_im(fft_of_rhogrid[ip]) *= smth; } } if(slabstart_y == 0) c_re(fft_of_rhogrid[0]) = c_im(fft_of_rhogrid[0]) = 0.0; /* Do the FFT to get the potential */ #ifdef FFTW3 fftw_execute(fft_inverse_plan); /* Now normalize the output */ for(i = 0; i < fftsize; i++) rhogrid[i] = rhogrid[i] / (PMGRID*PMGRID*PMGRID); #else rfftwnd_mpi(fft_inverse_plan, 1, rhogrid, workspace, 
FFTW_TRANSPOSED_ORDER); #endif /* note: "rhogrid" now contains the potential */ dimx = meshmax[0] - meshmin[0] + 6; dimy = meshmax[1] - meshmin[1] + 6; dimz = meshmax[2] - meshmin[2] + 6; for(level = 0; level < (1 << PTask); level++) /* note: for level=0, target is the same task */ { sendTask = ThisTask; recvTask = ThisTask ^ level; if(recvTask < NTask) { /* check how much we have to send */ sendmin = 2 * PMGRID; sendmax = -PMGRID; for(slab_x = meshmin_list[3 * recvTask] - 2; slab_x < meshmax_list[3 * recvTask] + 4; slab_x++) if(slab_to_task[(slab_x + PMGRID) % PMGRID] == sendTask) { if(slab_x < sendmin) sendmin = slab_x; if(slab_x > sendmax) sendmax = slab_x; } if(sendmax == -PMGRID) sendmin = sendmax + 1; /* check how much we have to receive */ recvmin = 2 * PMGRID; recvmax = -PMGRID; for(slab_x = meshmin[0] - 2; slab_x < meshmax[0] + 4; slab_x++) if(slab_to_task[(slab_x + PMGRID) % PMGRID] == recvTask) { if(slab_x < recvmin) recvmin = slab_x; if(slab_x > recvmax) recvmax = slab_x; } if(recvmax == -PMGRID) recvmin = recvmax + 1; if((recvmax - recvmin) >= 0 || (sendmax - sendmin) >= 0) /* ok, we have a contribution to the slab */ { recv_dimx = meshmax_list[3 * recvTask + 0] - meshmin_list[3 * recvTask + 0] + 6; recv_dimy = meshmax_list[3 * recvTask + 1] - meshmin_list[3 * recvTask + 1] + 6; recv_dimz = meshmax_list[3 * recvTask + 2] - meshmin_list[3 * recvTask + 2] + 6; ncont = 1; cont_sendmin[0] = sendmin; cont_sendmax[0] = sendmax; cont_sendmin[1] = sendmax + 1; cont_sendmax[1] = sendmax; cont_recvmin[0] = recvmin; cont_recvmax[0] = recvmax; cont_recvmin[1] = recvmax + 1; cont_recvmax[1] = recvmax; for(slab_x = sendmin; slab_x <= sendmax; slab_x++) { if(slab_to_task[(slab_x + PMGRID) % PMGRID] != ThisTask) { /* non-contiguous */ cont_sendmax[0] = slab_x - 1; while(slab_to_task[(slab_x + PMGRID) % PMGRID] != ThisTask) slab_x++; cont_sendmin[1] = slab_x; ncont++; } } for(slab_x = recvmin; slab_x <= recvmax; slab_x++) { if(slab_to_task[(slab_x + PMGRID) % PMGRID] != recvTask) { /* non-contiguous */ cont_recvmax[0] = slab_x - 1; while(slab_to_task[(slab_x + PMGRID) % PMGRID] != recvTask) slab_x++; cont_recvmin[1] = slab_x; if(ncont == 1) ncont++; } } for(rep = 0; rep < ncont; rep++) { sendmin = cont_sendmin[rep]; sendmax = cont_sendmax[rep]; recvmin = cont_recvmin[rep]; recvmax = cont_recvmax[rep]; /* prepare what we want to send */ if(sendmax - sendmin >= 0) { for(slab_x = sendmin; slab_x <= sendmax; slab_x++) { slab_xx = ((slab_x + PMGRID) % PMGRID) - first_slab_of_task[ThisTask]; for(slab_y = meshmin_list[3 * recvTask + 1] - 2; slab_y < meshmax_list[3 * recvTask + 1] + 4; slab_y++) { slab_yy = (slab_y + PMGRID) % PMGRID; for(slab_z = meshmin_list[3 * recvTask + 2] - 2; slab_z < meshmax_list[3 * recvTask + 2] + 4; slab_z++) { slab_zz = (slab_z + PMGRID) % PMGRID; forcegrid[((slab_x - sendmin) * recv_dimy + (slab_y - (meshmin_list[3 * recvTask + 1] - 2))) * recv_dimz + slab_z - (meshmin_list[3 * recvTask + 2] - 2)] = rhogrid[PMGRID * PMGRID2 * slab_xx + PMGRID2 * slab_yy + slab_zz]; } } } } if(level > 0) { MPI_Sendrecv(forcegrid, (sendmax - sendmin + 1) * recv_dimy * recv_dimz * sizeof(fftw_real), MPI_BYTE, recvTask, TAG_PERIODIC_D, workspace + (recvmin - (meshmin[0] - 2)) * dimy * dimz, (recvmax - recvmin + 1) * dimy * dimz * sizeof(fftw_real), MPI_BYTE, recvTask, TAG_PERIODIC_D, MPI_COMM_WORLD, &status); } else { memcpy(workspace + (recvmin - (meshmin[0] - 2)) * dimy * dimz, forcegrid, (recvmax - recvmin + 1) * dimy * dimz * sizeof(fftw_real)); } } } } } dimx = meshmax[0] - meshmin[0] + 
2; dimy = meshmax[1] - meshmin[1] + 2; dimz = meshmax[2] - meshmin[2] + 2; recv_dimx = meshmax[0] - meshmin[0] + 6; recv_dimy = meshmax[1] - meshmin[1] + 6; recv_dimz = meshmax[2] - meshmin[2] + 6; for(x = 0; x < meshmax[0] - meshmin[0] + 2; x++) for(y = 0; y < meshmax[1] - meshmin[1] + 2; y++) for(z = 0; z < meshmax[2] - meshmin[2] + 2; z++) { forcegrid[(x * dimy + y) * dimz + z] = workspace[((x + 2) * recv_dimy + (y + 2)) * recv_dimz + (z + 2)]; } /* read out the potential */ for(i = 0; i < NumPart; i++) { slab_x = to_slab_fac * P[i].Pos[0]; if(slab_x >= PMGRID) slab_x = PMGRID - 1; dx = to_slab_fac * P[i].Pos[0] - slab_x; slab_x -= meshmin[0]; slab_xx = slab_x + 1; slab_y = to_slab_fac * P[i].Pos[1]; if(slab_y >= PMGRID) slab_y = PMGRID - 1; dy = to_slab_fac * P[i].Pos[1] - slab_y; slab_y -= meshmin[1]; slab_yy = slab_y + 1; slab_z = to_slab_fac * P[i].Pos[2]; if(slab_z >= PMGRID) slab_z = PMGRID - 1; dz = to_slab_fac * P[i].Pos[2] - slab_z; slab_z -= meshmin[2]; slab_zz = slab_z + 1; P[i].Potential += forcegrid[(slab_x * dimy + slab_y) * dimz + slab_z] * (1.0 - dx) * (1.0 - dy) * (1.0 - dz); P[i].Potential += forcegrid[(slab_x * dimy + slab_yy) * dimz + slab_z] * (1.0 - dx) * dy * (1.0 - dz); P[i].Potential += forcegrid[(slab_x * dimy + slab_y) * dimz + slab_zz] * (1.0 - dx) * (1.0 - dy) * dz; P[i].Potential += forcegrid[(slab_x * dimy + slab_yy) * dimz + slab_zz] * (1.0 - dx) * dy * dz; P[i].Potential += forcegrid[(slab_xx * dimy + slab_y) * dimz + slab_z] * (dx) * (1.0 - dy) * (1.0 - dz); P[i].Potential += forcegrid[(slab_xx * dimy + slab_yy) * dimz + slab_z] * (dx) * dy * (1.0 - dz); P[i].Potential += forcegrid[(slab_xx * dimy + slab_y) * dimz + slab_zz] * (dx) * (1.0 - dy) * dz; P[i].Potential += forcegrid[(slab_xx * dimy + slab_yy) * dimz + slab_zz] * (dx) * dy * dz; } pm_init_periodic_free(); force_treeallocate(All.TreeAllocFactor * All.MaxPart, All.MaxPart); All.NumForcesSinceLastDomainDecomp = 1 + All.TotNumPart * All.TreeDomainUpdateFrequency; if(ThisTask == 0) { printf("done PM-Potential.\n"); fflush(stdout); } }
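/* A minimal, self-contained sketch (not part of the routine above) of the k-space factor
 * that the Green's-function loop applies to each Fourier mode: the smoothed 1/k^2 kernel,
 * -G/(pi*L) * exp(-k^2 asmth^2) / k^2, times the fourth power of the inverse CIC kernel.
 * The parameter names (pmgrid, asmth, boxsize, G) are illustrative stand-ins for
 * PMGRID, All.Asmth[0], All.BoxSize and All.G. */
#include <math.h>

static double pm_greens_factor(int x, int y, int z, int pmgrid,
                               double asmth, double boxsize, double G)
{
  /* map FFT indices to signed wavenumbers (in mesh units) */
  int kx = (x > pmgrid / 2) ? x - pmgrid : x;
  int ky = (y > pmgrid / 2) ? y - pmgrid : y;
  int kz = (z > pmgrid / 2) ? z - pmgrid : z;
  double k2 = (double) (kx * kx + ky * ky + kz * kz);

  if(k2 == 0.0)
    return 0.0;                        /* zero mode: the mean density is removed */

  double asmth2 = 2 * M_PI * asmth / boxsize;
  asmth2 *= asmth2;
  double fac = G / (M_PI * boxsize);   /* converts to potential */
  double smth = -exp(-k2 * asmth2) / k2 * fac;

  /* deconvolve the CIC kernel twice (mass assignment and interpolation) */
  double fx = 1, fy = 1, fz = 1;
  if(kx) { fx = M_PI * kx / pmgrid; fx = sin(fx) / fx; }
  if(ky) { fy = M_PI * ky / pmgrid; fy = sin(fy) / fy; }
  if(kz) { fz = M_PI * kz / pmgrid; fz = sin(fz) / fz; }
  double ff = 1 / (fx * fy * fz);

  return smth * ff * ff * ff * ff;
}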
void MODEL::Boundary() { char text[200]; // initializations --------------------------------------------------------------------- if( !boundList ) { boundList = new ELEM* [region->Getnp()]; if( !boundList ) REPORT::rpt.Error( kMemoryFault, "can not allocate memory - MODEL::Boundary(1)" ); } for( int n=0; n<region->Getnp(); n++ ) { NODE* nd = region->Getnode(n); CF( nd->flag, NODE::kBound ); nd->mark = false; // remove slip boundary conditions --------------------------------------------------- if( isFS(nd->bc.kind, BCON::kAutoSlip) ) { CF( nd->bc.kind, BCON::kAutoSlip | BCON::kFixU | BCON::kFixV ); } if( isFS(nd->bc.kind, BCON::kAutoKD) ) { CF( nd->bc.kind, BCON::kAutoKD | BCON::kFixK | BCON::kFixD ); } } // determine boundary midside nodes ---------------------------------------------------- for( int n=0; n<region->Getnp(); n++ ) { NODE* nd = region->Getnode(n); // midside nodes which are connected to only one element are boundary nodes ---------- if( nd->noel == 1 && isFS(nd->flag, NODE::kMidsNode) ) { if( !isFS(nd->flag, NODE::kInface) ) SF( nd->flag, NODE::kBound ); } } // determine number of boundary elements: nb ------------------------------------------- int nb = 0; for( int n=0; n<region->Getnp(); n++ ) { if( isFS(region->Getnode(n)->flag,NODE::kBound) ) nb++; } // allocate memory for boundary elements ----------------------------------------------- bound->Free(); bound->Alloc( 0, nb ); // set up boundary elements ------------------------------------------------------------ int be = 0; // counter for boundary elements for( int re=0; re<region->Getne(); re++ ) { ELEM* el = region->Getelem(re); if( isFS(el->flag, ELEM::kDry) ) continue; int ncn = el->Getncn(); int nnd = el->Getnnd(); for( int i=ncn; i<nnd; i++ ) { // check, if el->nd[i] is a midside boundary node ---------------------------------- if( isFS(el->nd[i]->flag, NODE::kBound) ) { ELEM* bd = bound->Getelem(be); boundList[el->nd[i]->Getno()] = bd; int left = i - ncn; int rght = (left + 1) % ncn; bd->nd[0] = el->nd[left]; // corner nodes bd->nd[1] = el->nd[rght]; bd->nd[2] = el->nd[i]; // midside node SF( bd->nd[0]->flag, NODE::kBound ); SF( bd->nd[1]->flag, NODE::kBound ); // set shape specifications ------------------------------------------------------ bd->Setshape( kLine ); bd->Setname( el->Getname() ); SF( bd->flag, ELEM::kBound ); bd->type = el->type; bd->areaFact = 1.0; be++; } } } //////////////////////////////////////////////////////////////////////////////////////// // communicate boundary nodes //# ifdef _MPI_DBG // REPORT::rpt.Output( " (MODEL::Boundary) communication of boundary nodes", 1 ); //# endif # ifdef _MPI_ if( subdom->npr > 1 ) { INFACE* inface = subdom->inface; // loop on all interfaces: exchange bound flag --------------------------------------- for( int s=0; s<subdom->npr; s++ ) { MPI_Status status; int npinf = inface[s].np; if( npinf > 0 ) { for( int n=0; n<npinf; n++ ) { NODE* nd = inface[s].node[n]; if( isFS(nd->flag, NODE::kBound) ) inface[s].sia1[n] = true; else inface[s].sia1[n] = false; } MPI_Sendrecv( inface[s].sia1, npinf, MPI_CHAR, s, 1, inface[s].ria1, npinf, MPI_CHAR, s, 1, MPI_COMM_WORLD, &status ); for( int n=0; n<npinf; n++ ) { NODE* nd = inface[s].node[n]; if( inface[s].ria1[n] ) SF( nd->flag, NODE::kBound ); } } } } # endif //////////////////////////////////////////////////////////////////////////////////////// // ------------------------------------------------------------------------------------- // count for newly required boundary conditions // note: (sc, 30.10.2004) // a boundary 
condition is needed for marsh-nodes in case of dry-rewet-method 3 int nbc = 0; for( int n=0; n<region->Getnp(); n++ ) { NODE* nd = region->Getnode(n); if( isFS(nd->flag, NODE::kBound) ) nbc++; } sprintf( text,"\n (MODEL::Boundary) number of boundary elements: %d\n", nb ); REPORT::rpt.Output( text, 3 ); # ifdef kDebug { int pid; MPI_Comm_rank( MPI_COMM_WORLD, &pid ); char fname[40]; sprintf( fname, "bound_%02d.inp", pid+1 ); FILE* id = fopen( fname, "w" ); for( int n=0; n<region->Getnp(); n++ ) { NODE* nd = region->Getnode(n); CF( nd->flag, NODE::kMarker ); } for( int e=0; e<bound->Getne(); e++ ) { ELEM* el = bound->Getelem(e); for( int i=0; i<el->Getnnd(); i++ ) { SF( el->nd[i]->flag, NODE::kMarker ); } } int j = 0; for( int n=0; n<region->Getnp(); n++ ) { NODE* nd = region->Getnode(n); if( isFS(nd->flag, NODE::kMarker) ) j++; } fprintf( id, "%6d %6d 0 0 0\n", j, nb ); for( int n=0; n<region->Getnp(); n++ ) { NODE* nd = region->Getnode(n); if( isFS(nd->flag, NODE::kMarker) ) { fprintf( id, "%6d %17.9le %17.9le %17.9le\n", nd->Getname(), nd->x, nd->y, nd->z ); } } for( int e=0; e<bound->Getne(); e++ ) { ELEM* el = bound->Getelem(e); fprintf( id, "%6d %3d line %6d %6d %6d\n", el->Getname(), TYPE::getid(el->type), el->nd[0]->Getname(), el->nd[1]->Getname(), el->nd[2]->Getname() ); } fclose( id ); } # endif }
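/* A minimal sketch of the interface-flag exchange used in the MPI block above: each rank
 * packs one char per node shared with a neighbouring subdomain, swaps the buffer with
 * that neighbour via MPI_Sendrecv, and ORs the received flags into its own, so a node
 * that is a boundary node on either side ends up flagged on both. The buffer names and
 * the per-interface length n are illustrative stand-ins for inface[s].sia1/ria1/np. */
#include <mpi.h>

static void exchange_bound_flags(char *send_flags, char *recv_flags, int n,
                                 int partner, MPI_Comm comm)
{
  MPI_Status status;

  MPI_Sendrecv(send_flags, n, MPI_CHAR, partner, 1,
               recv_flags, n, MPI_CHAR, partner, 1, comm, &status);

  for(int i = 0; i < n; i++)
    send_flags[i] = (char) (send_flags[i] || recv_flags[i]);
}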
//removes the lowest energy horizontal seam from the image void removeHorizontalSeam() { double energies[3]; double min_energy; int prev_x; int prev_y; // split up work between processes double *my_path_costs; double *my_previous_x; double *my_previous_y; double *temp_path_costs; double *temp_previous_x; double *temp_previous_y; int my_rows = current_height / numprocs; int low_rows = my_rows; int extra_rows = current_height % numprocs; int start; int y_offset; int recv_rows; double top_end_cost, bottom_end_cost, temp_end_cost; if (rank < extra_rows) { my_rows++; start = rank * my_rows; } else { start = (extra_rows * (my_rows + 1)) + ((rank - extra_rows) * my_rows); } my_path_costs = (double *) malloc(my_rows * current_width * sizeof(double)); my_previous_x = (double *) malloc(my_rows * current_width * sizeof(double)); my_previous_y = (double *) malloc(my_rows * current_width * sizeof(double)); //find the lowest cost seam by computing the lowest cost paths to each pixel for (int x = 0; x < current_width; x++) { //compute the path costs for my rows for (int y = start; y < start + my_rows; y++) { if (x == 0) { path_costs[x * initial_height + y] = image_energy[x * initial_height + y]; my_path_costs[(y - start) * current_width + x] = path_costs[x * initial_height + y]; previous_x[x * initial_height + y] = -1; my_previous_x[(y - start) * current_width + x] = previous_x[x * initial_height + y]; previous_y[x * initial_height + y] = -1; my_previous_y[(y - start) * current_width + x] = previous_y[x * initial_height + y]; } else { //the pixel directly left energies[1] = path_costs[(x - 1) * initial_height + y]; //pixel left and above if (y != 0) { energies[0] = path_costs[(x - 1) * initial_height + y - 1]; } else { energies[0] = DBL_MAX; } //pixel left and below if (y != current_height - 1) { energies[2] = path_costs[(x - 1) * initial_height + y + 1]; } else { energies[2] = DBL_MAX; } //find the one with the least path cost min_energy = energies[0]; prev_x = x - 1; prev_y = y - 1; if (energies[1] < min_energy) { min_energy = energies[1]; prev_y = y; } if (energies[2] < min_energy) { min_energy = energies[2]; prev_y = y + 1; } //set the minimum path cost for this pixel path_costs[x * initial_height + y] = min_energy + image_energy[x * initial_height + y]; my_path_costs[(y - start) * current_width + x] = path_costs[x * initial_height + y]; //set the previous pixel on the minimum path's coordinates for this pixel previous_x[x * initial_height + y] = prev_x; my_previous_x[(y - start) * current_width + x] = previous_x[x * initial_height + y]; previous_y[x * initial_height + y] = prev_y; my_previous_y[(y - start) * current_width + x] = previous_y[x * initial_height + y]; } } //send path cost needed to neighboring processes if (numprocs > 1) { if (rank != numprocs - 1) { //send bottom most cost to following process bottom_end_cost = path_costs[x * initial_height + (start + my_rows - 1)]; MPI_Send(&bottom_end_cost, 1, MPI_DOUBLE, rank + 1, 0, MPI_COMM_WORLD); //receive following process's top most cost MPI_Recv(&temp_end_cost, 1, MPI_DOUBLE, rank + 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); path_costs[x * initial_height + (start + my_rows)] = temp_end_cost; } if (rank != 0) { //send top most cost to preceding process top_end_cost = path_costs[x * initial_height + start]; MPI_Send(&top_end_cost, 1, MPI_DOUBLE, rank - 1, 0, MPI_COMM_WORLD); //receive preceding process's bottom most cost MPI_Recv(&temp_end_cost, 1, MPI_DOUBLE, rank - 1, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); path_costs[x * initial_height + (start 
- 1)] = temp_end_cost; } } } //update paths costs for all processes for (int i = 0; i < numprocs; i++) { if (rank == i) { continue; } if (i < extra_rows) { y_offset = i * (low_rows + 1); recv_rows = low_rows + 1; } else { y_offset = (extra_rows * (low_rows + 1)) + ((i - extra_rows) * low_rows); recv_rows = low_rows; } //printf("%d %d\n", low_rows, extra_rows); //printf("%d %d %d\n", rank, y_offset, recv_rows); temp_path_costs = (double *) malloc(recv_rows * current_width * sizeof(double)); temp_previous_x = (double *) malloc(recv_rows * current_width * sizeof(double)); temp_previous_y = (double *) malloc(recv_rows * current_width * sizeof(double)); MPI_Sendrecv(my_path_costs, my_rows * current_width, MPI_DOUBLE, i, 0, temp_path_costs, recv_rows * current_width, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); MPI_Sendrecv(my_previous_x, my_rows * current_width, MPI_DOUBLE, i, 1, temp_previous_x, recv_rows * current_width, MPI_DOUBLE, i, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE); MPI_Sendrecv(my_previous_y, my_rows * current_width, MPI_DOUBLE, i, 2, temp_previous_y, recv_rows * current_width, MPI_DOUBLE, i, 2, MPI_COMM_WORLD, MPI_STATUS_IGNORE); for (int j = 0; j < recv_rows * current_width; j++) { int x = j / recv_rows; int y = y_offset + (j % recv_rows); //printf("%d %d %d %d %d\n", rank, x, y, x * initial_height + y, recv_rows * current_width); //printf("%d\n", initial_height * initial_width); path_costs[x * initial_height + y] = temp_path_costs[(y - y_offset) * current_width + x]; previous_x[x * initial_height + y] = temp_previous_x[(y - y_offset) * current_width + x]; previous_y[x * initial_height + y] = temp_previous_y[(y - y_offset) * current_width + x]; } free(temp_path_costs); free(temp_previous_x); free(temp_previous_y); } free(my_path_costs); free(my_previous_x); free(my_previous_y); //find the y coordinate at which the lowest cost seam starts at the right edge of the current image int y_coord = 0; for (int y = 0; y < current_height; y++) { if (path_costs[(current_width - 1) * initial_height + y] < path_costs[(current_width - 1) * initial_height + y_coord]) { y_coord = y; } } //delete the seam from right to left for (int x = current_width - 1; x >= 0; x--) { //delete this pixel by shifting the rest of the column over it for (int y = y_coord; y < current_height - 1; y++) { image[x * initial_height + y] = image[x * initial_height + y + 1]; } //next pixel y_coord = previous_y[x * initial_height + y_coord]; } //decrease the current height of the image current_height--; }
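/* A sketch of an alternative formulation of the per-column boundary exchange above: the
 * paired MPI_Send/MPI_Recv calls can be expressed as two MPI_Sendrecv operations with
 * MPI_PROC_NULL at the first and last rank, which makes the edge cases implicit and
 * removes any ordering concerns. The names above/below and the cost arguments are
 * illustrative; "above" holds the smaller y indices (rank - 1 in the row decomposition
 * used above), "below" the larger ones. */
#include <mpi.h>

static void exchange_boundary_costs(double my_first_row_cost, double my_last_row_cost,
                                    double *ghost_above, double *ghost_below,
                                    int rank, int numprocs, MPI_Comm comm)
{
  int above = (rank > 0)            ? rank - 1 : MPI_PROC_NULL;
  int below = (rank < numprocs - 1) ? rank + 1 : MPI_PROC_NULL;

  /* send my last row's cost down, receive the ghost value coming from above */
  MPI_Sendrecv(&my_last_row_cost, 1, MPI_DOUBLE, below, 0,
               ghost_above, 1, MPI_DOUBLE, above, 0, comm, MPI_STATUS_IGNORE);

  /* send my first row's cost up, receive the ghost value coming from below */
  MPI_Sendrecv(&my_first_row_cost, 1, MPI_DOUBLE, above, 1,
               ghost_below, 1, MPI_DOUBLE, below, 1, comm, MPI_STATUS_IGNORE);
}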
/** MPI communication routine for exchanging (double-precision) values of particle distribution functions across the boundaries between different MPI processes (or MPI ranks) After calling this function, values in the ghost layers for {f0, f1, ..., f18} get updated using values from neighboring MPI processes */ void exchangePDF (const int nn, // number of ghost cell layers const int Q, // number of LBM streaming directions const int MX, // number of voxels along X in this process const int MY, // number of voxels along Y in this process const int MZ, // number of voxels along Z in this process const int myid, // my process id const MPI_Comm CART_COMM, // Cartesian topology communicator const int nbr_WEST, // process id of my western neighbor const int nbr_EAST, // process id of my eastern neighbor const int nbr_SOUTH, // process id of my southern neighbor const int nbr_NORTH, // process id of my northern neighbor const int nbr_BOTTOM, // process id of my bottom neighbor const int nbr_TOP, // process id of my top neighbor double *PDF4d) // pointer to the 4D array being exchanged (of type double) { MPI_Status status; const int MXP = nn+MX+nn; // padded voxels along X const int MYP = nn+MY+nn; // padded voxels along Y const int MZP = nn+MZ+nn; // padded voxels along Z // regular voxels + voxels in the ghost layer const int PADDED_VOXELS = MXP*MYP*MZP; // allocate a 3D array for storing f(a) // ghost layers are included in this 3D array double *PDF3d = new double[PADDED_VOXELS]; // loop for all PDF directions for (int a = 0; a < Q; a++) { // loop over all voxels in this MPI process, including ghost layers for(int i = 0; i < MXP; i++) { for(int j = 0; j < MYP; j++) { for(int k = 0; k < MZP; k++) { // natural index for fa(i,j,k) in PDF3d int index_3d = i + j*MXP + k*MXP*MYP; // natural index for f(i,j,k,a) in PDF4d int index_4d = a + (index_3d * Q); // PDF3d <---- PDF4d(a) PDF3d[index_3d] = PDF4d[index_4d]; } } } // MPI_Datatype stridex; MPI_Type_vector( (MY+nn+nn)*(MZ+nn+nn), 1, MX+nn+nn, MPI_DOUBLE, &stridex); MPI_Type_commit( &stridex); // MPI_Datatype stridey; MPI_Type_vector( MZ+nn+nn, MX+nn+nn, (MY+nn+nn)*(MX+nn+nn), MPI_DOUBLE, &stridey); MPI_Type_commit( &stridey); // total number of values in a XY plane (contiguous values) // // example layout for the case nn = 1 (1 layer of ghost cells) // // // 0,MY+1 1,MY+1 2,MY+1 3,MY+1 ... MX,MY+1 MX+1,MY+1 // // +----------------------- ----------+ // | | // 0,MY | 1,MY 2,MY 3,MY ... MX,MY | MX+1,MY // | | // // // // // // 0,2 | 2,2 2,2 3,2 ... MX,2 | MX+1,2 // | | // | | // | | // 0,1 | 1,1 2,1 3,1 ... MX,1 | MX+1,1 // | | // +----------------------- ---------+ // // 0,0 1,0 2,0 3,0 ... 
MX,0 MX+1,0 // // int no_xy = MXP*MYP; // we are only exchanging one PDF at a time // loop over the number of ghost layers for(int i = 0; i < nn; i++) { // I am sending PDF3d data to the process nbr_TOP and receiving PDF3d data from the process nbr_BOTTOM { // x x x x x x // send the topmost (non-ghost) layer of data +---------------+ // receive this data into the ghost cell layer S | S S S S | S --- send to nbr_TOP // | | // ^ x | o o o o | x // | Z-axis | | // | x | o o o o | x // | +---------------+ // | R R R R R R --- recv from nbr_BOTTOM // SEND to top int sx = 0; int sy = 0; int sz = nn + (MZ-1) - i; // RECV from bottom int rx = 0; int ry = 0; int rz = (nn - 1) - i; int send = sx + sy * MXP + sz * MXP*MYP; // send the topmost (non-ghost) layer of data int recv = rx + ry * MXP + rz * MXP*MYP; // receive data into the bottom ghost cell layer MPI_Sendrecv(&PDF3d[send], // send buffer (points to the starting address of the data chunk) no_xy, // number of elements to be sent MPI_DOUBLE, // type of elements nbr_TOP, // destination (where the data is going) 111, // tag &PDF3d[recv], // receive buffer (points to the starting address of the data chunk) no_xy, // number of elements received MPI_DOUBLE, // type of elements nbr_BOTTOM, // source (where the data is coming from) 111, // tag CART_COMM, // MPI Communicator used for this Sendrecv &status); // MPI status } // I am sending PDF3d data to the process nbr_BOTTOM and receiving PDF3d data from the process nbr_TOP { // R R R R R R --- recv from nbr_TOP // send the topmost (non-ghost) layer of data +---------------+ // receive this data into the ghost cell layer x | o o o o | x // | | // ^ x | o o o o | x // | Z-axis | | // | S | S S S S | S --- send to nbr_BOTTOM // | +---------------+ // | x x x x x x // SEND to bottom int sx = 0; int sy = 0; int sz = nn + i; // RECV from top int rx = 0; int ry = 0; int rz = nn + MZ + i; int send = sx + sy * MXP + sz * MXP*MYP; // send the bottommost (non-ghost) layer of data int recv = rx + ry * MXP + rz * MXP*MYP; // receive data into the top ghost cell layer MPI_Sendrecv(&PDF3d[send], // send buffer (points to the starting address of the data chunk) no_xy, // number of elements to be sent MPI_DOUBLE, // type of elements nbr_BOTTOM, // destination (where the data is going) 222, // tag &PDF3d[recv], // receive buffer (points to the starting address of the data chunk) no_xy, // number of elements received MPI_DOUBLE, // type of elements nbr_TOP, // source (where the data is coming from) 222, // tag CART_COMM, // MPI Communicator used for this Sendrecv &status); // MPI status } // I am sending PDF3d data to the process nbr_EAST and receiving PDF3d data from process nbr_WEST { // R x x x S x // send the eastmost (non-ghost) layer of data +---------------+ // receive data into the west ghost cell layer R | o o o S | x S --- send to nbr_EAST // | | // R | o o o S | x // --------------> X-axis | | // R | o o o S | x // +---------------+ // R x x x S x R --- recv from nbr_WEST // SEND to east int sx = nn + (MX-1) - i; int sy = 0; int sz = 0; // RECV from west int rx = (nn - 1) - i; int ry = 0; int rz = 0; int send = sx + sy * MXP + sz * MXP*MYP; // send the bottommost (non-ghost) layer of data int recv = rx + ry * MXP + rz * MXP*MYP; // receive data into the top ghost cell layer MPI_Sendrecv(&PDF3d[send], // send buffer (points to the starting address of the data chunk) 1, // number of elements to be sent stridex, // type of elements nbr_EAST, // destination (where the data is going) 333, // tag &PDF3d[recv], // 
receive buffer (points to the starting address of the data chunk) 1, // number of elements received stridex, // type of elements nbr_WEST, // source (where the data is coming from) 333, // tag CART_COMM, // MPI Communicator used for this Sendrecv &status); // MPI status } // I am sending PDF3d data to the process nbr_WEST and receiving PDF3d data from process nbr_EAST { // x S x x x R // send the westmost (non-ghost) layer of data +---------------+ // receive data into the east ghost cell layer x | S o o o | R S --- send to nbr_WEST // | | // x | S o o o | R // --------------> X-axis | | // x | S o o o | R // +---------------+ // x S x x x R R --- recv from nbr_EAST // SEND to west int sx = nn + i; int sy = 0; int sz = 0; // RECV from east int rx = nn + MX + i; int ry = 0; int rz = 0; int send = sx + sy * MXP + sz * MXP*MYP; // send the bottommost (non-ghost) layer of data int recv = rx + ry * MXP + rz * MXP*MYP; // receive data into the top ghost cell layer MPI_Sendrecv(&PDF3d[send], // send buffer (points to the starting address of the data chunk) 1, // number of elements to be sent stridex, // type of elements nbr_WEST, // destination (where the data is going) 444, // tag &PDF3d[recv], // receive buffer (points to the starting address of the data chunk) 1, // number of elements received stridex, // type of elements nbr_EAST, // source (where the data is coming from) 444, // tag CART_COMM, // MPI Communicator used for this Sendrecv &status); // MPI status } // I am sending PDF3d data to the process nbr_NORTH and receiving PDF3d data from process nbr_SOUTH { // R x x x S x // send the northmost (non-ghost) layer of data +---------------+ // receive data into the south ghost cell layer R | o o o S | x S --- send to nbr_NORTH // | | // R | o o o S | x // --------------> Y-axis | | // R | o o o S | x // +---------------+ // R x x x S x R --- recv from nbr_SOUTH // SEND to north int sx = 0; int sy = nn + (MY-1) - i; int sz = 0; // RECV from south int rx = 0; int ry = (nn - 1) - i; int rz = 0; int send = sx + sy * MXP + sz * MXP*MYP; // send the bottommost (non-ghost) layer of data int recv = rx + ry * MXP + rz * MXP*MYP; // receive data into the top ghost cell layer MPI_Sendrecv(&PDF3d[send], // send buffer (points to the starting address of the data chunk) 1, // number of elements to be sent stridey, // type of elements nbr_NORTH, // destination (where the data is going) 555, // tag &PDF3d[recv], // receive buffer (points to the starting address of the data chunk) 1, // number of elements received stridey, // type of elements nbr_SOUTH, // source (where the data is coming from) 555, // tag CART_COMM, // MPI Communicator used for this Sendrecv &status); // MPI status } // I am sending PDF3d data to the process nbr_SOUTH and receiving PDF3d data from process nbr_NORTH { // x S x x x R // send the southmost (non-ghost) layer of data +---------------+ // receive data into the north ghost cell layer x | S o o o | R S --- send to nbr_SOUTH // | | // x | S o o o | R // --------------> Y-axis | | // x | S o o o | R // +---------------+ // x S x x x R R --- recv from nbr_NORTH // SEND to south int sx = 0; int sy = nn + i; int sz = 0; // RECV from north int rx = 0; int ry = nn + MY + i; int rz = 0; int send = sx + sy * MXP + sz * MXP*MYP; // send the bottommost (non-ghost) layer of data int recv = rx + ry * MXP + rz * MXP*MYP; // receive data into the top ghost cell layer MPI_Sendrecv(&PDF3d[send], // send buffer (points to the starting address of the data chunk) 1, // number of elements to be sent 
stridey, // type of elements nbr_SOUTH, // destination (where the data is going) 666, // tag &PDF3d[recv], // receive buffer (points to the starting address of the data chunk) 1, // number of elements received stridey, // type of elements nbr_NORTH, // source (where the data is coming from) 666, // tag CART_COMM, // MPI Communicator used for this Sendrecv &status); // MPI status } } // end for loop over the number of ghost layers // loop over all voxels in this MPI process, including ghost layers for(int i = 0; i < MXP; i++) { for(int j = 0; j < MYP; j++) { for(int k = 0; k < MZP; k++) { // natural index for fa(i,j,k) in PDF3d int index_3d = i + j*MXP + k*MXP*MYP; // natural index for f(i,j,k,a) in PDF4d int index_4d = a + (index_3d * Q); // PDF4d <---- PDF3d(a) PDF4d[index_4d] = PDF3d[index_3d]; } } } // cleanup MPI_Type_free(&stridex); MPI_Type_free(&stridey); } // end loop for PDF directions // free memory for the temporary 3D array delete [] PDF3d; }
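/* A sketch of the two strided MPI datatypes built inside the loop above, for given padded
 * dimensions MXP = nn+MX+nn, MYP = nn+MY+nn, MZP = nn+MZ+nn. stridex selects one YZ-plane
 * (one double every MXP doubles), stridey selects one XZ-plane (MXP contiguous doubles
 * every MXP*MYP doubles). Since the types depend only on the padded dimensions, they
 * could also be committed once per call, outside the loop over the Q directions, instead
 * of being created and freed for every direction a. */
#include <mpi.h>

static void make_halo_types(int MXP, int MYP, int MZP,
                            MPI_Datatype *stridex, MPI_Datatype *stridey)
{
  /* YZ-plane: MYP*MZP blocks of 1 double, stride MXP (x is the fastest index) */
  MPI_Type_vector(MYP * MZP, 1, MXP, MPI_DOUBLE, stridex);
  MPI_Type_commit(stridex);

  /* XZ-plane: MZP blocks of MXP contiguous doubles, stride MXP*MYP */
  MPI_Type_vector(MZP, MXP, MXP * MYP, MPI_DOUBLE, stridey);
  MPI_Type_commit(stridey);
}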
/* ********************************************************************* */ int AL_Exchange_periods (void *vbuf, int *periods, int sz_ptr) /*! * Same as AL_Exchange, but exchanges ghost cells across the * physical domain boundary in the dim direction only if * periods[dim] = 1. If a dimension is not periodic and * periods[dim] = 1, nothing changes. * * \param [in] vbuf pointer to buffer * \param [in] periods array of flags, one per dimension: 1 exchanges the * physical boundary in that direction, 0 skips it * \param [in] sz_ptr integer pointer to the distributed array descriptor *********************************************************************** */ { char *buf; register int nd; int myrank, nproc; int ndim, gp, nleft, nright, tag1, tag2; int sendb, recvb; MPI_Datatype itype; MPI_Comm comm; MPI_Status status; SZ *s; int is_beg[3], is_end[3]; buf = (char *) vbuf; /* -- DIAGNOSTICS Check that sz_ptr points to an allocated SZ -- */ if( stack_ptr[sz_ptr] == AL_STACK_FREE){ printf("AL_Exchange_periods: wrong SZ pointer\n"); } s = sz_stack[sz_ptr]; myrank = s->rank; nproc = s->size; comm = s->comm; ndim = s->ndim; AL_Is_boundary (sz_ptr, is_beg, is_end); for(nd=0;nd<ndim;nd++){ gp = s->bg[nd]; /* If gp=0, do nothing */ if( gp > 0 ){ nleft = s->left[nd]; nright = s->right[nd]; if (is_beg[nd] && periods[nd] == 0) nleft = MPI_PROC_NULL; if (is_end[nd] && periods[nd] == 0) nright = MPI_PROC_NULL; itype = s->type_rl[nd]; tag1 = s->tag1[nd]; sendb = s->sendb1[nd]; recvb = s->recvb1[nd]; MPI_Sendrecv(&buf[sendb], 1, itype, nleft, tag1, &buf[recvb], 1, itype, nright,tag1, comm, &status); nleft = s->left[nd]; nright = s->right[nd]; if (is_beg[nd] && periods[nd] == 0) nleft = MPI_PROC_NULL; if (is_end[nd] && periods[nd] == 0) nright = MPI_PROC_NULL; itype = s->type_lr[nd]; tag2 = s->tag2[nd]; sendb = s->sendb2[nd]; recvb = s->recvb2[nd]; MPI_Sendrecv(&buf[sendb], 1, itype, nright, tag2, &buf[recvb], 1, itype, nleft,tag2, comm, &status); } } /* DIAGNOSTICS */ #ifdef DEBUG printf("AL_Exchange_periods: filled ghost regions\n"); #endif return (int) AL_SUCCESS; }
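/* A minimal 1-D sketch (not ArrayLib code) of the pattern used above: a ghost-cell
 * exchange in which a non-periodic physical boundary is handled simply by passing
 * MPI_PROC_NULL to MPI_Sendrecv, which turns that side of the call into a no-op. The
 * array layout [left ghost | n interior cells | right ghost] and the tag values are
 * illustrative. */
#include <mpi.h>

static void exchange_ghosts_1d(double *u, int n, int periodic, MPI_Comm comm)
{
  int rank, size;
  MPI_Status status;

  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &size);

  int left  = (rank > 0)        ? rank - 1 : (periodic ? size - 1 : MPI_PROC_NULL);
  int right = (rank < size - 1) ? rank + 1 : (periodic ? 0        : MPI_PROC_NULL);

  /* shift to the right: send my last interior cell, fill my left ghost cell */
  MPI_Sendrecv(&u[n], 1, MPI_DOUBLE, right, 100,
               &u[0], 1, MPI_DOUBLE, left,  100, comm, &status);

  /* shift to the left: send my first interior cell, fill my right ghost cell */
  MPI_Sendrecv(&u[1],     1, MPI_DOUBLE, left,  101,
               &u[n + 1], 1, MPI_DOUBLE, right, 101, comm, &status);
}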
void cs_find_hot_neighbours(void) { MyFloat *Left, *Right; int nimport; int i, j, n, ndone_flag, dummy; int ndone, ntot, npleft; int iter = 0; int ngrp, sendTask, recvTask; int place, nexport; double dmax1, dmax2; double xhyd, yhel, ne, mu, energy, temp; double a3inv; if(All.ComovingIntegrationOn) a3inv = 1 / (All.Time * All.Time * All.Time); else a3inv = 1; /* allocate buffers to arrange communication */ Left = (MyFloat *) mymalloc(NumPart * sizeof(MyFloat)); Right = (MyFloat *) mymalloc(NumPart * sizeof(MyFloat)); Ngblist = (int *) mymalloc(NumPart * sizeof(int)); All.BunchSize = (int) ((All.BufferSize * 1024 * 1024) / (sizeof(struct data_index) + sizeof(struct data_nodelist) + sizeof(struct hotngbs_in) + sizeof(struct hotngbs_out) + sizemax(sizeof(struct hotngbs_in), sizeof(struct hotngbs_out)))); DataIndexTable = (struct data_index *) mymalloc(All.BunchSize * sizeof(struct data_index)); DataNodeList = (struct data_nodelist *) mymalloc(All.BunchSize * sizeof(struct data_nodelist)); CPU_Step[CPU_MISC] += measure_time(); for(n = FirstActiveParticle; n >= 0; n = NextActiveParticle[n]) { if(P[n].Type == 0) { /* select reservoir and cold phase particles */ if(P[n].EnergySN > 0 && SphP[n].d.Density * a3inv > All.PhysDensThresh * All.DensFrac_Phase) { xhyd = P[n].Zm[6] / P[n].Mass; yhel = (1 - xhyd) / (4. * xhyd); ne = SphP[n].Ne; mu = (1 + 4 * yhel) / (1 + yhel + ne); energy = SphP[n].Entropy * P[n].Mass / GAMMA_MINUS1 * pow(SphP[n].d.Density * a3inv, GAMMA_MINUS1); /* Total energy */ temp = GAMMA_MINUS1 / BOLTZMANN * energy / P[n].Mass * PROTONMASS * mu; temp *= All.UnitEnergy_in_cgs / All.UnitMass_in_g; /* Temperature in Kelvin */ if(temp < All.Tcrit_Phase) { Left[n] = Right[n] = 0; if(!(SphP[n].HotHsml > 0.)) SphP[n].HotHsml = All.InitialHotHsmlFactor * PPP[n].Hsml; /* Estimation of HotHsml : ONLY first step */ P[n].Type = 10; /* temporarily mark particles of interest with this number */ } } } } /* we will repeat the whole thing for those particles where we didn't find enough neighbours */ do { i = FirstActiveParticle; /* begin with this index */ do { for(j = 0; j < NTask; j++) { Send_count[j] = 0; Exportflag[j] = -1; } /* do local particles and prepare export list */ for(nexport = 0; i >= 0; i = NextActiveParticle[i]) if(P[i].Type == 10 && P[i].TimeBin >= 0) { if(cs_hotngbs_evaluate(i, 0, &nexport, Send_count) < 0) break; } #ifdef MYSORT mysort_dataindex(DataIndexTable, nexport, sizeof(struct data_index), data_index_compare); #else qsort(DataIndexTable, nexport, sizeof(struct data_index), data_index_compare); #endif MPI_Allgather(Send_count, NTask, MPI_INT, Sendcount_matrix, NTask, MPI_INT, MPI_COMM_WORLD); for(j = 0, nimport = 0, Recv_offset[0] = 0, Send_offset[0] = 0; j < NTask; j++) { Recv_count[j] = Sendcount_matrix[j * NTask + ThisTask]; nimport += Recv_count[j]; if(j > 0) { Send_offset[j] = Send_offset[j - 1] + Send_count[j - 1]; Recv_offset[j] = Recv_offset[j - 1] + Recv_count[j - 1]; } } HotNgbsGet = (struct hotngbs_in *) mymalloc(nimport * sizeof(struct hotngbs_in)); HotNgbsIn = (struct hotngbs_in *) mymalloc(nexport * sizeof(struct hotngbs_in)); /* prepare particle data for export */ for(j = 0; j < nexport; j++) { place = DataIndexTable[j].Index; HotNgbsIn[j].Pos[0] = P[place].Pos[0]; HotNgbsIn[j].Pos[1] = P[place].Pos[1]; HotNgbsIn[j].Pos[2] = P[place].Pos[2]; HotNgbsIn[j].HotHsml = SphP[place].HotHsml; HotNgbsIn[j].Entropy = SphP[place].Entropy; memcpy(HotNgbsIn[j].NodeList, DataNodeList[DataIndexTable[j].IndexGet].NodeList, NODELISTLENGTH * sizeof(int)); } for(ngrp = 1; 
ngrp < (1 << PTask); ngrp++) { sendTask = ThisTask; recvTask = ThisTask ^ ngrp; if(recvTask < NTask) { if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0) { /* get the particles */ MPI_Sendrecv(&HotNgbsIn[Send_offset[recvTask]], Send_count[recvTask] * sizeof(struct hotngbs_in), MPI_BYTE, recvTask, TAG_DENS_A, &HotNgbsGet[Recv_offset[recvTask]], Recv_count[recvTask] * sizeof(struct hotngbs_in), MPI_BYTE, recvTask, TAG_DENS_A, MPI_COMM_WORLD, MPI_STATUS_IGNORE); } } } myfree(HotNgbsIn); HotNgbsResult = (struct hotngbs_out *) mymalloc(nimport * sizeof(struct hotngbs_out)); HotNgbsOut = (struct hotngbs_out *) mymalloc(nexport * sizeof(struct hotngbs_out)); /* now do the particles that need to be exported */ for(j = 0; j < nimport; j++) cs_hotngbs_evaluate(j, 1, &dummy, &dummy); if(i < 0) ndone_flag = 1; else ndone_flag = 0; MPI_Allreduce(&ndone_flag, &ndone, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); /* get the result */ for(ngrp = 1; ngrp < (1 << PTask); ngrp++) { sendTask = ThisTask; recvTask = ThisTask ^ ngrp; if(recvTask < NTask) { if(Send_count[recvTask] > 0 || Recv_count[recvTask] > 0) { /* send the results */ MPI_Sendrecv(&HotNgbsResult[Recv_offset[recvTask]], Recv_count[recvTask] * sizeof(struct hotngbs_out), MPI_BYTE, recvTask, TAG_DENS_B, &HotNgbsOut[Send_offset[recvTask]], Send_count[recvTask] * sizeof(struct hotngbs_out), MPI_BYTE, recvTask, TAG_DENS_B, MPI_COMM_WORLD, MPI_STATUS_IGNORE); } } } /* add the result to the local particles */ for(j = 0; j < nexport; j++) { place = DataIndexTable[j].Index; SphP[place].da.dDensityAvg += HotNgbsOut[j].DensitySum; SphP[place].ea.dEntropyAvg += HotNgbsOut[j].EntropySum; SphP[place].HotNgbNum += HotNgbsOut[j].HotNgbNum; } myfree(HotNgbsOut); myfree(HotNgbsResult); myfree(HotNgbsGet); } while(ndone < NTask); /* do final operations on results */ for(i = FirstActiveParticle, npleft = 0; i >= 0; i = NextActiveParticle[i]) { if(P[i].Type == 10 && P[i].TimeBin >= 0) { #ifdef FLTROUNDOFFREDUCTION SphP[i].da.DensityAvg = FLT(SphP[i].da.dDensityAvg); SphP[i].ea.EntropyAvg = FLT(SphP[i].ea.dEntropyAvg); #endif if(SphP[i].HotNgbNum > 0) { SphP[i].da.DensityAvg /= SphP[i].HotNgbNum; SphP[i].ea.EntropyAvg /= SphP[i].HotNgbNum; } else { SphP[i].da.DensityAvg = 0; SphP[i].ea.EntropyAvg = 0; } /* now check whether we had enough neighbours */ if(SphP[i].HotNgbNum < (All.DesNumNgb - All.MaxNumHotNgbDeviation) || (SphP[i].HotNgbNum > (All.DesNumNgb + All.MaxNumHotNgbDeviation))) { /* need to redo this particle */ npleft++; if(Left[i] > 0 && Right[i] > 0) if((Right[i] - Left[i]) < 1.0e-3 * Left[i]) { /* this one should be ok */ npleft--; P[i].TimeBin = -P[i].TimeBin - 1; /* Mark as inactive */ continue; } if(SphP[i].HotNgbNum < (All.DesNumNgb - All.MaxNumHotNgbDeviation)) Left[i] = DMAX(SphP[i].HotHsml, Left[i]); else { if(Right[i] != 0) { if(SphP[i].HotHsml < Right[i]) Right[i] = SphP[i].HotHsml; } else Right[i] = SphP[i].HotHsml; } if(Left[i] > All.MaxHotHsmlParam * PPP[i].Hsml) /* prevent us from searching too far */ { npleft--; P[i].TimeBin = -P[i].TimeBin - 1; /* Mark as inactive */ /* Ad-hoc definition of SAvg and RhoAvg when there are no hot neighbours */ /* Note that a minimum number of hot neighbours is required for promotion, see c_enrichment.c */ if(SphP[i].HotNgbNum == 0) { SphP[i].da.DensityAvg = SphP[i].d.Density / 100; SphP[i].ea.EntropyAvg = SphP[i].Entropy * 1000; printf("WARNING: Used ad-hoc values for SAvg and RhoAvg, No hot neighbours\n"); } continue; } if(iter >= MAXITER_HOT - 10) { printf ("i=%d task=%d ID=%d Hsml=%g Left=%g Right=%g 
Ngbs=%g Right-Left=%g\n pos=(%g|%g|%g)\n", i, ThisTask, P[i].ID, SphP[i].HotHsml, Left[i], Right[i], (float) SphP[i].HotNgbNum, Right[i] - Left[i], P[i].Pos[0], P[i].Pos[1], P[i].Pos[2]); fflush(stdout); } if(Right[i] > 0 && Left[i] > 0) SphP[i].HotHsml = pow(0.5 * (pow(Left[i], 3) + pow(Right[i], 3)), 1.0 / 3); else { if(Right[i] == 0 && Left[i] == 0) endrun(8188); /* can't occur */ if(Right[i] == 0 && Left[i] > 0) SphP[i].HotHsml *= 1.26; if(Right[i] > 0 && Left[i] == 0) SphP[i].HotHsml /= 1.26; } } else P[i].TimeBin = -P[i].TimeBin - 1; /* Mark as inactive */ } } MPI_Allreduce(&npleft, &ntot, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); if(ntot > 0) { iter++; if(iter > 0 && ThisTask == 0) { printf("hotngb iteration %d: need to repeat for %d particles.\n", iter, ntot); fflush(stdout); } if(iter > MAXITER_HOT) { printf("failed to converge in hot-neighbour iteration\n"); fflush(stdout); endrun(1155); } } } while(ntot > 0); myfree(DataNodeList); myfree(DataIndexTable); myfree(Ngblist); myfree(Right); myfree(Left); for(i = FirstActiveParticle; i >= 0; i = NextActiveParticle[i]) if(P[i].Type == 10) { P[i].Type = 0; /* mark as active again */ if(P[i].TimeBin < 0) P[i].TimeBin = -P[i].TimeBin - 1; } CPU_Step[CPU_HOTNGBS] += measure_time(); }
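/* A standalone restatement (not GADGET code) of the search-radius update used in the
 * neighbour iteration above: once both a lower bound (too few neighbours) and an upper
 * bound (too many) are known, the new radius bisects the enclosed volume; with only one
 * bound, the radius is grown or shrunk by 2^(1/3), i.e. a factor of two in volume. */
#include <math.h>

static double update_search_radius(double h, double left, double right)
{
  if(left > 0 && right > 0)
    return pow(0.5 * (left * left * left + right * right * right), 1.0 / 3);

  if(right > 0)      /* only an upper bound so far: shrink */
    return h / 1.26;

  if(left > 0)       /* only a lower bound so far: grow */
    return h * 1.26;

  return h;          /* no information yet */
}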
static void Zoltan_RB_Gather( ZOLTAN_GNO_TYPE *send, /* input/output array */ ZOLTAN_GNO_TYPE *tmp_send, /* temporary array */ int proclower, /* smallest numbered processor in partition */ int rank, /* processor number within partition */ int nprocs, /* number of processors in this partition */ MPI_Comm comm /* MPI Communicator */ ) { int tag = 32100; /* message tag */ int partner; /* message partner in binary exchange */ int to; /* message partner not in binary exchange */ int mask; /* mask to determine communication partner */ int nprocs_small; /* largest power of 2 contained in nprocs */ int hbit; /* 2^hbit = nproc_small */ int len; /* message length */ int i; /* loop counter */ MPI_Status status; MPI_Datatype zoltan_gno_mpi_type; zoltan_gno_mpi_type = Zoltan_mpi_gno_type(); /* This routine sums a vector of integers on a subset of processors */ len = 3*nprocs; /* Find next lower power of 2. */ for (hbit = 0; (nprocs >> hbit) != 1; hbit++); nprocs_small = 1 << hbit; if (nprocs_small * 2 == nprocs) { nprocs_small *= 2; hbit++; } to = proclower + (rank ^ nprocs_small); if (rank & nprocs_small) { /* processors greater than largest power of 2 */ MPI_Send(send, len, zoltan_gno_mpi_type, to, tag, comm); tag += hbit + 1; MPI_Recv(send, len, zoltan_gno_mpi_type, to, tag, comm, &status); } else { /* processors within greatest power of 2 */ if (rank + nprocs_small < nprocs) { MPI_Recv(tmp_send, len, zoltan_gno_mpi_type, to, tag, comm, &status); for (i = 0; i < len; i++) send[i] += tmp_send[i]; } for (mask = nprocs_small >> 1; mask; mask >>= 1) { /* binary exchange */ tag++; partner = proclower + (rank ^ mask); /* Change requested by Qingyu Meng <*****@*****.**> to */ /* support mvapich 1.0 on TACC Ranger. */ /* MPI_Send(send, len, zoltan_gno_mpi_type, partner, tag, comm); */ /* MPI_Recv(tmp_send, len, zoltan_gno_mpi_type, partner, tag, comm, &status); */ MPI_Sendrecv(send, len, zoltan_gno_mpi_type, partner, tag, tmp_send, len, zoltan_gno_mpi_type, partner, tag, comm, &status); for (i = 0; i < len; i++) send[i] += tmp_send[i]; } tag++; if (rank + nprocs_small < nprocs) MPI_Send(send, len, zoltan_gno_mpi_type, to, tag, comm); } }
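/* A sketch of the binary-exchange (recursive-doubling) sum that the mask loop above
 * performs, for the simple case where the number of processes is an exact power of two.
 * Each of the log2(P) rounds pairs ranks that differ in one bit of their rank id and adds
 * the partner's vector, so every rank ends with the global sum. MPI_LONG_LONG is used
 * here in place of the run-time type returned by Zoltan_mpi_gno_type(). */
#include <mpi.h>

static void hypercube_sum(long long *vec, long long *tmp, int len, MPI_Comm comm)
{
  int rank, size, mask, i;
  MPI_Status status;

  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &size);        /* assumed to be a power of two */

  for(mask = 1; mask < size; mask <<= 1) {
    int partner = rank ^ mask;

    MPI_Sendrecv(vec, len, MPI_LONG_LONG, partner, 0,
                 tmp, len, MPI_LONG_LONG, partner, 0, comm, &status);

    for(i = 0; i < len; i++)
      vec[i] += tmp[i];
  }
}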
void IMB_sendrecv(struct comm_info* c_info, int size, struct iter_schedule* ITERATIONS, MODES RUN_MODE, double* time) /* MPI-1 benchmark kernel Benchmarks MPI_Sendrecv Input variables: -c_info (type struct comm_info*) Collection of all base data for MPI; see [1] for more information -size (type int) Basic message size in bytes -ITERATIONS (type struct iter_schedule *) Repetition scheduling -RUN_MODE (type MODES) (only MPI-2 case: see [1]) Output variables: -time (type double*) Timing result per sample */ { double t1,t2; int i; Type_Size s_size, r_size; int s_num,r_num; int s_tag, r_tag; int dest, source; MPI_Status stat; #ifdef CHECK defect=0; #endif ierr = 0; /* GET SIZE OF DATA TYPE's in s_size and r_size */ MPI_Type_size(c_info->s_data_type,&s_size); MPI_Type_size(c_info->r_data_type,&r_size); if ((s_size!=0) && (r_size!=0)) { s_num=size/s_size; r_num=size/r_size; } s_tag = 1; r_tag = MPI_ANY_TAG; if(c_info->rank!=-1) { /* CALCULATE SOURCE AND DESTINATION */ dest = (c_info->rank + 1) % (c_info->num_procs); source = (c_info->rank + c_info->num_procs-1) % (c_info->num_procs); for(i=0; i<N_BARR; i++) MPI_Barrier(c_info->communicator); t1 = MPI_Wtime(); for(i=0;i< ITERATIONS->n_sample;i++) { ierr= MPI_Sendrecv((char*)c_info->s_buffer+i%ITERATIONS->s_cache_iter*ITERATIONS->s_offs, s_num,c_info->s_data_type, dest,s_tag, (char*)c_info->r_buffer+i%ITERATIONS->r_cache_iter*ITERATIONS->r_offs, r_num,c_info->r_data_type,source,r_tag, c_info->communicator,&stat); MPI_ERRHAND(ierr); CHK_DIFF("Sendrecv",c_info,(char*)c_info->r_buffer+i%ITERATIONS->r_cache_iter*ITERATIONS->r_offs, 0, size, size, asize, put, 0, ITERATIONS->n_sample, i, source, &defect); } t2 = MPI_Wtime(); *time=(t2 - t1)/ITERATIONS->n_sample; } else { *time = 0.; } }
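/* A stripped-down, standalone sketch of the ring pattern timed by the kernel above: every
 * rank sends to (rank+1) % P while receiving from (rank-1+P) % P, and the reported value
 * is the wall-clock time per repetition. The buffer/cache handling and error checking of
 * the real benchmark are omitted; nbytes and nrep are arbitrary inputs. */
#include <mpi.h>
#include <stdlib.h>

static double time_ring_sendrecv(int nbytes, int nrep, MPI_Comm comm)
{
  int rank, size, i;
  MPI_Status status;

  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &size);

  size_t len = (nbytes > 0) ? (size_t) nbytes : 1;
  char *sbuf = (char *) calloc(len, 1);
  char *rbuf = (char *) calloc(len, 1);

  int dest   = (rank + 1) % size;
  int source = (rank + size - 1) % size;

  MPI_Barrier(comm);
  double t1 = MPI_Wtime();

  for(i = 0; i < nrep; i++)
    MPI_Sendrecv(sbuf, nbytes, MPI_CHAR, dest, 1,
                 rbuf, nbytes, MPI_CHAR, source, MPI_ANY_TAG, comm, &status);

  double t = (MPI_Wtime() - t1) / nrep;

  free(sbuf);
  free(rbuf);
  return t;
}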