void grid_mpole_work(Particle* particles[XDiv][YDiv][ZDiv])
{
    double *fmp, *thetai1, *thetai2, *thetai3;
    int *igrid;
    int x, y, z, i;

    /* The static PME set-up information only has to be received once. */
    static int first_pme = 1;
    if (first_pme) {
        recieve_single_info();
        first_pme = 0;
    }

    fmp     = malloc(10 * npole * sizeof(double));
    igrid   = malloc(10 * n * sizeof(int));
    thetai1 = malloc(4 * bsorder * n * sizeof(double));
    thetai2 = malloc(4 * bsorder * n * sizeof(double));
    thetai3 = malloc(4 * bsorder * n * sizeof(double));
    /* Start from a zeroed charge grid so the spreading can accumulate into
     * it. */
    double *qgrid = calloc(2 * nfft1 * nfft2 * nfft3, sizeof(double));

    /* Fetch the per-step multipoles, grid anchors and B-spline weights. */
    recieve_step_info(fmp, igrid, thetai1, thetai2, thetai3);

    /* Spread the multipoles of every particle owned by this node onto the
     * local charge grid. */
    for (x = node_boundries[my_rank][0]; x <= node_boundries[my_rank][1]; ++x)
        for (y = node_boundries[my_rank][2]; y <= node_boundries[my_rank][3]; ++y)
            for (z = node_boundries[my_rank][4]; z <= node_boundries[my_rank][5]; ++z)
                for (i = 0; i < block_size; ++i)
                    mpole_math(fmp, igrid, qgrid, thetai1, thetai2, thetai3,
                            bsorder, nfft1, nfft2, nfft3, npole, n,
                            particles[x][y][z][i].index);

    /* Hand the partial grid back for accumulation. */
    send_grid(qgrid);

    free(fmp);
    free(igrid);
    free(thetai1);
    free(thetai2);
    free(thetai3);
    free(qgrid);
}
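/*
 * Illustrative sketch: send_grid() and the master-side accumulation are not
 * shown in this listing.  One plausible way to combine the partial charge
 * grids produced by grid_mpole_work() on every worker is an element-wise
 * reduction onto a single rank, as below.  The helper name reduce_qgrid and
 * the choice of rank 0 as the destination are assumptions, not part of the
 * original code.
 */
#include <mpi.h>

void reduce_qgrid(const double *local_qgrid, double *global_qgrid,
                  int nfft1, int nfft2, int nfft3, MPI_Comm comm)
{
    /* Sum the 2 * nfft1 * nfft2 * nfft3 interleaved (real, imaginary) grid
     * entries of every rank into global_qgrid on rank 0. */
    MPI_Reduce(local_qgrid, global_qgrid, 2 * nfft1 * nfft2 * nfft3,
               MPI_DOUBLE, MPI_SUM, 0, comm);
}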
int main(int argc, char *argv[])
{
    MPI_Comm comm = MPI_COMM_WORLD;   /* Communicator. */
    MPI_Datatype pparams_mpi;         /* Contains all the parameters. */
    MPI_Status time_sts;
    int nnodes = 5;                   /* Total number of nodes. */
    int gsize[2] = {0};               /* Grid size. */
    int periods[2] = {false, false};
    int rank = 0;
    int coord[2];

    /* We are interested in the diffusion process in two directions. */
    const size_t dims = 2;

    int status;
    int offset[2];
    size_t grains[2];
    int coord_lneigh[2];
    int coord_rneigh[2];
    int coord_uneigh[2];
    int coord_dneigh[2];
    int rank_lneigh;
    int rank_rneigh;
    int rank_uneigh;
    int rank_dneigh;
    MPI_Status xdown_status;
    MPI_Status xup_status;
    MPI_Status yleft_status;
    MPI_Status yright_status;

    double time_start_comm = 0;
    double time_start_comp = 0;
    double time_start_init = 0;
    double time_end_init = 0;
    double time_end_comm = 0;
    double time_end_comp = 0;
    double time_start_total = 0;
    double time_end_total = 0;
    double time_recv_buf;

    size_t yend;
    size_t ystart;

#ifndef NO_SSE
    grid_simd_type sse_ratio;
    grid_simd_type sse_ratio1;
    grid_simd_type curr_grid;
    grid_simd_type currr_grid;
    grid_simd_type currl_grid;
    grid_simd_type curru_grid;
    grid_simd_type currd_grid;
    grid_simd_type ngrid_sse;
#endif /* NO_SSE */

    grid_type **grid = NULL;
    grid_type **ngrid = NULL;
    grid_type *xdown = NULL;
    grid_type *xup = NULL;
    grid_type ratio;
    grid_type ratio1;

    /* Arguments. */
    pparams params;

    FILE *profilefile = NULL;
    FILE *statusfile = NULL;

    size_t i;
    size_t x, y;
#ifndef NO_SSE
    size_t y_qdl;
    size_t y_qdl_r;
#endif /* NO_SSE */
    long time = 0;

    MPI_Init(&argc, &argv);
    time_start_init = MPI_Wtime();
    time_start_total = MPI_Wtime();
    MPI_Comm_rank(comm, &rank);

    /* Parse the parameters. The function only parses parameters if this
     * processor has rank zero. */
    if ((status = getparams(argc, argv, &params, &profilefile, &statusfile,
                    &pparams_mpi, rank)) != EX_OK)
        MPI_Abort(comm, status);

    /* Send all the parameters to the remaining nodes in the comm. */
    MPI_Bcast(&params, 1, pparams_mpi, 0, MPI_COMM_WORLD);

    /* Determine the number of nodes in the communicator. */
    MPI_Comm_size(comm, &nnodes);

    /* Check whether the number of nodes matches the requested decomposition
     * of params.l columns by params.h rows. */
    if (rank == 0 && nnodes / params.l != params.h) {
        usage();
    }

    /* The grid points should divide evenly over the nodes. */
    if (rank == 0 && params.ntotal % (params.l * params.h) != 0) {
        usage();
    }

    /* Compute the grid form. */
    gsize[X_COORD] = params.l;
    gsize[Y_COORD] = params.h;

    /* Create a Cartesian grid topology and query this node's place in it. */
    if (MPI_Cart_create(comm, dims, gsize, periods, true, &comm) != MPI_SUCCESS)
        MPI_Abort(comm, EX_UNAVAILABLE);

    /* Translate the current rank to the coordinate in the Cartesian
     * topology. */
    MPI_Cart_coords(comm, rank, dims, coord);

    /* Using the coordinate of the current node we can determine the number
     * of points this node has to compute and the offset of those points. */
    for (i = 0; i < dims; i++) {
        grains[i] = params.ntotal / gsize[i] +
            (params.ntotal % gsize[i] + gsize[i] - coord[i] - 1) / gsize[i];

        if (grains[i] > (size_t)params.ntotal / gsize[i])
            offset[i] = (params.ntotal / gsize[i] + 1) * coord[i];
        else
            offset[i] = params.ntotal / gsize[i] * coord[i] +
                params.ntotal % gsize[i];
    }

    /* With the current dimensions the arrays which represent the grid can be
     * allocated. Two more entries per dimension are used to store the
     * neighbouring points:
     *
     *     | | | | | | | |
     *     | | | | | | | |
     *     | | | | | | | |
     *     | | | | | | | |
     *     | | | | | | | |
     *     0 1 2 . . .  grains[x]  grains[x] + 1
     */
    if ((grid = calloc(grains[X_COORD] + 2, sizeof(grid_type *))) == NULL ||
            (ngrid = calloc(grains[X_COORD] + 2, sizeof(grid_type *))) == NULL)
        MPI_Abort(comm, EX_OSERR);

    for (i = 0; i < grains[X_COORD] + 2; i++)
        if ((grid[i] = calloc(grains[Y_COORD] + 2, sizeof(grid_type))) == NULL ||
                (ngrid[i] = calloc(grains[Y_COORD] + 2, sizeof(grid_type))) == NULL)
            MPI_Abort(comm, EX_OSERR);

    /* Create temporary storage to prevent iterating through the entire
     * grid. */
    if ((xdown = calloc(grains[X_COORD], sizeof(grid_type))) == NULL ||
            (xup = calloc(grains[X_COORD], sizeof(grid_type))) == NULL)
        MPI_Abort(comm, EX_OSERR);

    /* The explicit scheme is stable only while 4 * dt * D / dx^2 <= 1, so
     * clamp the ratio at 1/4. */
    if ((params.dt * params.D * 4 / (params.dx * params.dx)) > 1)
        ratio = 1.0 / 4.0;
    else
        ratio = params.dt * params.D / (params.dx * params.dx);

    ratio1 = 1.0 - 4.0 * ratio;

#ifndef NO_SSE
# ifdef DOUBLE
    sse_ratio = _mm_set1_pd(ratio);
    /* This variable is used to reduce the number of computations in the
     * finite difference scheme. */
    sse_ratio1 = _mm_set1_pd(1.0 - 4.0 * ratio);
# else
    sse_ratio = _mm_set_ps1(ratio);
    sse_ratio1 = _mm_set_ps1(1.0 - 4.0 * ratio);
# endif /* DOUBLE */
#endif /* NO_SSE */

    /* All the coordinates are translated to ranks by first computing the
     * coordinates of the appropriate neighbours. Then the coordinates are
     * used to determine the ranks, which are used for the communication. */
    coord_lneigh[X_COORD] = (coord[X_COORD] + gsize[X_COORD] - 1) % gsize[X_COORD];
    coord_rneigh[X_COORD] = (coord[X_COORD] + 1) % gsize[X_COORD];
    coord_lneigh[Y_COORD] = coord[Y_COORD];
    coord_rneigh[Y_COORD] = coord[Y_COORD];

    coord_dneigh[Y_COORD] = (coord[Y_COORD] + gsize[Y_COORD] - 1) % gsize[Y_COORD];
    coord_uneigh[Y_COORD] = (coord[Y_COORD] + 1) % gsize[Y_COORD];
    coord_dneigh[X_COORD] = coord[X_COORD];
    coord_uneigh[X_COORD] = coord[X_COORD];

    MPI_Cart_rank(comm, coord_lneigh, &rank_lneigh);
    MPI_Cart_rank(comm, coord_rneigh, &rank_rneigh);
    MPI_Cart_rank(comm, coord_dneigh, &rank_dneigh);
    MPI_Cart_rank(comm, coord_uneigh, &rank_uneigh);

    /* Compute by how much the loop iterators have to be adjusted. */
    yend = 1;
    ystart = 1;
    if (coord[Y_COORD] == (gsize[Y_COORD] - 1)) {
        yend--;
        /* Fix the topmost row of the physical domain to 1 (boundary
         * condition). */
        for (x = 1; x < grains[X_COORD] + 1; ++x)
            grid[x][grains[Y_COORD]] = 1;
    }

    if (grains[Y_COORD] - yend - ystart < 1)
        MPI_Abort(MPI_COMM_WORLD, EX_USAGE);

    if (coord[Y_COORD] == 0)
        ystart++;

#ifndef NO_SSE
    /* Compute the loop count and remainder for the SSE instructions. */
    y_qdl = (grains[Y_COORD] - ystart + yend) / SIMD_CAPACITY;
    y_qdl_r = (grains[Y_COORD] - ystart + yend) % SIMD_CAPACITY;
#endif /* NO_SSE */

    time_end_init = MPI_Wtime() - time_start_init;

    for (time = 0; time < params.ttotal; time++) {
        /* Copy the boundary columns into contiguous buffers so each can be
         * sent as a single message. */
        for (i = 0; i < grains[X_COORD]; i++) {
            xup[i] = grid[i + 1][grains[Y_COORD]];
            xdown[i] = grid[i + 1][1];
        }

        time_start_comm = MPI_Wtime();
        MPI_Send((void *)xdown, grains[X_COORD], MPI_GRID_TYPE, rank_dneigh,
                X_DOWN_TAG, comm);
        MPI_Send((void *)xup, grains[X_COORD], MPI_GRID_TYPE, rank_uneigh,
                X_UP_TAG, comm);

        MPI_Recv((void *)xdown, grains[X_COORD], MPI_GRID_TYPE, rank_dneigh,
                X_UP_TAG, comm, &xdown_status);
        MPI_Recv((void *)xup, grains[X_COORD], MPI_GRID_TYPE, rank_uneigh,
                X_DOWN_TAG, comm, &xup_status);
        time_end_comm += MPI_Wtime() - time_start_comm;

        /* The freshly received xup and xdown have to be put in the grid. */
        for (i = 0; i < grains[X_COORD]; i++) {
            grid[i + 1][grains[Y_COORD] + 1] = xup[i];
            grid[i + 1][0] = xdown[i];
        }

        time_start_comm = MPI_Wtime();
        MPI_Send((void *)(grid[grains[X_COORD]] + 1), grains[Y_COORD],
                MPI_GRID_TYPE, rank_rneigh, Y_RIGHT_TAG, comm);
        MPI_Send((void *)(grid[1] + 1), grains[Y_COORD], MPI_GRID_TYPE,
                rank_lneigh, Y_LEFT_TAG, comm);

        MPI_Recv((void *)(grid[0] + 1), grains[Y_COORD], MPI_GRID_TYPE,
                rank_lneigh, Y_RIGHT_TAG, comm, &yright_status);
        MPI_Recv((void *)(grid[grains[X_COORD] + 1] + 1), grains[Y_COORD],
                MPI_GRID_TYPE, rank_rneigh, Y_LEFT_TAG, comm, &yleft_status);
        time_end_comm += MPI_Wtime() - time_start_comm;

        /* Do a non blocking send of the current grid for printing. */
        if ((time % params.freq) == 0)
            send_grid(grid, grains, offset, rank, comm, &time_end_comm,
                    PRINT_COMM);

        time_start_comp = MPI_Wtime();
        /* Process the grid per column. */
        for (x = 1; x < grains[X_COORD] + 1; x++) {
#ifdef NO_SSE
            for (y = ystart; y < grains[Y_COORD] + yend; y++) {
                /* Do the finite difference computation. */
                ngrid[x][y] = ratio * (grid[x][y + 1] + grid[x][y - 1] +
                        grid[x + 1][y] + grid[x - 1][y]) +
                        ratio1 * grid[x][y];
            }
#else
            for (i = 0, y = ystart; i < y_qdl; ++i, y += SIMD_CAPACITY) {
# ifdef DOUBLE
                /* Load all the necessary values into SSE2 variables. */
                /* r1 = (x, y + 1)
                 * r0 = (x, y) */
                curr_grid = _mm_loadu_pd(grid[x] + y);
                /* rr1 = (x + 1, y + 1)
                 * rr0 = (x + 1, y) */
                currr_grid = _mm_loadu_pd(grid[x + 1] + y);
                /* rl1 = (x - 1, y + 1)
                 * rl0 = (x - 1, y) */
                currl_grid = _mm_loadu_pd(grid[x - 1] + y);
                /* ru1 = (x, y + 2)
                 * ru0 = (x, y + 1) */
                curru_grid = _mm_loadu_pd(grid[x] + y + 1);
                /* rd1 = (x, y)
                 * rd0 = (x, y - 1) */
                currd_grid = _mm_loadu_pd(grid[x] + y - 1);

                /* Perform the arithmetic in an order which should reduce
                 * the number of bubbles in the processor pipeline. */
                /* rr = rr + rl */
                currr_grid = _mm_add_pd(currr_grid, currl_grid);
                /* ru = ru + rd */
                curru_grid = _mm_add_pd(curru_grid, currd_grid);
                /* nr = (1 - 4 * ratio) * r */
                ngrid_sse = _mm_mul_pd(curr_grid, sse_ratio1);
                /* rr = rr + ru */
                currr_grid = _mm_add_pd(currr_grid, curru_grid);
                /* rr *= ratio */
                currr_grid = _mm_mul_pd(currr_grid, sse_ratio);
                /* nr += rr */
                ngrid_sse = _mm_add_pd(currr_grid, ngrid_sse);
                /* (x, y + 1) = nr1
                 * (x, y) = nr0 */
                _mm_storeu_pd(ngrid[x] + y, ngrid_sse);
# else
                /* Load all the necessary values into SSE variables. */
                /* r3 = (x, y + 3)
                 * r2 = (x, y + 2)
                 * r1 = (x, y + 1)
                 * r0 = (x, y) */
                curr_grid = _mm_loadu_ps(grid[x] + y);
                currr_grid = _mm_loadu_ps(grid[x + 1] + y);
                currl_grid = _mm_loadu_ps(grid[x - 1] + y);
                curru_grid = _mm_loadu_ps(grid[x] + y + 1);
                currd_grid = _mm_loadu_ps(grid[x] + y - 1);

                /* Perform the arithmetic in an order which should reduce
                 * the number of bubbles in the processor pipeline. */
                currr_grid = _mm_add_ps(currr_grid, currl_grid);
                curru_grid = _mm_add_ps(curru_grid, currd_grid);
                ngrid_sse = _mm_mul_ps(curr_grid, sse_ratio1);
                currr_grid = _mm_add_ps(currr_grid, curru_grid);
                currr_grid = _mm_mul_ps(currr_grid, sse_ratio);
                ngrid_sse = _mm_add_ps(currr_grid, ngrid_sse);
                _mm_storeu_ps(ngrid[x] + y, ngrid_sse);
# endif /* DOUBLE */
            }

            /* Compute the points that remain after the vectorised loop. */
            for (i = 0; i < y_qdl_r; ++i) {
                ngrid[x][y] = ratio * (grid[x][y + 1] + grid[x][y - 1] +
                        grid[x + 1][y] + grid[x - 1][y]) +
                        ratio1 * grid[x][y];
                y++;
            }
#endif /* NO_SSE */
        }
        time_end_comp += MPI_Wtime() - time_start_comp;

        if (time % params.freq == 0)
            recv_grid(grid, grains, offset, time, rank, nnodes, comm,
                    &time_end_comm, PRINT_COMM, &print_elem,
                    (void *)profilefile);

        /* Copy the new grid to the current grid. Use the previously computed
         * y-offsets to determine where copying should start. */
        for (x = 1; x < grains[X_COORD] + 1; ++x)
            memcpy((void *)(grid[x] + ystart), (void *)(ngrid[x] + ystart),
                    (grains[Y_COORD] - (ystart - yend)) * sizeof(grid_type));

        /* Ensure that all the processes are at the same point. */
        MPI_Barrier(comm);
    }

    /* Free the memory used for the grid. */
    for (i = 0; i < grains[X_COORD] + 2; i++) {
        free(grid[i]);
        free(ngrid[i]);
    }
    free(grid);
    free(ngrid);
    free(xup);
    free(xdown);

    if (rank != 0) {
        MPI_Send(&time_end_comm, 1, MPI_DOUBLE, 0, TIME_COMM_TAG, MPI_COMM_WORLD);
        MPI_Send(&time_end_comp, 1, MPI_DOUBLE, 0, TIME_COMP_TAG, MPI_COMM_WORLD);
        MPI_Send(&time_end_init, 1, MPI_DOUBLE, 0, TIME_INIT_TAG, MPI_COMM_WORLD);
    }

    /* Gather all the information on the running time. */
    if (rank == 0) {
        for (i = 1; i < (size_t)nnodes; ++i) {
            MPI_Recv(&time_recv_buf, 1, MPI_DOUBLE, (int)i, TIME_COMM_TAG,
                    MPI_COMM_WORLD, &time_sts);
            time_end_comm += time_recv_buf;
            MPI_Recv(&time_recv_buf, 1, MPI_DOUBLE, (int)i, TIME_COMP_TAG,
                    MPI_COMM_WORLD, &time_sts);
            time_end_comp += time_recv_buf;
            MPI_Recv(&time_recv_buf, 1, MPI_DOUBLE, (int)i, TIME_INIT_TAG,
                    MPI_COMM_WORLD, &time_sts);
            time_end_init += time_recv_buf;
        }
        if (statusfile != NULL) {
            time_end_total = MPI_Wtime() - time_start_total;
            fprintf(statusfile, "%s %i %i %i %zu %lf %lf %lf %lf %li\n",
                    argv[0], nnodes, gsize[X_COORD], gsize[Y_COORD],
                    sizeof(grid_type), time_end_total, time_end_comp,
                    time_end_init, time_end_comm, time);
            fclose(statusfile);
        }
        fclose(profilefile);
    }

    MPI_Finalize();
    return EX_OK;
}
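/*
 * Illustrative sketch: the block decomposition used in main() assigns
 * ntotal points to gsize ranks per dimension with
 *     grains = ntotal / gsize + (ntotal % gsize + gsize - coord - 1) / gsize,
 * which gives the first (ntotal % gsize) ranks one extra point.  The small
 * stand-alone program below is not part of the solver; it only prints the
 * grains and offsets for ntotal = 10 and gsize = 4, namely
 * grains = {3, 3, 2, 2} and offset = {0, 3, 6, 8}.
 */
#include <stdio.h>

int main(void)
{
    const int ntotal = 10;   /* points along one dimension */
    const int gsize  = 4;    /* ranks along that dimension */

    for (int coord = 0; coord < gsize; coord++) {
        /* Same expressions as in the solver's decomposition loop. */
        int grains = ntotal / gsize +
            (ntotal % gsize + gsize - coord - 1) / gsize;
        int offset;

        if (grains > ntotal / gsize)
            offset = (ntotal / gsize + 1) * coord;
        else
            offset = ntotal / gsize * coord + ntotal % gsize;

        printf("coord %d: grains = %d, offset = %d\n", coord, grains, offset);
    }
    return 0;
}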