Example #1
/* Spread the multipoles of the particles in this node's blocks of the spatial
 * decomposition onto the PME charge grid, then send the partial grid on. */
void grid_mpole_work(Particle* particles[XDiv][YDiv][ZDiv]){

    double *fmp, *thetai1, *thetai2, *thetai3;
    int *igrid;

    /* Receive the data that only has to be transferred once per run on the
     * first call only. */
    static int first_pme = 1;
    if (first_pme){
        recieve_single_info();
        first_pme = 0;
    }
    
    /* Per-step buffers: fractional multipoles, grid indices for each site and
     * the B-spline coefficients along the three grid dimensions. */
    fmp = malloc(10 * npole * sizeof(double));
    igrid = malloc(10 * n * sizeof(int));
    thetai1 = malloc(4 * bsorder * n * sizeof(double));
    thetai2 = malloc(4 * bsorder * n * sizeof(double));
    thetai3 = malloc(4 * bsorder * n * sizeof(double));
    
    /* The charge grid itself, stored as interleaved real/imaginary values. */
    double* qgrid = malloc(2 * nfft1 * nfft2 * nfft3 * sizeof(double));
    
    recieve_step_info(fmp, igrid, thetai1, thetai2, thetai3);

    int x, y, z, i;
    /* Loop over the blocks of the spatial decomposition that belong to this
     * rank and spread each particle's multipole onto the charge grid. */
    for (x = node_boundries[my_rank][0]; x <= node_boundries[my_rank][1]; ++x) {
        for (y = node_boundries[my_rank][2]; y <= node_boundries[my_rank][3]; ++y) {
            for (z = node_boundries[my_rank][4]; z <= node_boundries[my_rank][5]; ++z) {
                for (i = 0; i < block_size; ++i){
                    mpole_math(fmp, igrid, qgrid, thetai1, thetai2, thetai3,
                               bsorder, nfft1, nfft2, nfft3, npole, n,
                               particles[x][y][z][i].index);
                }
            }
        }
    }
    send_grid(qgrid);
    free(fmp); free(igrid); free(thetai1); free(thetai2); free(thetai3); free(qgrid);
}
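
One plausible sketch of the communication step that Example #1 leaves undefined: if the partial charge grids computed by the individual ranks are meant to be summed onto a single rank, send_grid() could be built on MPI_Reduce as below. The name send_grid_sketch, its parameter list and the choice of MPI_Reduce are illustrative assumptions, not part of the original code.

#include <mpi.h>

/* Hypothetical sketch: element-wise sum of every rank's partial charge grid
 * into qgrid_total on the root rank. qgrid_total is only significant at the
 * root and may be NULL on all other ranks. */
static void send_grid_sketch(double *qgrid_local, double *qgrid_total,
                             int nfft1, int nfft2, int nfft3,
                             int root, MPI_Comm comm)
{
    MPI_Reduce(qgrid_local, qgrid_total, 2 * nfft1 * nfft2 * nfft3,
               MPI_DOUBLE, MPI_SUM, root, comm);
}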
Example #2
int 
main(int argc, char *argv[])
{
   MPI_Comm comm = MPI_COMM_WORLD;  /* Communicator. */
   MPI_Datatype pparams_mpi;         /* MPI datatype describing the parameter struct. */
   MPI_Status   time_sts;
   int nnodes = 5;                  /* Total number of nodes. */
   int gsize[2] = {0};               /* Grid size. */
   int periods[2] = {false, false};
   int rank = 0;
   int coord[2];
   /* We are interested in the diffusion process in two directions. */
   const size_t dims = 2;
   int status;
   int offset[2];
   size_t grains[2];
   int coord_lneigh[2];
   int coord_rneigh[2];
   int coord_uneigh[2];
   int coord_dneigh[2];
   int rank_lneigh;
   int rank_rneigh;
   int rank_uneigh;
   int rank_dneigh;
   MPI_Status xdown_status;
   MPI_Status xup_status;
   MPI_Status yleft_status;
   MPI_Status yright_status;

   double time_start_comm = 0;
   double time_start_comp = 0;
   double time_start_init = 0;
   double time_end_init = 0;
   double time_end_comm = 0;
   double time_end_comp = 0;
   double time_start_total = 0;
   double time_end_total = 0;
   double time_recv_buf;

   size_t yend;
   size_t ystart;

#ifndef NO_SSE
   grid_simd_type   sse_ratio;
   grid_simd_type   sse_ratio1;
   grid_simd_type   curr_grid;
   grid_simd_type   currr_grid;
   grid_simd_type   currl_grid;
   grid_simd_type   curru_grid;
   grid_simd_type   currd_grid;
   grid_simd_type   ngrid_sse;
#endif /* NO_SSE */

   grid_type **grid = NULL;
   grid_type **ngrid = NULL;
   grid_type *xdown = NULL;
   grid_type *xup = NULL;

   grid_type ratio;
   grid_type ratio1;

   /* Arguments. */
   pparams params;

   FILE   *profilefile = NULL;
   FILE   *statusfile = NULL;

   size_t i;
   size_t x, y;
#ifndef NO_SSE
   size_t y_qdl;
   size_t y_qdl_r;
#endif /* NO_SSE */
   long time = 0;

   MPI_Init(&argc, &argv);
   time_start_init = MPI_Wtime();
   time_start_total = MPI_Wtime();
   MPI_Comm_rank(comm, &rank);
   /* Parse the parameters. The function only parses parameters if this
    * processor has rank zero. */
   if ((status = getparams(argc, argv, &params, &profilefile, &statusfile,
               &pparams_mpi, rank)) != EX_OK)
      MPI_Abort(comm, status);

   /* Send all the parameters to the remaining nodes in the comm. */
   MPI_Bcast(&params, 1, pparams_mpi, 0, MPI_COMM_WORLD);

   /* Determine the number of nodes in the communicator. */
   MPI_Comm_size(comm, &nnodes);

   /* Check whether the number of nodes matches the requested two dimensional
    * process grid of params.l by params.h nodes. */
   if (rank == 0 && nnodes / params.l != params.h) {
      usage(); 
   }
   /* Every node should get the same number of grid points. */
   if (rank == 0 && params.ntotal % (params.l * params.h) != 0) {
      usage();
   }
   /* Compute the grid form. */
   gsize[X_COORD] = params.l;
   gsize[Y_COORD] = params.h;

   /* Create a Cartesian grid topology and store the new communicator in comm. */
   if (MPI_Cart_create(comm, dims, gsize, periods, true, &comm) != 
         MPI_SUCCESS) 
      MPI_Abort(comm, EX_UNAVAILABLE);

   /* Translate the current rank to the coordinate in the Cartesian 
    * topology. */
   MPI_Cart_coords(comm, rank, dims, coord);


   /* Using the coordinates of the current node, determine the number of
    * points this node has to compute and the offset of those points. */
   for (i = 0; i < dims; i++) {
      grains[i] = params.ntotal / gsize[i] + (params.ntotal % gsize[i] +
            gsize[i] - coord[i] - 1) / gsize[i];

      if (grains[i] > (size_t)params.ntotal / gsize[i])
         offset[i] = (params.ntotal / gsize[i] + 1) * coord[i];
      else
         offset[i] = params.ntotal / gsize[i] * coord[i] + params.ntotal % gsize[i];
   }

   /* Now that the grain sizes are known, the arrays which represent the grid
    * can be allocated. Two extra entries per dimension are used to store the
    * neighbouring (halo) points.
    *
    * Grids are composed as follows:
    *
    * |  |  |  |   |   |       |            |
    * |  |  |  |   |   |       |            |
    * |  |  |  |   |   |       |            |
    * |  |  |  |   |   |       |            |
    * |  |  |  |   |   |       |            |
    * 0  1  2  ..  ..  ..  grains[x]    grains[x] + 1
    * |  |  |  |   |   |       |            |
    * |  |  |  |   |   |       |            |
    * |  |  |  |   |   |       |            |
    * |  |  |  |   |   |       |            |
    * |  |  |  |   |   |       |            |
    *
    */
   if ((grid = calloc(grains[X_COORD] + 2, sizeof(grid_type *))) == NULL ||
         (ngrid = calloc(grains[X_COORD] + 2, sizeof(grid_type *))) == NULL)
      MPI_Abort(comm, EX_OSERR);

   for (i = 0; i < grains[X_COORD] + 2; i++)
      if ((grid[i] = calloc(grains[Y_COORD] + 2, sizeof(grid_type))) == NULL ||
            (ngrid[i] = calloc(grains[Y_COORD] + 2, sizeof(grid_type))) == NULL)
         MPI_Abort(comm, EX_OSERR);

   /* Create contiguous temporary buffers for the boundary rows that are
    * exchanged with the neighbouring nodes. */
   if ((xdown = calloc(grains[X_COORD], sizeof(grid_type))) == NULL ||
         (xup = calloc(grains[X_COORD], sizeof(grid_type))) == NULL)
      MPI_Abort(comm, EX_OSERR);

   /* The scheme u_new = ratio * (sum of the four neighbours) + (1 - 4 * ratio) * u
    * with ratio = D * dt / dx^2 is only stable for ratio <= 1/4, so clamp the
    * ratio to that limit. */
   ratio = params.dt * params.D / (params.dx * params.dx);
   if (ratio > 1.0 / 4.0)
      ratio = 1.0 / 4.0;
   ratio1 = 1.0 - 4.0 * ratio;
#ifndef NO_SSE
#   ifdef DOUBLE
   sse_ratio = _mm_set1_pd(ratio);
   /* This variable is used to reduce the number of computations when computing
    * the finite difference scheme. */
   sse_ratio1 = _mm_set1_pd(1.0 - 4.0 * ratio);
#   else
   sse_ratio = _mm_set_ps1(ratio);
   sse_ratio1 = _mm_set_ps1(1.0 - 4.0 * ratio);
#   endif /* DOUBLE */
#endif /* NO_SSE */

   /* All the coordinates are translated to ranks by first computing the
    * coordinate of the appropriate neighbours. Then the coordinates are
    * used to determine the rank. These ranks can be used for the
    * communication. */
   coord_lneigh[X_COORD] = (coord[X_COORD] + gsize[X_COORD] - 1) % gsize[X_COORD];
   coord_rneigh[X_COORD] = (coord[X_COORD] + 1) % gsize[X_COORD];
   coord_lneigh[Y_COORD] = coord[Y_COORD];
   coord_rneigh[Y_COORD] = coord[Y_COORD];

   coord_dneigh[Y_COORD] = (coord[Y_COORD] + gsize[Y_COORD] - 1) % gsize[Y_COORD];
   coord_uneigh[Y_COORD] = (coord[Y_COORD] + 1) % gsize[Y_COORD];
   coord_dneigh[X_COORD] = coord[X_COORD];
   coord_uneigh[X_COORD] = coord[X_COORD];

   MPI_Cart_rank(comm, coord_lneigh, &rank_lneigh);
   MPI_Cart_rank(comm, coord_rneigh, &rank_rneigh);
   MPI_Cart_rank(comm, coord_dneigh, &rank_dneigh);
   MPI_Cart_rank(comm, coord_uneigh, &rank_uneigh);

   /* Compute by how much the loop iterators have to be adjusted. */
   yend = 1;
   ystart = 1;
   if (coord[Y_COORD] == (gsize[Y_COORD] - 1)) {
      yend--;
      for (x = 1; x < grains[X_COORD] + 1; ++x) 
         grid[x][grains[Y_COORD]] = 1;
   } 

   /* At least one row must be left to compute; compare without subtracting to
    * avoid wrap-around of the unsigned operands. */
   if (grains[Y_COORD] < ystart + yend + 1)
      MPI_Abort(MPI_COMM_WORLD, EX_USAGE);

   if (coord[Y_COORD] == 0)
      ystart++;

#ifndef NO_SSE
   /* Compute the loop start and end for the SSE instructions. */
   y_qdl =  (grains[Y_COORD] - ystart + yend) / SIMD_CAPACITY;
   y_qdl_r = (grains[Y_COORD] - ystart + yend) % SIMD_CAPACITY;
#endif /* NO_SSE */
   time_end_init = MPI_Wtime() - time_start_init;

   for (time = 0; time < params.ttotal; time++)
   {
      /* Pack the first and last interior rows into contiguous buffers so they
       * can be exchanged with the neighbouring nodes. */
      for (i = 0; i < grains[X_COORD]; i++) {
         xup[i] = grid[i + 1][grains[Y_COORD]];
         xdown[i] = grid[i + 1][1];
      }

      time_start_comm = MPI_Wtime();
      MPI_Send((void *)xdown, grains[X_COORD], MPI_GRID_TYPE, rank_dneigh, X_DOWN_TAG, comm);
      MPI_Send((void *)xup, grains[X_COORD], MPI_GRID_TYPE, rank_uneigh, X_UP_TAG, comm);

      MPI_Recv((void *)xdown, grains[X_COORD], MPI_GRID_TYPE, rank_dneigh,
            X_UP_TAG, comm, &xdown_status);
      MPI_Recv((void *)xup, grains[X_COORD], MPI_GRID_TYPE, rank_uneigh,
            X_DOWN_TAG, comm, &xup_status);

      time_end_comm += MPI_Wtime() - time_start_comm;

      /* The freshly received xup and xdown have to be put in the grid. */
      for (i = 0; i < grains[X_COORD]; i++) {
         grid[i + 1][grains[Y_COORD] + 1] = xup[i];
         grid[i + 1][0] = xdown[i];
      }

      time_start_comm = MPI_Wtime();
      MPI_Send((void *)(grid[grains[X_COORD]] + 1), grains[Y_COORD], MPI_GRID_TYPE,
            rank_rneigh, Y_RIGHT_TAG, comm);
      MPI_Send((void *)(grid[1] + 1), grains[Y_COORD], MPI_GRID_TYPE,
            rank_lneigh, Y_LEFT_TAG, comm);
      MPI_Recv((void *)(grid[0] + 1), grains[Y_COORD], MPI_GRID_TYPE,
            rank_lneigh, Y_RIGHT_TAG, comm, &yright_status);
      MPI_Recv((void *)(grid[grains[X_COORD] + 1] + 1), grains[Y_COORD], MPI_GRID_TYPE,
            rank_rneigh, Y_LEFT_TAG, comm, &yleft_status);

      time_end_comm += MPI_Wtime() - time_start_comm;

      /* Do a non-blocking send of the current grid for printing. */
      if ((time % params.freq) == 0) 
         send_grid(grid, grains, offset, rank, comm, &time_end_comm, PRINT_COMM);

      time_start_comp = MPI_Wtime();
      /* Process the grid per column. */
      for (x = 1; x < grains[X_COORD] + 1; x++) {
#ifdef NO_SSE
         for (y = ystart; y < grains[Y_COORD] + yend; y++) {
            /* Do the finite difference computation. */
            ngrid[x][y] = ratio * (grid[x][y + 1] + grid[x][y - 1]
                  + grid[x + 1][y] + grid[x - 1][y]) 
               + ratio1 *  grid[x][y];
         }
#else
         for (i = 0, y = ystart; i < y_qdl; ++i, y += SIMD_CAPACITY) {
#   ifdef DOUBLE
            /* Load all the necessary values into SSE2 variables. */
            /* r1 = (x, y + 1)
             * r0 = (x, y)
             */
            curr_grid = _mm_loadu_pd(grid[x] + y);
            /* rr1 = (x + 1, y + 1)
             * rr0 = (x + 1, y) */
            currr_grid = _mm_loadu_pd(grid[x + 1] + y);
            /* rl1 = (x - 1, y + 1)
             * rl0 = (x - 1, y) */
            currl_grid = _mm_loadu_pd(grid[x - 1] + y);
            /* ru1 = (x, y + 2)
             * ru0 = (x, y + 1) */
            curru_grid = _mm_loadu_pd(grid[x] + y + 1);
            /* rd1 = (x, y)
             * rd0 = (x, y - 1) */
            currd_grid = _mm_loadu_pd(grid[x] + y - 1);

            /* Perform arithmetic in an order which should reduce the number of
             * bubbles in the processor pipeline. */
            /* rr = rr + rl */
            currr_grid = _mm_add_pd(currr_grid, currl_grid);
            /* ru = ru + rd */
            curru_grid = _mm_add_pd(curru_grid, currd_grid);
            /* nr = (1 - 4 * ratio) * r */
            ngrid_sse = _mm_mul_pd(curr_grid, sse_ratio1);
            /* rr = rr + ru */
            currr_grid = _mm_add_pd(currr_grid, curru_grid);
            /* rr = rr * ratio */
            currr_grid = _mm_mul_pd(currr_grid, sse_ratio);
            /* nr += rr */
            ngrid_sse = _mm_add_pd(currr_grid, ngrid_sse);
            /* (x, y + 1) = nr1
             * (x, y) = nr0 */
            _mm_storeu_pd(ngrid[x] + y, ngrid_sse);
#   else
            /* Load all the necessary values into SSE variables. */
            /* r3 = (x, y + 3)
             * r2 = (x, y + 2)
             * r1 = (x, y + 1)
             * r0 = (x, y)
             */
            curr_grid = _mm_loadu_ps(grid[x] + y);
            currr_grid = _mm_loadu_ps(grid[x + 1] + y);
            currl_grid = _mm_loadu_ps(grid[x - 1] + y);
            curru_grid = _mm_loadu_ps(grid[x] + y + 1);
            currd_grid = _mm_loadu_ps(grid[x] + y - 1);

            /* Perform arithmetic in an order which should reduce the number of
             * bubbles in the processor pipeline. */
            currr_grid = _mm_add_ps(currr_grid, currl_grid);
            curru_grid = _mm_add_ps(curru_grid, currd_grid);
            ngrid_sse = _mm_mul_ps(curr_grid, sse_ratio1);
            currr_grid = _mm_add_ps(currr_grid, curru_grid);
            currr_grid = _mm_mul_ps(currr_grid, sse_ratio);
            ngrid_sse = _mm_add_ps(currr_grid, ngrid_sse);
            _mm_storeu_ps(ngrid[x] + y, ngrid_sse);
#   endif /* DOUBLE */
         }

         /* Compute the remaining points. */
         for (i = 0; i < y_qdl_r; ++i) {
            ngrid[x][y] = ratio * (grid[x][y + 1] + grid[x][y - 1]
                  + grid[x + 1][y] + grid[x - 1][y]) 
               + ratio1 *  grid[x][y];
            y++;
         }
#endif /* NO_SSE */
      }
      time_end_comp += MPI_Wtime() - time_start_comp;

      if (time % params.freq == 0)
         recv_grid(grid, grains, offset, time, rank, nnodes,
               comm, &time_end_comm, PRINT_COMM, &print_elem,
               (void *)profilefile);

      /* Copy the new grid to the current grid. Use the previously computed
       * y-offsets to determine where copying should start. */
      for (x = 1; x < grains[X_COORD] + 1; ++x) 
         memcpy((void *)(grid[x] + ystart), (void *)(ngrid[x] + ystart),
               (grains[Y_COORD] - (ystart - yend)) * sizeof(grid_type));

      /* Ensure that all the processes are at the same point. */
      MPI_Barrier(comm);
   }

   /* Free the memory used for the grid. */
   for (i = 0; i < grains[X_COORD] + 2; i++) {
      free(grid[i]);
      free(ngrid[i]);
   }

   free(grid);
   free(ngrid);
   free(xup);
   free(xdown);

   if (rank != 0) {
      MPI_Send(&time_end_comm, 1, MPI_DOUBLE, 0, TIME_COMM_TAG, MPI_COMM_WORLD);
      MPI_Send(&time_end_comp, 1, MPI_DOUBLE, 0, TIME_COMP_TAG, MPI_COMM_WORLD);
      MPI_Send(&time_end_init, 1, MPI_DOUBLE, 0, TIME_INIT_TAG, MPI_COMM_WORLD);
   } 

   /* Get all the information on the running time. */
   if (rank == 0) {
      for (i = 1; i < (size_t)nnodes; ++i) {
         MPI_Recv(&time_recv_buf, 1, MPI_DOUBLE, i, TIME_COMM_TAG,
               MPI_COMM_WORLD, &time_sts);
         time_end_comm += time_recv_buf;

         MPI_Recv(&time_recv_buf, 1, MPI_DOUBLE, i, TIME_COMP_TAG,
               MPI_COMM_WORLD, &time_sts);
         time_end_comp += time_recv_buf;

         MPI_Recv(&time_recv_buf, 1, MPI_DOUBLE, i, TIME_INIT_TAG,
               MPI_COMM_WORLD, &time_sts);
         time_end_init += time_recv_buf;
      }
      if (statusfile != NULL) {
         time_end_total = MPI_Wtime() - time_start_total;
         fprintf(statusfile, "%s %i %i %i %zu %lf %lf %lf %lf %li\n", argv[0],
               nnodes, gsize[X_COORD], gsize[Y_COORD], sizeof(grid_type),
               time_end_total, time_end_comp,
               time_end_init, time_end_comm, time);
         fclose(statusfile);
      }

      fclose(profilefile);
   }


   MPI_Finalize();
   return EX_OK;
}
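
The x-direction halo exchange in Example #2 has every rank issue two blocking MPI_Send calls before the matching MPI_Recv calls; that ordering relies on MPI buffering the messages internally and can deadlock once the rows grow large. A minimal sketch of the same exchange expressed with MPI_Sendrecv is shown below. grid_type, MPI_GRID_TYPE and the tag constants are taken from the example, while exchange_x_halo and the separate xdown_halo/xup_halo receive buffers are assumptions made for the sketch.

#include <mpi.h>

/* Hypothetical, deadlock-free variant of the x-direction halo exchange: each
 * MPI_Sendrecv sends a boundary row in one direction and receives the matching
 * row from the opposite direction in the same call, so no rank ever blocks in
 * a lone send. xdown_halo and xup_halo are extra buffers of nx elements. */
static void exchange_x_halo(grid_type *xdown, grid_type *xup,
                            grid_type *xdown_halo, grid_type *xup_halo,
                            int nx, int rank_dneigh, int rank_uneigh,
                            MPI_Comm comm)
{
   MPI_Status status;

   /* Shift "down": the first interior row goes to the lower neighbour while
    * the upper neighbour's first interior row arrives as the upper halo row. */
   MPI_Sendrecv(xdown, nx, MPI_GRID_TYPE, rank_dneigh, X_DOWN_TAG,
                xup_halo, nx, MPI_GRID_TYPE, rank_uneigh, X_DOWN_TAG,
                comm, &status);

   /* Shift "up": the last interior row goes to the upper neighbour while the
    * lower neighbour's last interior row arrives as the lower halo row. */
   MPI_Sendrecv(xup, nx, MPI_GRID_TYPE, rank_uneigh, X_UP_TAG,
                xdown_halo, nx, MPI_GRID_TYPE, rank_dneigh, X_UP_TAG,
                comm, &status);
}

The caller would then copy xup_halo into grid[i + 1][grains[Y_COORD] + 1] and xdown_halo into grid[i + 1][0], just as the example already does with xup and xdown after its receives.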