Example #1
0
File: pic.c Project: afanfa/Kernels
/* Appends two batches of received particles to the end of a destination
 * buffer, enlarging the buffer first when both batches do not fit.
 *
 *   dst_buffer       in/out: address of the destination buffer pointer;
 *                    replaced when the buffer has to be reallocated
 *   position         in/out: number of particles stored in the buffer;
 *                    advanced by the total number of appended particles
 *   buffer_size      in/out: capacity of the buffer, in particles
 *   src_buffer       first batch, n_src_particles entries
 *   src_buffer2      second batch, n_src_particles2 entries
 *
 * On allocation failure the whole MPI job is aborted.
 */
void attach_received_particles(particle_t **dst_buffer, uint64_t *position, uint64_t *buffer_size, particle_t *src_buffer, 
                               uint64_t n_src_particles, particle_t *src_buffer2, uint64_t n_src_particles2)
{
   const uint64_t used     = *position;
   const uint64_t capacity = *buffer_size;
   const uint64_t incoming = n_src_particles + n_src_particles2;
   particle_t *target = *dst_buffer;

   if (used + incoming > capacity) {
      /* Grow by twice the incoming amount so the next few calls fit as well */
      const uint64_t new_capacity = capacity + 2*incoming;
      particle_t *grown = (particle_t*) prk_malloc(new_capacity * sizeof(particle_t));

      if (!grown) {
        printf("Could not increase particle buffer size\n");
        /* do not attempt graceful exit; just allow code to abort */
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
      }

      /* Carry over only the particles that are actually in use */
      memcpy(grown, target, used*sizeof(particle_t));
      prk_free(target);

      target         = grown;
      *dst_buffer    = grown;
      *buffer_size   = new_capacity;
   }

   /* First batch goes right after the existing particles, second batch
      right after the first */
   memcpy(target + used, src_buffer, n_src_particles * sizeof(particle_t));
   *position = used + n_src_particles;
   memcpy(target + used + n_src_particles, src_buffer2, n_src_particles2 * sizeof(particle_t));
   *position = used + incoming;
}
Example #2
0
File: pic.c Project: afanfa/Kernels
/* Adds a particle to a buffer. Resizes buffer if need be.
 *
 *   p            particle to append (copied by value)
 *   buffer       in/out: address of the buffer pointer; replaced when the
 *                buffer has to be reallocated
 *   position     in/out: index of the next free slot; incremented by one
 *   buffer_size  in/out: capacity of the buffer, in particles
 *
 * A full buffer is doubled in size. A buffer of capacity zero is grown to a
 * single slot, because doubling zero would leave no room for the particle.
 * On allocation failure the whole MPI job is aborted.
 */
void add_particle_to_buffer(particle_t p, particle_t **buffer, uint64_t *position, uint64_t *buffer_size)
{
   uint64_t cur_pos = (*position);
   uint64_t cur_buf_size = (*buffer_size);
   particle_t *cur_buffer = (*buffer);
   particle_t *temp_buf;

   if (cur_pos == cur_buf_size) {
      /* Have to resize buffer; make sure the new capacity is never zero */
      uint64_t new_buf_size = (cur_buf_size > 0) ? 2 * cur_buf_size : 1;

      temp_buf = (particle_t*) prk_malloc(new_buf_size * sizeof(particle_t));
      if (!temp_buf) {
        printf("Could not increase particle buffer size\n");
        /* do not attempt graceful exit; just allow code to abort */
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
      }
      /* nothing to carry over from an empty buffer (its pointer may be null) */
      if (cur_buf_size > 0) {
        memcpy(temp_buf, cur_buffer, cur_buf_size*sizeof(particle_t));
      }
      prk_free(cur_buffer);
      cur_buffer = temp_buf;
      (*buffer) = temp_buf;
      (*buffer_size) = new_buf_size;
   }
   
   cur_buffer[cur_pos] = p;
   (*position)++;
}
Example #3
0
File: pic.c Project: afanfa/Kernels
/* Makes sure a buffer can hold at least new_size particles.
 *
 *   buffer    in/out: address of the buffer pointer
 *   size      in/out: capacity of the buffer, in particles
 *   new_size  required capacity, in particles
 *
 * When the current capacity is sufficient nothing happens. Otherwise the old
 * buffer is released and replaced by a fresh allocation of 2*new_size
 * particles; the previous contents are NOT copied over.
 * On allocation failure the whole MPI job is aborted.
 */
void resize_buffer(particle_t **buffer, uint64_t *size, uint64_t new_size)
{
   const uint64_t doubled = 2*new_size;
   particle_t *fresh;

   /* Large enough already: leave buffer and size untouched */
   if (new_size <= (*size)) {
      return;
   }

   prk_free(*buffer);
   fresh = (particle_t*) prk_malloc(doubled*sizeof(particle_t));
   (*buffer) = fresh;

   if (!fresh) {
     printf("Could not increase particle buffer size\n");
     /* do not attempt graceful exit; just allow code to abort */
     MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
   }

   (*size) = doubled;
}
Example #4
0
/*
 * SHMEM matrix transpose driver: B = A^T.
 *
 * Command line (parsed on the root rank, then broadcast to all ranks):
 *   <# iterations> <matrix order> <# buffers> [Tile size]
 *
 * Each rank owns one column block of A and of B. Per iteration a rank
 * transposes its diagonal block locally and, in Num_procs-1 phases, sends
 * one transposed block to another rank while receiving one into one of
 * bufferCount symmetric input buffers. Iteration 0 is an untimed warmup.
 * After the last iteration the result is checked against the analytic
 * solution and the timing is reported by the root rank.
 */
int main(int argc, char ** argv)
{
  long Block_order;        /* number of columns owned by rank       */
  int Block_size;          /* size of a single block                */
  int Colblock_size;       /* size of column block                  */
  int Tile_order=32;       /* default Tile order                    */
  int tiling;              /* boolean: true if tiling is used       */
  int Num_procs;           /* number of ranks                       */
  int order;               /* order of overall matrix               */
  int bufferCount;         /* number of input buffers               */
  int targetBuffer;        /* buffer with which to communicate      */
  int send_to, recv_from;  /* ranks with which to communicate       */
  long bytes;              /* combined size of matrices             */
  int my_ID;               /* rank                                  */
  int root=0;              /* rank of root                          */
  int iterations;          /* number of times to do the transpose   */
  long i, j, it, jt, istart;/* dummies                              */
  int iter;                /* index of iteration                    */
  int phase;               /* phase inside staged communication     */
  int colstart;            /* starting column for owning rank       */
  int error;               /* error flag                            */
  double *A_p;             /* original matrix column block          */
  double *B_p;             /* transposed matrix column block        */
  double **Work_in_p;      /* workspace for the transpose function  */
  double *Work_out_p;      /* workspace for the transpose function  */
  double epsilon = 1.e-8;  /* error tolerance                       */
  double avgtime;          /* timing parameters                     */
  long   *pSync_bcast;     /* work space for collectives            */
  long   *pSync_reduce;    /* work space for collectives            */
  double *pWrk;            /* work space for SHMEM collectives      */
  double *local_trans_time, 
         *trans_time;      /* timing parameters                     */
  double *abserr, 
         *abserr_tot;      /* local and aggregate error             */
  int    *send_flag,
         *recv_flag;       /* synchronization flags                 */
  int    *arguments;       /* command line arguments                */

/*********************************************************************
** Initialize the SHMEM environment
*********************************************************************/

  prk_shmem_init();
  my_ID=prk_shmem_my_pe();
  Num_procs=prk_shmem_n_pes();

  if (my_ID == root) {
    printf("Parallel Research Kernels version %s\n", PRKVERSION);
    printf("SHMEM matrix transpose: B = A^T\n");
  }

// initialize sync variables for error checks
  pSync_bcast      = (long *)   prk_shmem_align(prk_get_alignment(),PRK_SHMEM_BCAST_SYNC_SIZE*sizeof(long));
  pSync_reduce     = (long *)   prk_shmem_align(prk_get_alignment(),PRK_SHMEM_REDUCE_SYNC_SIZE*sizeof(long));
  pWrk             = (double *) prk_shmem_align(prk_get_alignment(),sizeof(double) * PRK_SHMEM_REDUCE_MIN_WRKDATA_SIZE);
  local_trans_time = (double *) prk_shmem_align(prk_get_alignment(),sizeof(double));
  trans_time       = (double *) prk_shmem_align(prk_get_alignment(),sizeof(double));
  arguments        = (int *)    prk_shmem_align(prk_get_alignment(),4*sizeof(int));
  /* one allocation holds both the local error [0] and the aggregate error [1] */
  abserr           = (double *) prk_shmem_align(prk_get_alignment(),2*sizeof(double));
  abserr_tot       = abserr + 1;
  if (!pSync_bcast || !pSync_reduce || !pWrk || !local_trans_time ||
      !trans_time || !arguments || !abserr) {
    printf("Rank %d could not allocate scalar work space on symm heap\n", my_ID);
    error = 1;
    /* the label sits just before bail_out(), so every rank reports its flag */
    goto ENDOFTESTS;
  }

  for(i=0;i<PRK_SHMEM_BCAST_SYNC_SIZE;i++)
    pSync_bcast[i]=PRK_SHMEM_SYNC_VALUE;

  for(i=0;i<PRK_SHMEM_REDUCE_SYNC_SIZE;i++)
    pSync_reduce[i]=PRK_SHMEM_SYNC_VALUE;

/*********************************************************************
** process, test and broadcast input parameters
*********************************************************************/
  error = 0;
  if (my_ID == root) {
    if (argc != 4 && argc != 5){
      printf("Usage: %s <# iterations> <matrix order> <# buffers> [Tile size]\n",
                                                               *argv);
      error = 1; goto ENDOFTESTS;
    }

    iterations  = atoi(*++argv);
    arguments[0]=iterations;
    if(iterations < 1){
      printf("ERROR: iterations must be >= 1 : %d \n",iterations);
      error = 1; goto ENDOFTESTS;
    }

    order = atoi(*++argv);
    arguments[1]=order;
    if (order < Num_procs) {
      printf("ERROR: matrix order %d should at least # procs %d\n", 
             order, Num_procs);
      error = 1; goto ENDOFTESTS;
    }
    if (order%Num_procs) {
      printf("ERROR: matrix order %d should be divisible by # procs %d\n",
             order, Num_procs);
      error = 1; goto ENDOFTESTS;
    }

    bufferCount = atoi(*++argv);
    arguments[2]=bufferCount;
    if (Num_procs > 1) {
      if ((bufferCount < 1) || (bufferCount >= Num_procs)) {
        printf("ERROR: bufferCount must be >= 1 and < # procs : %d\n", bufferCount);
        error = 1; goto ENDOFTESTS;
      }
    }

    if (argc == 5) Tile_order = atoi(*++argv);
    arguments[3]=Tile_order;

    ENDOFTESTS:;
  }
  bail_out(error);

  if (my_ID == root) {
    printf("Number of ranks      = %d\n", Num_procs);
    printf("Matrix order         = %d\n", order);
    printf("Number of iterations = %d\n", iterations);
    printf("Number of buffers    = %d\n", bufferCount);
    if ((Tile_order > 0) && (Tile_order < order))
          printf("Tile size            = %d\n", Tile_order);
    else  printf("Untiled\n");
  }
  
  shmem_barrier_all();

  /*  Broadcast input data to all ranks */
  shmem_broadcast32(&arguments[0], &arguments[0], 4, root, 0, 0, Num_procs, pSync_bcast);

  iterations=arguments[0];
  order=arguments[1];
  bufferCount=arguments[2];
  Tile_order=arguments[3];

  shmem_barrier_all();
  prk_shmem_free(arguments);

  /* a non-positive tile size means no tiling of the local transpose */
  tiling = (Tile_order > 0) && (Tile_order < order);
  bytes = 2 * sizeof(double) * order * order;

/*********************************************************************
** The matrix is broken up into column blocks that are mapped one to a 
** rank.  Each column block is made up of Num_procs smaller square 
** blocks of order block_order.
*********************************************************************/

  Block_order    = order/Num_procs;
  colstart       = Block_order * my_ID;
  Colblock_size  = order * Block_order;
  Block_size     = Block_order * Block_order;

/*********************************************************************
** Create the column block of the test matrix, the row block of the 
** transposed matrix, and workspace (workspace only if #procs>1)
*********************************************************************/
  A_p = (double *)prk_malloc(Colblock_size*sizeof(double));
  if (A_p == NULL){
    printf(" Error allocating space for original matrix on node %d\n",my_ID);
    error = 1;
  }
  bail_out(error);

  B_p = (double *)prk_malloc(Colblock_size*sizeof(double));
  if (B_p == NULL){
    printf(" Error allocating space for transpose matrix on node %d\n",my_ID);
    error = 1;
  }
  bail_out(error);

  if (Num_procs>1) {
    /* Work_in_p is an array of buffer pointers, so size it by pointer width */
    Work_in_p   = (double**)prk_malloc(bufferCount*sizeof(double *));
    Work_out_p = (double *) prk_malloc(Block_size*sizeof(double));
    recv_flag  = (int*)     prk_shmem_align(prk_get_alignment(),bufferCount*sizeof(int));
    if ((Work_in_p == NULL)||(Work_out_p==NULL) || (recv_flag == NULL)){
      printf(" Error allocating space for work or flags on node %d\n",my_ID);
      error = 1;
    }

    /* send flags are only needed when there are fewer buffers than phases */
    if (bufferCount < (Num_procs - 1)) {
      send_flag = (int*) prk_shmem_align(prk_get_alignment(), (Num_procs-1) * sizeof(int));

      if (send_flag == NULL) {
        printf("Error allocating space for flags on node %d\n", my_ID);
        error = 1;
      }
    }

    bail_out(error);

    for(i=0;i<bufferCount;i++) {
      Work_in_p[i]=(double *) prk_shmem_align(prk_get_alignment(),Block_size*sizeof(double));
      if (Work_in_p[i] == NULL) {
        printf(" Error allocating space for work on node %d\n",my_ID);
        error = 1;
      }
      bail_out(error);
    }

    if (bufferCount < (Num_procs - 1)) {
      for(i=0;i<(Num_procs-1);i++)
        send_flag[i]=0;
    }

    for(i=0;i<bufferCount;i++)
      recv_flag[i]=0;
  }
  
  /* Fill the original column matrices                                              */
  istart = 0;  
  for (j=0;j<Block_order;j++) 
    for (i=0;i<order; i++)  {
      A(i,j) = (double) (order*(j+colstart) + i);
      B(i,j) = 0.0;
  }

  shmem_barrier_all();

  /* pre-increment the send flags for the first bufferCount phases on the
     ranks this rank will receive from in those phases                     */
  if (bufferCount < (Num_procs - 1)) {
    if (Num_procs > 1) {
      for ( i = 0; i < bufferCount; i++) {
        recv_from = (my_ID + i + 1)%Num_procs;
        shmem_int_inc(&send_flag[i], recv_from);
      }
    }
  }

  shmem_barrier_all();

  for (iter = 0; iter<=iterations; iter++){

    /* start timer after a warmup iteration                                        */
    if (iter == 1) { 
      shmem_barrier_all();
      local_trans_time[0] = wtime();
    }

    /* do the local transpose                                                     */
    istart = colstart; 
    if (!tiling) {
      for (i=0; i<Block_order; i++) 
        for (j=0; j<Block_order; j++) {
          B(j,i) += A(i,j);
          A(i,j) += 1.0;
        }
    }
    else {
      for (i=0; i<Block_order; i+=Tile_order) 
        for (j=0; j<Block_order; j+=Tile_order) 
          for (it=i; it<MIN(Block_order,i+Tile_order); it++)
            for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) {
              B(jt,it) += A(it,jt); 
              A(it,jt) += 1.0;
            }
    }

    for (phase=1; phase<Num_procs; phase++){
      recv_from = (my_ID + phase            )%Num_procs;
      send_to   = (my_ID - phase + Num_procs)%Num_procs;

      /* input buffers are used round-robin across all phases of all iterations */
      targetBuffer = (iter * (Num_procs - 1) + (phase - 1)) % bufferCount;

      /* transpose the block destined for send_to into the output workspace */
      istart = send_to*Block_order; 
      if (!tiling) {
        for (i=0; i<Block_order; i++) 
          for (j=0; j<Block_order; j++){
            Work_out(j,i) = A(i,j);
            A(i,j) += 1.0;
          }
      }
      else {
        for (i=0; i<Block_order; i+=Tile_order) 
          for (j=0; j<Block_order; j+=Tile_order) 
            for (it=i; it<MIN(Block_order,i+Tile_order); it++)
              for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) {
                Work_out(jt,it) = A(it,jt); 
                A(it,jt) += 1.0;
              }
      }

      /* with fewer buffers than phases, wait for the send flag of this phase
         to reach iter+1 before writing into the remote buffer               */
      if (bufferCount < (Num_procs - 1))
        shmem_int_wait_until(&send_flag[phase-1], SHMEM_CMP_EQ, iter+1);

      shmem_double_put(&Work_in_p[targetBuffer][0], &Work_out_p[0], Block_size, send_to);
      /* fence between the data put and the flag update */
      shmem_fence();
      shmem_int_inc(&recv_flag[targetBuffer], send_to);

      /* expected flag value: number of global phases so far divided by
         bufferCount, rounded up                                          */
      i = (iter * (Num_procs - 1) + phase) / bufferCount;

      if ((iter * (Num_procs - 1) + phase) % bufferCount)
        i++;

      shmem_int_wait_until(&recv_flag[targetBuffer], SHMEM_CMP_EQ, i);

      istart = recv_from*Block_order; 
      /* scatter received block to transposed matrix; no need to tile */
      for (j=0; j<Block_order; j++)
        for (i=0; i<Block_order; i++) 
          B(i,j) += Work_in(targetBuffer, i,j);

      /* buffer has been consumed: increment the send flag of the phase that
         will reuse it, on the rank that sends in that phase                */
      if (bufferCount < (Num_procs - 1)) {
        if ((phase + bufferCount) < Num_procs)
          recv_from = (my_ID + phase + bufferCount) % Num_procs;
        else
          recv_from = (my_ID + phase + bufferCount + 1 - Num_procs) % Num_procs;

        shmem_int_inc(&send_flag[(phase+bufferCount-1)%(Num_procs-1)], recv_from);
      }
    }  /* end of phase loop  */
  } /* end of iterations */

  local_trans_time[0] = wtime() - local_trans_time[0];

  shmem_barrier_all();
  shmem_double_max_to_all(trans_time, local_trans_time, 1, 0, 0, Num_procs, pWrk, pSync_reduce);

  /* compare the local part of B against the analytic solution */
  abserr[0] = 0.0;
  istart = 0;
  double addit = ((double)(iterations+1) * (double) (iterations))/2.0;
  for (j=0;j<Block_order;j++) for (i=0;i<order; i++) {
      abserr[0] += ABS(B(i,j) - (double)((order*i + j+colstart)*(iterations+1)+addit));
  }

  shmem_barrier_all();
  shmem_double_sum_to_all(abserr_tot, abserr, 1, 0, 0, Num_procs, pWrk, pSync_reduce);

  if (my_ID == root) {
    if (abserr_tot[0] <= epsilon) {
      printf("Solution validates\n");
      avgtime = trans_time[0]/(double)iterations;
      printf("Rate (MB/s): %lf Avg time (s): %lf\n",1.0E-06*bytes/avgtime, avgtime);
#ifdef VERBOSE
      /* report the aggregate over all ranks, i.e. the value that was tested */
      printf("Summed errors: %f \n", abserr_tot[0]);
#endif
    }
    else {
      /* report the aggregate over all ranks, i.e. the value that was tested */
      printf("ERROR: Aggregate squared error %e exceeds threshold %e\n", abserr_tot[0], epsilon);
      error = 1;
    }
  }

  bail_out(error);

  if (Num_procs>1) 
  {
    if (bufferCount < (Num_procs - 1))
      prk_shmem_free(send_flag);

    prk_shmem_free(recv_flag);
    prk_free(Work_out_p);

    for(i=0;i<bufferCount;i++)
      prk_shmem_free(Work_in_p[i]);

    prk_free(Work_in_p);
  }

  /* release the matrix column blocks */
  prk_free(A_p);
  prk_free(B_p);

  /* release the remaining symmetric-heap scalars; abserr_tot points into
     the abserr allocation and must not be freed separately               */
  prk_shmem_free(local_trans_time);
  prk_shmem_free(trans_time);
  prk_shmem_free(abserr);

  prk_shmem_free(pSync_bcast);
  prk_shmem_free(pSync_reduce);
  prk_shmem_free(pWrk);

  prk_shmem_finalize();
  exit(EXIT_SUCCESS);

}  /* end of main */