Example #1
void benchmark (long * msg_buffer, int me, int pairs, int nxtpe, MPI_Win win)
{
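	/*
	 * Message-rate benchmark: the first 'pairs' ranks stream MAX_MSG_SZ-long
	 * puts to partner 'nxtpe' as a warmup, completing them locally with
	 * MPI_Win_flush_local, then measure message_rate() for each power-of-two
	 * message size and reduce the per-pair rates to rank 0 for printing.
	 */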
	static double mr, mr_sum;
	int iters;
	
	if (msg_buffer == NULL) {
		printf("Input buffer is NULL, no reason to proceed\n");
		exit(-1);
	}
	/*
	 * Warmup
	 */
	if (me < pairs) {
		for (int i = 0; i < ITERS_LARGE; i += 1) {
			MPI_Put ((msg_buffer + i*MAX_MSG_SZ), MAX_MSG_SZ, MPI_LONG, nxtpe, i*MAX_MSG_SZ, MAX_MSG_SZ, MPI_LONG, win);
			MPI_Win_flush_local (nxtpe, win);
		}
	}

	MPI_Win_flush_all(win);
	MPI_Barrier(MPI_COMM_WORLD);
	/*
	 * Benchmark
	 */
	for (long size = 1; size <= MAX_MSG_SZ; size <<= 1) {
		iters = size < LARGE_THRESHOLD ? ITERS_SMALL : ITERS_LARGE;
		mr = message_rate(msg_buffer, size, iters, me, pairs, nxtpe, win);
		MPI_Reduce(&mr, &mr_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
		print_message_rate(size, mr_sum, me);
	}
}
Example #2
/**
   Execute sync_memory: complete all outstanding RMA operations on every
   coarray window with MPI_Win_flush_all and synchronize the public and
   private window copies with MPI_Win_sync
 */
void _XMP_mpi_sync_memory()
{
  if(_XMP_flag_multi_win){
    int num = 0;
    _XMP_coarray_t **coarrays = _XMP_coarray_get_list(&num);
    for(int i = 0; i < num; i++){
      MPI_Win win = coarrays[i]->win;
      if(win != MPI_WIN_NULL){
	XACC_DEBUG("flush_all for host a coarray (%ld)", (long)win);
	MPI_Win_flush_all(win);
	XACC_DEBUG("sync for host a coarray (%ld)", (long)win);
	MPI_Win_sync(win);
      }
#ifdef _XMP_XACC
      MPI_Win win_acc = coarrays[i]->win_acc;
      if(win_acc != MPI_WIN_NULL){
	XACC_DEBUG("flush_all for acc a coarray (%ld)", (long)win_acc);
	MPI_Win_flush_all(win_acc);
	XACC_DEBUG("sync for acc a coarray (%ld)", (long)win_acc);
	MPI_Win_sync(win_acc);
      }
#endif
    }
  }else{
    if(! _is_coarray_win_flushed){
      XACC_DEBUG("flush_all for host single coarray(%ld)", (long)_xmp_mpi_onesided_win);
      MPI_Win_flush_all(_xmp_mpi_onesided_win);

      _is_coarray_win_flushed = true;
    }

    if(! _is_distarray_win_flushed){
      XACC_DEBUG("flush_all for host single distarray(%ld)", (long)_xmp_mpi_distarray_win);
      MPI_Win_flush_all(_xmp_mpi_distarray_win);

      _is_distarray_win_flushed = true;
    }

#ifdef _XMP_XACC
    if(! _is_coarray_win_acc_flushed){
      XACC_DEBUG("flush_all for acc single coarray(%ld)", (long)_xmp_mpi_onesided_win_acc);
      MPI_Win_flush_all(_xmp_mpi_onesided_win_acc);

      _is_coarray_win_acc_flushed = true;
    }

    if(! _is_distarray_win_acc_flushed){
      XACC_DEBUG("flush_all for acc single distarray(%ld)", (long)_xmp_mpi_distarray_win_acc);
      MPI_Win_flush_all(_xmp_mpi_distarray_win_acc);

      _is_distarray_win_acc_flushed = true;
    }
#endif

    _win_sync();
  }
}
Example #3
MTEST_THREAD_RETURN_TYPE run_test(void *arg)
{
    int i;
    double *local_b;

    MPI_Alloc_mem(COUNT * sizeof(double), MPI_INFO_NULL, &local_b);

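    /* Each pass gets COUNT doubles from rank 0's window and completes the
     * operation with MPI_Win_flush_all; the surrounding test is assumed to
     * open a passive-target epoch on the shared 'win' (e.g. with
     * MPI_Win_lock_all) before these threads run. */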
    for (i = 0; i < LOOPS; i++) {
        MPI_Get(local_b, COUNT, MPI_DOUBLE, 0, 0, COUNT, MPI_DOUBLE, win);
        MPI_Win_flush_all(win);
    }

    MPI_Free_mem(local_b);

    return (MTEST_THREAD_RETURN_TYPE) NULL;
}
Example #4
dart_ret_t dart_flush_all(
  dart_gptr_t gptr)
{
  int16_t seg_id;
  seg_id = gptr.segid;
  MPI_Win win;
  DART_LOG_DEBUG("dart_flush_all() gptr: "
                 "unitid:%d offset:%"PRIu64" segid:%d index:%d",
                 gptr.unitid, gptr.addr_or_offs.offset,
                 gptr.segid,  gptr.flags);
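  /* A non-zero segment id refers to a team-allocated segment whose window is
     looked up in dart_win_lists via gptr.flags; segment id 0 means the memory
     belongs to the window backing local allocations (dart_win_local_alloc). */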
  if (seg_id) {
    uint16_t index = gptr.flags;
    win = dart_win_lists[index];
  } else {
    win = dart_win_local_alloc;
  }
  DART_LOG_TRACE("dart_flush_all: MPI_Win_flush_all");
  MPI_Win_flush_all(win);
  DART_LOG_DEBUG("dart_flush_all > finished");
  return DART_OK;
}
Example #5
/*  garray_put()
 *  Copy the elements in the index range [lo[0], hi[0]] from buf_ into the
 *  distributed array, splitting the range across the owning ranks and
 *  flushing the RMA window before returning.
 */
int64_t garray_put(garray_t *ga, int64_t *lo, int64_t *hi, void *buf_)
{
    int64_t count = (hi[0] - lo[0]) + 1, length = count * ga->elem_size,
            tlonid, tloidx, thinid, thiidx, tnid, tidx, n, oidx = 0;
    int8_t *buf = (int8_t *)buf_;

    calc_target(ga, lo[0], &tlonid, &tloidx);
    calc_target(ga, hi[0], &thinid, &thiidx);

    /* is all data going to the same target? */
    if (tlonid == thinid) {
        LOG_DEBUG(ga->g->glog, "[%d] garray put %ld-%ld, single target %ld.%ld\n",
                  ga->g->nid, lo[0], hi[0], tlonid, tloidx);

        //MPI_Win_lock(MPI_LOCK_EXCLUSIVE, tlonid, 0, ga->win);
        MPI_Put(buf, length, MPI_INT8_T,
                tlonid, (tloidx * ga->elem_size), length, MPI_INT8_T,
                ga->win);
        //MPI_Win_unlock(tlonid, ga->win);
        MPI_Win_flush(tlonid, ga->win);

        return 0;
    }

    /* put the data into the lo nid */
    n = ga->nelems_per_node + (tlonid < ga->nextra_elems ? 1 : 0) - tloidx;

    LOG_DEBUG(ga->g->glog, "[%d] garray putting %ld elements into %ld.%ld\n",
              ga->g->nid, n, tlonid, tloidx);

    //MPI_Win_lock(MPI_LOCK_SHARED, tlonid, 0, ga->win);
    MPI_Put(buf, length, MPI_INT8_T,
            tlonid, (tloidx * ga->elem_size), (n * ga->elem_size), MPI_INT8_T,
            ga->win);
    //MPI_Win_unlock(tlonid, ga->win);

    oidx = (n*ga->elem_size);

    /* put the data into the in-between nids */
    tidx = 0;
    for (tnid = tlonid + 1;  tnid < thinid;  ++tnid) {
        n = ga->nelems_per_node + (tnid < ga->nextra_elems ? 1 : 0);

        LOG_DEBUG(ga->g->glog, "[%d] garray putting %ld elements into %ld.%ld\n",
                  ga->g->nid, n, tnid, tidx);

        //MPI_Win_lock(MPI_LOCK_EXCLUSIVE, tnid, 0, ga->win);
        MPI_Put(&buf[oidx], (n * ga->elem_size), MPI_INT8_T,
                tnid, 0, (n * ga->elem_size), MPI_INT8_T,
                ga->win);
        //MPI_Win_unlock(tnid, ga->win);

        oidx += (n*ga->elem_size);
    }

    /* put the data into the hi nid */
    n = thiidx + 1;

    LOG_DEBUG(ga->g->glog, "[%d] garray putting %ld elements up to %ld.%ld\n",
              ga->g->nid, n, thinid, thiidx);

    //MPI_Win_lock(MPI_LOCK_EXCLUSIVE, thinid, 0, ga->win);
    MPI_Put(&buf[oidx], (n * ga->elem_size), MPI_INT8_T,
            thinid, 0, (n * ga->elem_size), MPI_INT8_T,
            ga->win);
    //MPI_Win_unlock(thinid, ga->win);
    MPI_Win_flush_all(ga->win);

    return 0;
}
Example #6
static int run_test(int nop)
{
    int i, x, errs = 0, errs_total = 0;
    MPI_Status stat;
    int dst;
    int winbuf_offset = 0;
    double t0, avg_total_time = 0.0, t_total = 0.0;
    double sum = 0.0;

    if (nprocs <= NPROCS_M) {
        ITER = ITER_S;
    }
    else {
        ITER = ITER_L;
    }

    target_computation_init();
    MPI_Win_lock_all(0, win);

    t0 = MPI_Wtime();
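    /* Each iteration: accumulate locbuf[0] into slot [rank] of every other
     * process's window and flush, run the local computation phase, then issue
     * the remaining nop-1 accumulates per target and flush again, all inside
     * a single MPI_Win_lock_all epoch. */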
    for (x = 0; x < ITER; x++) {

        // send to all the left processes in a ring style
        for (dst = (rank + 1) % nprocs; dst != rank; dst = (dst + 1) % nprocs) {
            MPI_Accumulate(&locbuf[0], 1, MPI_DOUBLE, dst, rank, 1, MPI_DOUBLE, MPI_SUM, win);
        }
        MPI_Win_flush_all(win);

        target_computation();

        for (dst = (rank + 1) % nprocs; dst != rank; dst = (dst + 1) % nprocs) {
            for (i = 1; i < nop; i++) {
                MPI_Accumulate(&locbuf[i], 1, MPI_DOUBLE, dst, rank, 1, MPI_DOUBLE, MPI_SUM, win);
            }
        }
        MPI_Win_flush_all(win);

        debug_printf("[%d]MPI_Win_flush all done\n", x);
    }
    t_total += MPI_Wtime() - t0;
    t_total /= ITER;

    MPI_Win_unlock_all(win);
    MPI_Barrier(MPI_COMM_WORLD);

    target_computation_exit();

#ifdef CHECK
    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, win);
    sum = 0.0;
    for (i = 0; i < nop; i++) {
        sum += locbuf[i];
    }
    sum *= ITER;
    for (i = 0; i < nprocs; i++) {
        if (i == rank)
            continue;
        if (winbuf[i] != sum) {
            fprintf(stderr,
                    "[%d]computation error : winbuf[%d] %.2lf != %.2lf, nop %d\n",
                    rank, i, winbuf[i], sum, nop);
            errs += 1;
        }
    }
    MPI_Win_unlock(rank, win);
#endif

    MPI_Reduce(&t_total, &avg_total_time, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    MPI_Allreduce(&errs, &errs_total, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

    if (rank == 0) {
        avg_total_time /= nprocs;
#ifdef MTCORE
        fprintf(stdout,
                "mtcore: comp_size %d num_op %d nprocs %d total_time %lf\n",
                DGEMM_SIZE, nop, nprocs, avg_total_time);
#else
        fprintf(stdout,
                "orig: comp_size %d num_op %d nprocs %d total_time %lf\n",
                DGEMM_SIZE, nop, nprocs, avg_total_time);
#endif
    }

    return errs_total;
}
Example #7
int main(int argc, char ** argv)
{
  long Block_order;        /* number of columns owned by rank       */
  long Block_size;         /* size of a single block                */
  long Colblock_size;      /* size of column block                  */
  int Tile_order=32;       /* default Tile order                    */
  int tiling;              /* boolean: true if tiling is used       */
  int Num_procs;           /* number of ranks                       */
  long order;              /* order of overall matrix               */
  int send_to, recv_from;  /* ranks with which to communicate       */
  long bytes;              /* combined size of matrices             */
  int my_ID;               /* rank                                  */
  int root=0;              /* rank of root                          */
  int iterations;          /* number of times to do the transpose   */
  int i, j, it, jt, istart;/* dummies                               */
  int iter;                /* index of iteration                    */
  int phase;               /* phase inside staged communication     */
  int colstart;            /* starting column for owning rank       */
  int error;               /* error flag                            */
  double RESTRICT *A_p;    /* original matrix column block          */
  double RESTRICT *B_p;    /* transposed matrix column block        */
  double RESTRICT *Work_in_p;/* workspace for transpose function    */
  double RESTRICT *Work_out_p;/* workspace for transpose function   */
  double abserr,           /* absolute error                        */
         abserr_tot;       /* aggregate absolute error              */
  double epsilon = 1.e-8;  /* error tolerance                       */
  double local_trans_time, /* timing parameters                     */
         trans_time,
         avgtime;
  MPI_Win  rma_win = MPI_WIN_NULL;
  MPI_Info rma_winfo = MPI_INFO_NULL;
  int passive_target = 0;  /* use passive target RMA sync           */
#if MPI_VERSION >= 3
  int  flush_local  = 1;   /* flush local (or remote) after put     */
  int  flush_bundle = 1;   /* flush every <bundle> put calls        */
#endif

/*********************************************************************
** Initialize the MPI environment
*********************************************************************/
  MPI_Init(&argc,&argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_ID);
  MPI_Comm_size(MPI_COMM_WORLD, &Num_procs);

/*********************************************************************
** process, test and broadcast input parameters
*********************************************************************/
  error = 0;
  if (my_ID == root) {
    printf("Parallel Research Kernels version %s\n", PRKVERSION);
    printf("MPIRMA matrix transpose: B = A^T\n");

    if (argc <= 3){
      printf("Usage: %s <# iterations> <matrix order> [Tile size] "
             "[sync (0=fence, 1=flush)] [flush local?] [flush bundle]\n",
             *argv);
      error = 1; goto ENDOFTESTS;
    }

    iterations  = atoi(*++argv);
    if(iterations < 1){
      printf("ERROR: iterations must be >= 1 : %d \n",iterations);
      error = 1; goto ENDOFTESTS;
    }

    order = atol(*++argv);
    if (order < Num_procs) {
      printf("ERROR: matrix order %ld should be at least # procs %d\n",
             order, Num_procs);
      error = 1; goto ENDOFTESTS;
    }
    if (order%Num_procs) {
      printf("ERROR: matrix order %ld should be divisible by # procs %d\n",
             order, Num_procs);
      error = 1; goto ENDOFTESTS;
    }

    if (argc >= 4) Tile_order     = atoi(*++argv);
    if (argc >= 5) passive_target = atoi(*++argv);
#if MPI_VERSION >= 3
    if (argc >= 6) flush_local    = atoi(*++argv);
    if (argc >= 7) flush_bundle   = atoi(*++argv);
#endif

    ENDOFTESTS:;
  }
  bail_out(error);

  if (my_ID == root) {
    printf("Number of ranks      = %d\n", Num_procs);
    printf("Matrix order         = %ld\n", order);
    printf("Number of iterations = %d\n", iterations);
    if ((Tile_order > 0) && (Tile_order < order))
          printf("Tile size            = %d\n", Tile_order);
    else  printf("Untiled\n");
    if (passive_target) {
#if MPI_VERSION < 3
        printf("Synchronization      = MPI_Win_(un)lock\n");
#else
        printf("Synchronization      = MPI_Win_flush%s (bundle=%d)\n", flush_local ? "_local" : "", flush_bundle);
#endif
    } else {
        printf("Synchronization      = MPI_Win_fence\n");
    }
  }

  /*  Broadcast input data to all ranks */
  MPI_Bcast (&order,          1, MPI_LONG, root, MPI_COMM_WORLD);
  MPI_Bcast (&iterations,     1, MPI_INT,  root, MPI_COMM_WORLD);
  MPI_Bcast (&Tile_order,     1, MPI_INT,  root, MPI_COMM_WORLD);
  MPI_Bcast (&passive_target, 1, MPI_INT,  root, MPI_COMM_WORLD);
#if MPI_VERSION >= 3
  MPI_Bcast (&flush_local,    1, MPI_INT,  root, MPI_COMM_WORLD);
  MPI_Bcast (&flush_bundle,   1, MPI_INT,  root, MPI_COMM_WORLD);
#endif

  /* a non-positive tile size means no tiling of the local transpose */
  tiling = (Tile_order > 0) && (Tile_order < order);
  bytes = 2 * sizeof(double) * order * order;

/*********************************************************************
** The matrix is broken up into column blocks that are mapped one to a
** rank.  Each column block is made up of Num_procs smaller square
** blocks of order block_order.
*********************************************************************/

  Block_order    = order/Num_procs;
  colstart       = Block_order * my_ID;
  Colblock_size  = order * Block_order;
  Block_size     = Block_order * Block_order;

  /* debug message size effects */
  if (my_ID == root) {
    printf("Block_size           = %ld\n", Block_size);
  }

/*********************************************************************
** Create the column block of the test matrix, the row block of the
** transposed matrix, and workspace (workspace only if #procs>1)
*********************************************************************/
  A_p = (double *)prk_malloc(Colblock_size*sizeof(double));
  if (A_p == NULL){
    printf(" Error allocating space for original matrix on node %d\n",my_ID);
    error = 1;
  }
  bail_out(error);

  MPI_Info_create (&rma_winfo);
  MPI_Info_set (rma_winfo, "no locks", "true");
  B_p = (double *)prk_malloc(Colblock_size*sizeof(double));
  if (B_p == NULL){
    printf(" Error allocating space for transpose matrix on node %d\n",my_ID);
    error = 1;
  }
  bail_out(error);

  if (Num_procs>1) {
    Work_out_p = (double *) prk_malloc(Block_size*(Num_procs-1)*sizeof(double));
    if (Work_out_p == NULL){
      printf(" Error allocating space for work_out on node %d\n",my_ID);
      error = 1;
    }
    bail_out(error);

    PRK_Win_allocate(Block_size*(Num_procs-1)*sizeof(double), sizeof(double),
                     rma_winfo, MPI_COMM_WORLD, &Work_in_p, &rma_win);
    if (Work_in_p == NULL){
      printf(" Error allocating space for work on node %d\n",my_ID);
      error = 1;
    }
    bail_out(error);
  }

#if MPI_VERSION >= 3
  if (passive_target && Num_procs>1) {
    MPI_Win_lock_all(MPI_MODE_NOCHECK,rma_win);
  }
#endif

  /* Fill the original column matrix                                                */
  istart = 0;
  for (j=0;j<Block_order;j++) {
    for (i=0;i<order; i++) {
      A(i,j) = (double) (order*(j+colstart) + i);
      B(i,j) = 0.0;
    }
  }

  MPI_Barrier(MPI_COMM_WORLD);

  for (iter = 0; iter<=iterations; iter++) {

    /* start timer after a warmup iteration                                        */
    if (iter == 1) {
      MPI_Barrier(MPI_COMM_WORLD);
      local_trans_time = wtime();
    }

    /* do the local transpose                                                     */
    istart = colstart;
    if (!tiling) {
      for (i=0; i<Block_order; i++) {
        for (j=0; j<Block_order; j++) {
          B(j,i) += A(i,j);
          A(i,j) += 1.0;
        }
      }
    } else {
      for (i=0; i<Block_order; i+=Tile_order) {
        for (j=0; j<Block_order; j+=Tile_order) {
          for (it=i; it<MIN(Block_order,i+Tile_order); it++) {
            for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) {
              B(jt,it) += A(it,jt);
              A(it,jt) += 1.0;
            }
          }
        }
      }
    }

    if (!passive_target && Num_procs>1) {
      MPI_Win_fence(MPI_MODE_NOSTORE | MPI_MODE_NOPRECEDE, rma_win);
    }

    for (phase=1; phase<Num_procs; phase++){
      send_to = (my_ID - phase + Num_procs)%Num_procs;

      istart = send_to*Block_order;
      if (!tiling) {
        for (i=0; i<Block_order; i++) {
          for (j=0; j<Block_order; j++) {
            Work_out(phase-1,j,i) = A(i,j);
            A(i,j) += 1.0;
          }
        }
      } else {
        for (i=0; i<Block_order; i+=Tile_order) {
          for (j=0; j<Block_order; j+=Tile_order) {
            for (it=i; it<MIN(Block_order,i+Tile_order); it++) {
              for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) {
                Work_out(phase-1,jt,it) = A(it,jt);
                A(it,jt) += 1.0;
              }
            }
          }
        }
      }

#if MPI_VERSION < 3
      if (passive_target) {
          MPI_Win_lock(MPI_LOCK_SHARED, send_to, MPI_MODE_NOCHECK, rma_win);
      }
#endif
      MPI_Put(Work_out_p+Block_size*(phase-1), Block_size, MPI_DOUBLE, send_to,
              Block_size*(phase-1), Block_size, MPI_DOUBLE, rma_win);

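      /* Complete the put: pre-MPI-3 passive target wraps each MPI_Put in a
         lock/unlock pair; with MPI-3, either flush just this target after
         every put (local or remote completion, depending on flush_local) or
         flush all targets once every flush_bundle puts. The fence variant
         completes everything after the phase loop instead. */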
      if (passive_target) {
#if MPI_VERSION < 3
        MPI_Win_unlock(send_to, rma_win);
#else
        if (flush_bundle==1) {
          if (flush_local==1) {
              MPI_Win_flush_local(send_to, rma_win);
          } else {
              MPI_Win_flush(send_to, rma_win);
          }
        } else if ( (phase%flush_bundle) == 0) {
          /* Too lazy to record all targets, so let MPI do it internally (hopefully) */
          if (flush_local==1) {
              MPI_Win_flush_local_all(rma_win);
          } else {
              MPI_Win_flush_all(rma_win);
          }
        }
#endif
      }
    }  /* end of phase loop for puts  */
    if (Num_procs>1) {
      if (passive_target) {
#if MPI_VERSION >= 3
          MPI_Win_flush_all(rma_win);
#endif
          MPI_Barrier(MPI_COMM_WORLD);
      } else {
          MPI_Win_fence(MPI_MODE_NOSTORE, rma_win);
      }
    }

    for (phase=1; phase<Num_procs; phase++) {
      recv_from = (my_ID + phase)%Num_procs;
      istart = recv_from*Block_order;
      /* scatter received block to transposed matrix; no need to tile */
      for (j=0; j<Block_order; j++) {
        for (i=0; i<Block_order; i++) {
          B(i,j) += Work_in(phase-1,i,j);
        }
      }
    } /* end of phase loop for scatters */

    /* for the flush case we need to make sure we have consumed Work_in
       before overwriting it in the next iteration                    */
    if (Num_procs>1 && passive_target) {
      MPI_Barrier(MPI_COMM_WORLD);
    }

  } /* end of iterations */

  local_trans_time = wtime() - local_trans_time;
  MPI_Reduce(&local_trans_time, &trans_time, 1, MPI_DOUBLE, MPI_MAX, root,
             MPI_COMM_WORLD);

  abserr = 0.0;
  istart = 0;
  double addit = ((double)(iterations+1) * (double) (iterations))/2.0;
  for (j=0;j<Block_order;j++) {
    for (i=0;i<order; i++) {
      abserr += ABS(B(i,j) - ((double)(order*i + j+colstart)*(iterations+1)+addit));
    }
  }

  MPI_Reduce(&abserr, &abserr_tot, 1, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);

  if (my_ID == root) {
    if (abserr_tot < epsilon) {
      printf("Solution validates\n");
      avgtime = trans_time/(double)iterations;
      printf("Rate (MB/s): %lf Avg time (s): %lf\n",1.0E-06*bytes/avgtime, avgtime);
    }
    else {
      printf("ERROR: Aggregate absolute error %lf exceeds threshold %e\n", abserr_tot, epsilon);
      error = 1;
    }
  }

  bail_out(error);

  if (rma_win!=MPI_WIN_NULL) {
#if MPI_VERSION >=3
    if (passive_target) {
      MPI_Win_unlock_all(rma_win);
    }
#endif
    PRK_Win_free(&rma_win);
  }

  MPI_Finalize();
  exit(EXIT_SUCCESS);

}  /* end of main */
Example #8
int main(int argc, char *argv[])
{
    int err, errs = 0;
    int rank, size;
    int minsize = 2, count;
    int i, j;
    MPI_Aint origcount, targetcount;
    MPI_Aint bufsize;
    MPI_Comm comm;
    MPI_Win win;
    MPI_Aint lb, extent;
    MPI_Datatype origtype, targettype;
    DTP_t orig_dtp, target_dtp;
    void *origbuf, *targetbuf;

    MTest_Init(&argc, &argv);

#ifndef USE_DTP_POOL_TYPE__STRUCT       /* set in 'test/mpi/structtypetest.txt' to split tests */
    MPI_Datatype basic_type;
    int len;
    char type_name[MPI_MAX_OBJECT_NAME] = { 0 };

    err = MTestInitBasicSignature(argc, argv, &count, &basic_type);
    if (err)
        return MTestReturnValue(1);

    /* compute bufsize to limit number of comm in test */
    MPI_Type_get_extent(basic_type, &lb, &extent);
    bufsize = extent * count;

    err = DTP_pool_create(basic_type, count, &orig_dtp);
    if (err != DTP_SUCCESS) {
        MPI_Type_get_name(basic_type, type_name, &len);
        fprintf(stdout, "Error while creating orig pool (%s,%d)\n", type_name, count);
        fflush(stdout);
    }

    err = DTP_pool_create(basic_type, count, &target_dtp);
    if (err != DTP_SUCCESS) {
        MPI_Type_get_name(basic_type, type_name, &len);
        fprintf(stdout, "Error while creating target pool (%s,%d)\n", type_name, count);
        fflush(stdout);
    }
#else
    MPI_Datatype *basic_types = NULL;
    int *basic_type_counts = NULL;
    int basic_type_num;


    err = MTestInitStructSignature(argc, argv, &basic_type_num, &basic_type_counts, &basic_types);
    if (err)
        return MTestReturnValue(1);

    /* TODO: ignore bufsize for structs for now;
     *       we need to compute bufsize also for
     *       this case */
    bufsize = 0;

    err = DTP_pool_create_struct(basic_type_num, basic_types, basic_type_counts, &orig_dtp);
    if (err != DTP_SUCCESS) {
        fprintf(stdout, "Error while creating struct pool\n");
        fflush(stdout);
    }

    err = DTP_pool_create_struct(basic_type_num, basic_types, basic_type_counts, &target_dtp);
    if (err != DTP_SUCCESS) {
        fprintf(stdout, "Error while creating struct pool\n");
        fflush(stdout);
    }

    /* this is ignored */
    count = 0;
#endif

    while (MTestGetIntracommGeneral(&comm, minsize, 1)) {
        if (comm == MPI_COMM_NULL)
            continue;

        MPI_Comm_rank(comm, &rank);
        MPI_Comm_size(comm, &size);
        int orig = 0;

        for (i = 0; i < target_dtp->DTP_num_objs; i++) {
            err = DTP_obj_create(target_dtp, i, 0, 0, 0);
            if (err != DTP_SUCCESS) {
                errs++;
                break;
            }

            targetcount = target_dtp->DTP_obj_array[i].DTP_obj_count;
            targettype = target_dtp->DTP_obj_array[i].DTP_obj_type;
            targetbuf = target_dtp->DTP_obj_array[i].DTP_obj_buf;

            MPI_Type_get_extent(targettype, &lb, &extent);

            MPI_Win_create(targetbuf, lb + targetcount * extent,
                           (int) extent, MPI_INFO_NULL, comm, &win);

            for (j = 0; j < orig_dtp->DTP_num_objs; j++) {
                err = DTP_obj_create(orig_dtp, j, 0, 1, count);
                if (err != DTP_SUCCESS) {
                    errs++;
                    break;
                }

                origcount = orig_dtp->DTP_obj_array[j].DTP_obj_count;
                origtype = orig_dtp->DTP_obj_array[j].DTP_obj_type;
                origbuf = orig_dtp->DTP_obj_array[j].DTP_obj_buf;

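                /* Origin side: replace every target's buffer, complete the
                 * accumulates with MPI_Win_flush_all, and use paired barriers
                 * so the targets can check and restore their buffers while
                 * the epoch is still open; the MPI_Get_accumulate pass below
                 * repeats the same handshake. */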
                if (rank == orig) {
                    int target;
                    MPI_Win_lock_all(0, win);
                    for (target = 0; target < size; target++)
                        if (target != orig) {
                            MPI_Accumulate(origbuf, origcount,
                                           origtype, target, 0,
                                           targetcount, targettype, MPI_REPLACE, win);
                        }

                    MPI_Win_flush_all(win);
                    /*signal to target that the ops are flushed so that it starts checking the result */
                    MPI_Barrier(comm);
                    /*make sure target finishes checking the result before issuing unlock */
                    MPI_Barrier(comm);
                    MPI_Win_unlock_all(win);

                    char *resbuf = (char *) calloc(lb + extent * targetcount, sizeof(char));

                    /*wait for the destination to finish checking and reinitializing the buffer */
                    MPI_Barrier(comm);

                    MPI_Win_lock_all(0, win);
                    for (target = 0; target < size; target++)
                        if (target != orig) {
                            MPI_Get_accumulate(origbuf, origcount,
                                               origtype, resbuf, targetcount,
                                               targettype, target, 0, targetcount,
                                               targettype, MPI_REPLACE, win);

                        }
                    MPI_Win_flush_all(win);
                    /*signal to target that the ops are flushed so that it starts checking the result */
                    MPI_Barrier(comm);
                    /*make sure target finishes checking the result before issuing unlock */
                    MPI_Barrier(comm);
                    MPI_Win_unlock_all(win);
                    free(resbuf);
                } else {
                    /* TODO: add a DTP_buf_set() function to replace this */
                    char *tmp = (char *) calloc(lb + extent * targetcount, sizeof(char));
                    memcpy(tmp, targetbuf, lb + extent * targetcount);

                    MPI_Barrier(comm);
                    MPI_Win_lock(MPI_LOCK_SHARED, rank, 0, win);
                    err = DTP_obj_buf_check(target_dtp, i, 0, 1, count);
                    if (err != DTP_SUCCESS) {
                        errs++;
                    }
                    /* restore target buffer */
                    memcpy(targetbuf, tmp, lb + extent * targetcount);
                    free(tmp);

                    MPI_Barrier(comm);
                    MPI_Win_unlock(rank, win);

                    /*signal the source that checking and reinitialization is done */
                    MPI_Barrier(comm);

                    MPI_Barrier(comm);
                    MPI_Win_lock(MPI_LOCK_SHARED, rank, 0, win);
                    err = DTP_obj_buf_check(target_dtp, i, 0, 1, count);
                    if (err != DTP_SUCCESS) {
                        errs++;
                    }
                    MPI_Barrier(comm);
                    MPI_Win_unlock(rank, win);
                }
                DTP_obj_free(orig_dtp, j);
            }
            MPI_Win_free(&win);
            DTP_obj_free(target_dtp, i);
        }
        MTestFreeComm(&comm);

        /* for large buffers only do one communicator */
        if (MAX_COUNT_SIZE * MAX_TYPE_SIZE < bufsize) {
            break;
        }
    }

    DTP_pool_free(orig_dtp);
    DTP_pool_free(target_dtp);

#ifdef USE_DTP_POOL_TYPE__STRUCT
    /* cleanup array if any */
    if (basic_types) {
        free(basic_types);
    }
    if (basic_type_counts) {
        free(basic_type_counts);
    }
#endif

    MTest_Finalize(errs);
    return MTestReturnValue(errs);
}
Example #9
int main(int argc, char **argv)
{
    FILE    *fp, *fp2;
    char    testName[32] = "MPI_Rget", file1[64], file2[64];
    int     dblSize, proc, nprocs, npairs, partner;
    unsigned int i, j, k, size, localSize, NLOOP = NLOOP_MAX;
    unsigned int smin = MIN_P2P_SIZE, smed = MED_P2P_SIZE, smax = MAX_P2P_SIZE;
    double  tScale = USEC, bwScale = MB_8;
    double  tStart, timeMin, timeMinGlobal, overhead, threshold_lo, threshold_hi;
    double  msgBytes, sizeBytes, localMax, UsedMem;
    double  tElapsed[NREPS], tElapsedGlobal[NREPS];
    double  *A, *B;
    MPI_Win     win;
    MPI_Status  stat;
    MPI_Request req;

    // Initialize parallel environment
    MPI_Init(&argc, &argv);
    MPI_Comm_size( MPI_COMM_WORLD, &nprocs );
    MPI_Comm_rank( MPI_COMM_WORLD, &proc );

    // Test input parameters
    if( nprocs%2 != 0 && proc == 0 )
        fatalError( "P2P test requires an even number of processors" );

    // Check for user defined limits
    checkEnvP2P( proc, &NLOOP, &smin, &smed, &smax );

    // Initialize local variables
    localMax = 0.0;
    npairs   = nprocs/2;
    if( proc < npairs  ) partner = proc + npairs;
    if( proc >= npairs ) partner = proc - npairs;
    UsedMem = (double)smax*(double)sizeof(double)*2.0;

    // Allocate and initialize arrays
    srand( SEED );
    A = doubleVector( smax );
    B = doubleVector( smax );

    // Open output file and write header
    if( proc == 0 ){
        // Check timer overhead in seconds
        timerTest( &overhead, &threshold_lo, &threshold_hi );
        // Open output files and write headers
        sprintf( file1, "rget_time-np_%.4d.dat", nprocs );
        sprintf( file2, "rget_bw-np_%.4d.dat",   nprocs );
        fp  = fopen( file1, "a" );
        fp2 = fopen( file2, "a" );
        printHeaders( fp, fp2, testName, UsedMem, overhead, threshold_lo );
    }

    // Get type size
    MPI_Type_size( MPI_DOUBLE, &dblSize );
    // Set up a window for RMA
    MPI_Win_create( A, smax*dblSize, dblSize, MPI_INFO_NULL, MPI_COMM_WORLD, &win );
    MPI_Win_lock_all( 0, win );
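    // The lock_all epoch stays open for the entire run: each MPI_Rget below is
    // completed by MPI_Wait on its request, and MPI_Win_flush_all additionally
    // ensures no RMA operations remain pending on the window between timings.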
    
    //================================================================
    // Single loop with minimum size to verify that inner loop length  
    // is long enough for the timings to be accurate                     
    //================================================================
    // Warmup with a medium size message
    if( proc < npairs ){
        MPI_Rget( B, smed, MPI_DOUBLE, partner, 0, smed, MPI_DOUBLE, win, &req );
        MPI_Wait( &req, &stat );
        MPI_Win_flush_all( win );
    }
    
    // Test if current NLOOP is enough to capture fastest test cases
    MPI_Barrier( MPI_COMM_WORLD );
    tStart = benchTimer();
    if( proc < npairs ){
        for(j = 0; j < NLOOP; j++){
            MPI_Rget( B, smin, MPI_DOUBLE, partner, 0, smin, MPI_DOUBLE, win, &req );
            MPI_Wait( &req, &stat );
            MPI_Win_flush_all( win );
        }
    }
    timeMin = benchTimer() - tStart;
    MPI_Reduce( &timeMin, &timeMinGlobal, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD );
    if( proc == 0 ) resetInnerLoop( timeMinGlobal, threshold_lo, &NLOOP );
    MPI_Bcast( &NLOOP, 1, MPI_INT, 0, MPI_COMM_WORLD );


    //================================================================
    // Execute test for each requested size                  
    //================================================================
    for( size = smin; size <= smax; size = size*2 ){

        // Warmup with a medium size message
        if( proc < npairs ){
            MPI_Rget( B, smed, MPI_DOUBLE, partner, 0, smed, MPI_DOUBLE, win, &req );
            MPI_Wait( &req, &stat );
            MPI_Win_flush_all( win );
        }

        // Repeat NREPS to collect statistics
        for(i = 0; i < NREPS; i++){
            MPI_Barrier( MPI_COMM_WORLD );
            tStart = benchTimer();
            if( proc < npairs ){
                for(j = 0; j < NLOOP; j++){
                    MPI_Rget( B, size, MPI_DOUBLE, partner, 0, size, MPI_DOUBLE, win, &req );
                    MPI_Wait( &req, &stat );
                    MPI_Win_flush_all( win );
                }
            }
            tElapsed[i] = benchTimer() - tStart;
        }
        MPI_Reduce( tElapsed, tElapsedGlobal, NREPS, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD );
        // Only task 0 needs to do the analysis of the collected data
        if( proc == 0 ){
            // sizeBytes is size to write to file
            // msgBytes is actual data exchanged on the wire
            msgBytes  = (double)size*(double)npairs*(double)dblSize;
            sizeBytes = (double)size*(double)dblSize;
            post_process( fp, fp2, threshold_hi, tElapsedGlobal, tScale, 
                          bwScale, size*dblSize, sizeBytes, msgBytes, &NLOOP, 
                          &localMax, &localSize );
        }
        MPI_Bcast( &NLOOP, 1, MPI_INT, 0, MPI_COMM_WORLD );

    }
    MPI_Win_unlock_all( win );
    
    MPI_Win_free( &win );
    MPI_Barrier( MPI_COMM_WORLD );
    free( A );
    free( B );

    //================================================================
    // Print completion message, free memory and exit                  
    //================================================================
    if( proc == 0 ){
        printSummary( fp2, testName, localMax, localSize );
        fclose( fp2 ); 
        fclose( fp );
    }

    MPI_Finalize();
    return 0;
}
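Most of the examples above follow the same passive-target pattern: open an access epoch with MPI_Win_lock or MPI_Win_lock_all, issue one-sided operations (MPI_Put, MPI_Get, MPI_Accumulate, ...), and call MPI_Win_flush_all to complete every outstanding operation on the window without closing the epoch. The sketch below is a minimal, self-contained illustration of that pattern; it is not taken from any of the projects above, and the buffer and variable names are purely illustrative.

/* Minimal MPI_Win_flush_all sketch: every rank adds its rank number into
 * element [rank] of every other rank's window, completes the accumulates
 * with MPI_Win_flush_all, and reads the result while the epoch stays open. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rank, nprocs;
    double *winbuf;
    MPI_Win win;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    /* One slot per peer, allocated and exposed by the window itself. */
    MPI_Win_allocate((MPI_Aint)(nprocs * sizeof(double)), (int) sizeof(double),
                     MPI_INFO_NULL, MPI_COMM_WORLD, &winbuf, &win);
    for (int i = 0; i < nprocs; i++) winbuf[i] = 0.0;
    MPI_Barrier(MPI_COMM_WORLD);      /* everyone has initialized its window */

    MPI_Win_lock_all(0, win);         /* one passive-target epoch, all targets */

    double val = (double) rank;
    for (int dst = 0; dst < nprocs; dst++) {
        if (dst == rank) continue;
        MPI_Accumulate(&val, 1, MPI_DOUBLE, dst, rank, 1, MPI_DOUBLE,
                       MPI_SUM, win);
    }
    MPI_Win_flush_all(win);           /* complete all accumulates, epoch stays open */
    MPI_Barrier(MPI_COMM_WORLD);      /* all ranks have flushed their updates */
    MPI_Win_sync(win);                /* sync public/private copies before local reads */

    double sum = 0.0;
    for (int i = 0; i < nprocs; i++) sum += winbuf[i];
    printf("rank %d: sum of received ranks = %.1f\n", rank, sum);

    MPI_Win_unlock_all(win);
    MPI_Win_free(&win);
    MPI_Finalize();
    return 0;
}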