Example 1
dart_ret_t dart_flush_local(
  dart_gptr_t gptr)
{
  dart_unit_t target_unitid_abs;
  int16_t seg_id = gptr.segid;
  MPI_Win win;
  target_unitid_abs = gptr.unitid;
  DART_LOG_DEBUG("dart_flush_local() gptr: "
                 "unitid:%d offset:%"PRIu64" segid:%d index:%d",
                 gptr.unitid, gptr.addr_or_offs.offset,
                 gptr.segid,  gptr.flags);
  if (seg_id) {
    uint16_t index = gptr.flags;
    dart_unit_t target_unitid_rel;
    win = dart_win_lists[index];
    DART_LOG_DEBUG("dart_flush_local() win:%"PRIu64" seg:%d unit:%d",
                   (uint64_t)win, seg_id, target_unitid_abs);
    unit_g2l(index, target_unitid_abs, &target_unitid_rel);
    DART_LOG_TRACE("dart_flush_local: MPI_Win_flush_local");
    MPI_Win_flush_local(target_unitid_rel, win);
  } else {
    win = dart_win_local_alloc;
    DART_LOG_DEBUG("dart_flush_local() lwin:%"PRIu64" seg:%d unit:%d",
                   (uint64_t)win, seg_id, target_unitid_abs);
    DART_LOG_TRACE("dart_flush_local: MPI_Win_flush_local");
    MPI_Win_flush_local(target_unitid_abs, win);
  }
  DART_LOG_DEBUG("dart_flush_local > finished");
  return DART_OK;
}
Example 2
int main(int argc, char **argv){
  int i, me, target;
  unsigned int size;
  double t, t_max;
  MPI_Win win;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &me);
  MPI_Win_create(&send_buf, sizeof(char)*MAX_SIZE, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &win);
  
  target = 1 - me;
  MPI_Win_lock_all(0, win);
  
  init_buf(send_buf, me);

  if(me==0) print_items();

  for(size=1;size<MAX_SIZE+1;size*=2){
    MPI_Barrier(MPI_COMM_WORLD);
    for(i=0;i<LOOP+WARMUP;i++){
      if(WARMUP == i)
        t = wtime();

      if(me == 0){
        MPI_Put(send_buf, size, MPI_CHAR, target, 0, size, MPI_CHAR, win);
        /* flush_local: the Put is complete at the origin, so send_buf may be modified below */
        MPI_Win_flush_local(target, win);

        /* wait for the partner's Put into this window (send_buf is also the window buffer) */
        while(send_buf[0] == '0' || send_buf[size-1] == '0'){ MPI_Win_flush(me, win); }
        send_buf[0] = '0'; send_buf[size-1] = '0';
      }
      else {
        while(send_buf[0] == '1' || send_buf[size-1] == '1'){ MPI_Win_flush(me, win); }
        send_buf[0] = '1'; send_buf[size-1] = '1';

        MPI_Put(send_buf, size, MPI_CHAR, target, 0, size, MPI_CHAR, win);
        MPI_Win_flush_local(target, win);
      }
    } // end of LOOP

    t = wtime() - t;
    MPI_Reduce(&t, &t_max, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
    if(me == 0)
      print_results(size, t_max);
  }

  MPI_Win_unlock_all(win);
  MPI_Win_free(&win);
  MPI_Finalize();
  return 0;
}
Example 3
void benchmark (long * msg_buffer, int me, int pairs, int nxtpe, MPI_Win win)
{
	static double mr, mr_sum;
	int iters;
	
	if (msg_buffer == NULL) {
		printf("Input buffer is NULL, no reason to proceed\n");
		exit(-1);
	}
	/*
	 * Warmup
	 */
	if (me < pairs) {
		for (int i = 0; i < ITERS_LARGE; i += 1) {
			MPI_Put ((msg_buffer + i*MAX_MSG_SZ), MAX_MSG_SZ, MPI_LONG, nxtpe, i*MAX_MSG_SZ, MAX_MSG_SZ, MPI_LONG, win);
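			/* local completion is enough inside the warmup loop; remote completion is ensured by MPI_Win_flush_all below */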
			MPI_Win_flush_local (nxtpe, win);
		}
	}

	MPI_Win_flush_all(win);
	MPI_Barrier(MPI_COMM_WORLD);
	/*
	 * Benchmark
	 */
	for (long size = 1; size <= MAX_MSG_SZ; size <<= 1) {
		iters = size < LARGE_THRESHOLD ? ITERS_SMALL : ITERS_LARGE;
		mr = message_rate(msg_buffer, size, iters, me, pairs, nxtpe, win);
		MPI_Reduce(&mr, &mr_sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
		print_message_rate(size, mr_sum, me);
	}
}
Example 4
void ompi_win_flush_local_f(MPI_Fint *rank, MPI_Fint *win, MPI_Fint *ierr)
{
    int c_ierr;
    MPI_Win c_win = MPI_Win_f2c(*win);

    c_ierr = MPI_Win_flush_local(OMPI_FINT_2_INT(*rank), c_win);
    if (NULL != ierr) *ierr = OMPI_INT_2_FINT(c_ierr);
}
Example 5
static inline
void _wait_gets(const int target_rank, const MPI_Win win)
{
  if(_is_get_blocking){
    XACC_DEBUG("flush_local(%d) for [host|acc]", target_rank);
    MPI_Win_flush_local(target_rank, win);
  }
}
Example 6
long shmemx_ct_get(shmemx_ct_t ct)
{
#ifdef ENABLE_SMP_OPTIMIZATIONS
    if (shmem_world_is_smp) {
        return __sync_fetch_and_add(ct,0);
    } else
#endif
    {
        shmem_offset_t win_offset = (ptrdiff_t)((intptr_t)ct - (intptr_t)shmem_sheap_base_ptr);
        long output;
        MPI_Fetch_and_op(NULL, &output, MPI_LONG, shmem_world_rank, win_offset, MPI_NO_OP, shmem_sheap_win);
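        /* the local flush guarantees 'output' holds the fetched value before it is returned */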
        MPI_Win_flush_local(shmem_world_rank, shmem_sheap_win);
        return output;
    }
}
Example 7
/*Run Get_accumulate with flush local*/
void run_get_acc_with_flush_local(int rank, WINDOW type)
{
    int size, i;
    MPI_Aint disp = 0;
    MPI_Win     win;

    for (size = 0; size <= MAX_SIZE; size = (size ? size * 2 : size + 1)) {
        allocate_memory(rank, rbuf, size, type, &win);

        if (type == WIN_DYNAMIC) {
            disp = sdisp_remote;
        }

        if(size > LARGE_MESSAGE_SIZE) {
            loop = LOOP_LARGE;
            skip = SKIP_LARGE;
        }

        if(rank == 0) {
            MPI_CHECK(MPI_Win_lock(MPI_LOCK_EXCLUSIVE, 1, 0, win));
            for (i = 0; i < skip + loop; i++) {
                if (i == skip) {
                    t_start = MPI_Wtime ();
                }
                MPI_CHECK(MPI_Get_accumulate(sbuf, size, MPI_CHAR, cbuf, size, MPI_CHAR, 1, disp, size,
                    MPI_CHAR, MPI_SUM, win));
                MPI_CHECK(MPI_Win_flush_local(1, win));
            }
            t_end = MPI_Wtime ();
            MPI_CHECK(MPI_Win_unlock(1, win));
        }                

        MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
        
        print_latency(rank, size);

        MPI_Win_free(&win);
    }
}
Example 8
static void _mpi_scalar_mget(const int target_rank,
			     void *dst, const _XMP_coarray_t *src_desc,
			     const size_t dst_offset, const size_t src_offset,
			     const int dst_dims, 
			     const _XMP_array_section_t *dst_info,
			     const bool is_src_on_acc)
{
  char *laddr = (char*)dst + dst_offset;
  char *raddr = get_remote_addr(src_desc, target_rank, is_src_on_acc) + src_offset;
  MPI_Win win = get_window(src_desc, is_src_on_acc);
  size_t transfer_size = src_desc->elmt_size;

  XACC_DEBUG("scalar_mget(local_p=%p, size=%zd, target=%d, remote_p=%p, is_acc=%d)", laddr, transfer_size, target_rank, raddr, is_src_on_acc);

  MPI_Get((void*)laddr, transfer_size, MPI_BYTE, target_rank,
	  (MPI_Aint)raddr, transfer_size, MPI_BYTE,
	  win);

  // we have to wait for completion of the get before unpacking
  XACC_DEBUG("flush_local(%d) for [host|acc]", target_rank);
  MPI_Win_flush_local(target_rank, win);

  _unpack_scalar((char*)dst, dst_dims, laddr, dst_info);
}
Example 9
/*Run FOP with flush local*/
void run_fop_with_flush_local (int rank, WINDOW type)
{
    int i;
    MPI_Win     win;

    MPI_Aint disp = 0;

    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

    allocate_atomic_memory(rank, sbuf_original, rbuf_original,
                tbuf_original, NULL, (char **)&sbuf, (char **)&rbuf,
                (char **)&tbuf, NULL, (char **)&rbuf,  MAX_MSG_SIZE, type, &win);

    if(rank == 0) {
        if (type == WIN_DYNAMIC) {
            disp = disp_remote;
        }

        MPI_CHECK(MPI_Win_lock(MPI_LOCK_SHARED, 1, 0, win));
        for (i = 0; i < skip + loop; i++) {
            if (i == skip) {
                t_start = MPI_Wtime ();
            }
            MPI_CHECK(MPI_Fetch_and_op(sbuf, tbuf, MPI_LONG_LONG, 1, disp, MPI_SUM, win));
            MPI_CHECK(MPI_Win_flush_local(1, win));
        }
        t_end = MPI_Wtime ();
        MPI_CHECK(MPI_Win_unlock(1, win));
    }                

    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

    print_latency(rank, 8);

    free_atomic_memory (sbuf, rbuf, tbuf, NULL, win, rank);
}
Example 10
int main(int argc, char *argv[])
{
    int rank, nproc;
    int i;
    MPI_Win win;
    int *tar_buf = NULL;
    int *orig_buf = NULL;
    MPI_Datatype derived_dtp;
    int errors = 0;

    MPI_Init(&argc, &argv);

    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (nproc < 3) {
        fprintf(stderr, "Run this program with at least 3 processes\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    MPI_Alloc_mem(sizeof(int) * DATA_SIZE, MPI_INFO_NULL, &orig_buf);
    MPI_Alloc_mem(sizeof(int) * DATA_SIZE, MPI_INFO_NULL, &tar_buf);

    for (i = 0; i < DATA_SIZE; i++) {
        orig_buf[i] = 1;
        tar_buf[i] = 0;
    }

    MPI_Type_vector(COUNT, BLOCKLENGTH - 1, STRIDE, MPI_INT, &derived_dtp);
    MPI_Type_commit(&derived_dtp);

    MPI_Win_create(tar_buf, sizeof(int) * DATA_SIZE, sizeof(int),
                   MPI_INFO_NULL, MPI_COMM_WORLD, &win);

    /***** test between rank 0 and rank 1 *****/

    if (rank == 1) {
        MPI_Win_lock(MPI_LOCK_SHARED, 0, 0, win);

        for (i = 0; i < OPS_NUM; i++) {
            MPI_Accumulate(orig_buf, 1, derived_dtp,
                           0, 0, DATA_SIZE - COUNT, MPI_INT, MPI_SUM, win);
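            /* flush_local completes the accumulate at the origin only; remote completion is ensured by MPI_Win_unlock below */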
            MPI_Win_flush_local(0, win);
        }

        MPI_Win_unlock(0, win);
    }

    MPI_Barrier(MPI_COMM_WORLD);

    /* check results */
    if (rank == 0) {
        for (i = 0; i < DATA_SIZE - COUNT; i++) {
            if (tar_buf[i] != OPS_NUM) {
                printf("tar_buf[%d] = %d, expected %d\n", i, tar_buf[i], OPS_NUM);
                errors++;
            }
        }
    }

    for (i = 0; i < DATA_SIZE; i++) {
        tar_buf[i] = 0;
    }

    MPI_Barrier(MPI_COMM_WORLD);

    /***** test between rank 0 and rank 2 *****/

    if (rank == 2) {
        MPI_Win_lock(MPI_LOCK_SHARED, 0, 0, win);

        for (i = 0; i < OPS_NUM; i++) {
            MPI_Accumulate(orig_buf, 1, derived_dtp,
                           0, 0, DATA_SIZE - COUNT, MPI_INT, MPI_SUM, win);
            MPI_Win_flush_local(0, win);
        }

        MPI_Win_unlock(0, win);
    }

    MPI_Barrier(MPI_COMM_WORLD);

    /* check results */
    if (rank == 0) {
        for (i = 0; i < DATA_SIZE - COUNT; i++) {
            if (tar_buf[i] != OPS_NUM) {
                printf("tar_buf[%d] = %d, expected %d\n", i, tar_buf[i], OPS_NUM);
                errors++;
            }
        }

        if (errors == 0)
            printf(" No Errors\n");
    }

    MPI_Win_free(&win);

    MPI_Type_free(&derived_dtp);

    MPI_Free_mem(orig_buf);
    MPI_Free_mem(tar_buf);

    MPI_Finalize();

    return 0;
}
Example 11
/*  garray_get()
 */
int64_t garray_get(garray_t *ga, int64_t *lo, int64_t *hi, void *buf_)
{
    int64_t count = (hi[0] - lo[0]) + 1, length = count * ga->elem_size,
            tlonid, tloidx, thinid, thiidx, tnid, tidx, n, oidx = 0;
    int8_t *buf = (int8_t *)buf_;

    calc_target(ga, lo[0], &tlonid, &tloidx);
    calc_target(ga, hi[0], &thinid, &thiidx);

    /* is all requested data on the same target? */
    if (tlonid == thinid) {
        LOG_DEBUG(ga->g->glog, "[%d] garray getting %ld-%ld, single target %ld.%ld\n",
                  ga->g->nid, lo[0], hi[0], tlonid, tloidx);

        //MPI_Win_lock(MPI_LOCK_SHARED, tlonid, 0, ga->win);
        MPI_Get(buf, length, MPI_INT8_T,
                tlonid, (tloidx * ga->elem_size), length, MPI_INT8_T,
                ga->win);
        //MPI_Win_unlock(tlonid, ga->win);
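        /* for a get, local completion means 'buf' has been filled, so it is safe to return */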
        MPI_Win_flush_local(tlonid, ga->win);

        return 0;
    }

    /* get the data in the lo nid */
    n = ga->nelems_per_node + (tlonid < ga->nextra_elems ? 1 : 0) - tloidx;

    LOG_DEBUG(ga->g->glog, "[%d] garray getting %ld elements from %ld.%ld\n",
              ga->g->nid, n, tlonid, tloidx);

    //MPI_Win_lock(MPI_LOCK_SHARED, tlonid, 0, ga->win);
    MPI_Get(buf, (n * ga->elem_size), MPI_INT8_T,
            tlonid, (tloidx * ga->elem_size), (n * ga->elem_size), MPI_INT8_T,
            ga->win);
    //MPI_Win_unlock(tlonid, ga->win);

    oidx = (n * ga->elem_size);

    /* get the data in the in-between nids */
    tidx = 0;
    for (tnid = tlonid + 1;  tnid < thinid;  ++tnid) {
        n = ga->nelems_per_node + (tnid < ga->nextra_elems ? 1 : 0);

        LOG_DEBUG(ga->g->glog, "[%d] garray getting %ld elements from %ld.%ld\n",
                  ga->g->nid, n, tnid, tidx);

        //MPI_Win_lock(MPI_LOCK_SHARED, tnid, 0, ga->win);
        MPI_Get(&buf[oidx], (n * ga->elem_size), MPI_INT8_T,
                tnid, 0, (n * ga->elem_size), MPI_INT8_T,
                ga->win);
        //MPI_Win_unlock(tnid, ga->win);

        oidx += (n * ga->elem_size);
    }

    /* get the data in the hi nid */
    n = thiidx + 1;

    LOG_DEBUG(ga->g->glog, "[%d] garray getting %ld elements up to %ld.%ld\n",
              ga->g->nid, n, thinid, thiidx);

    //MPI_Win_lock(MPI_LOCK_SHARED, thinid, 0, ga->win);
    MPI_Get(&buf[oidx], (n * ga->elem_size), MPI_INT8_T,
            thinid, 0, (n * ga->elem_size), MPI_INT8_T,
            ga->win);
    //MPI_Win_unlock(thinid, ga->win);
    MPI_Win_flush_local_all(ga->win);

    return 0;
}
Example 12
int main(int argc, char ** argv)
{
  long Block_order;        /* number of columns owned by rank       */
  long Block_size;         /* size of a single block                */
  long Colblock_size;      /* size of column block                  */
  int Tile_order=32;       /* default Tile order                    */
  int tiling;              /* boolean: true if tiling is used       */
  int Num_procs;           /* number of ranks                       */
  long order;              /* order of overall matrix               */
  int send_to, recv_from;  /* ranks with which to communicate       */
  long bytes;              /* combined size of matrices             */
  int my_ID;               /* rank                                  */
  int root=0;              /* rank of root                          */
  int iterations;          /* number of times to do the transpose   */
  int i, j, it, jt, istart;/* dummies                               */
  int iter;                /* index of iteration                    */
  int phase;               /* phase inside staged communication     */
  int colstart;            /* starting column for owning rank       */
  int error;               /* error flag                            */
  double RESTRICT *A_p;    /* original matrix column block          */
  double RESTRICT *B_p;    /* transposed matrix column block        */
  double RESTRICT *Work_in_p;/* workspace for transpose function    */
  double RESTRICT *Work_out_p;/* workspace for transpose function   */
  double abserr,           /* absolute error                        */
         abserr_tot;       /* aggregate absolute error              */
  double epsilon = 1.e-8;  /* error tolerance                       */
  double local_trans_time, /* timing parameters                     */
         trans_time,
         avgtime;
  MPI_Win  rma_win = MPI_WIN_NULL;
  MPI_Info rma_winfo = MPI_INFO_NULL;
  int passive_target = 0;  /* use passive target RMA sync           */
#if MPI_VERSION >= 3
  int  flush_local  = 1;   /* flush local (or remote) after put     */
  int  flush_bundle = 1;   /* flush every <bundle> put calls        */
#endif

/*********************************************************************
** Initialize the MPI environment
*********************************************************************/
  MPI_Init(&argc,&argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_ID);
  MPI_Comm_size(MPI_COMM_WORLD, &Num_procs);

/*********************************************************************
** process, test and broadcast input parameters
*********************************************************************/
  error = 0;
  if (my_ID == root) {
    printf("Parallel Research Kernels version %s\n", PRKVERSION);
    printf("MPIRMA matrix transpose: B = A^T\n");

    if (argc <= 3){
      printf("Usage: %s <# iterations> <matrix order> [Tile size]"
             "[sync (0=fence, 1=flush)] [flush local?] [flush bundle]\n",
             *argv);
      error = 1; goto ENDOFTESTS;
    }

    iterations  = atoi(*++argv);
    if(iterations < 1){
      printf("ERROR: iterations must be >= 1 : %d \n",iterations);
      error = 1; goto ENDOFTESTS;
    }

    order = atol(*++argv);
    if (order < Num_procs) {
      printf("ERROR: matrix order %ld should at least # procs %d\n",
             order, Num_procs);
      error = 1; goto ENDOFTESTS;
    }
    if (order%Num_procs) {
      printf("ERROR: matrix order %ld should be divisible by # procs %d\n",
             order, Num_procs);
      error = 1; goto ENDOFTESTS;
    }

    if (argc >= 4) Tile_order     = atoi(*++argv);
    if (argc >= 5) passive_target = atoi(*++argv);
#if MPI_VERSION >= 3
    if (argc >= 6) flush_local    = atoi(*++argv);
    if (argc >= 7) flush_bundle   = atoi(*++argv);
#endif

    ENDOFTESTS:;
  }
  bail_out(error);

  if (my_ID == root) {
    printf("Number of ranks      = %d\n", Num_procs);
    printf("Matrix order         = %ld\n", order);
    printf("Number of iterations = %d\n", iterations);
    if ((Tile_order > 0) && (Tile_order < order))
          printf("Tile size            = %d\n", Tile_order);
    else  printf("Untiled\n");
    if (passive_target) {
#if MPI_VERSION < 3
        printf("Synchronization      = MPI_Win_(un)lock\n");
#else
        printf("Synchronization      = MPI_Win_flush%s (bundle=%d)\n", flush_local ? "_local" : "", flush_bundle);
#endif
    } else {
        printf("Synchronization      = MPI_Win_fence\n");
    }
  }

  /*  Broadcast input data to all ranks */
  MPI_Bcast (&order,          1, MPI_LONG, root, MPI_COMM_WORLD);
  MPI_Bcast (&iterations,     1, MPI_INT,  root, MPI_COMM_WORLD);
  MPI_Bcast (&Tile_order,     1, MPI_INT,  root, MPI_COMM_WORLD);
  MPI_Bcast (&passive_target, 1, MPI_INT,  root, MPI_COMM_WORLD);
#if MPI_VERSION >= 3
  MPI_Bcast (&flush_local,    1, MPI_INT,  root, MPI_COMM_WORLD);
  MPI_Bcast (&flush_bundle,   1, MPI_INT,  root, MPI_COMM_WORLD);
#endif

  /* a non-positive tile size means no tiling of the local transpose */
  tiling = (Tile_order > 0) && (Tile_order < order);
  bytes = 2 * sizeof(double) * order * order;

/*********************************************************************
** The matrix is broken up into column blocks that are mapped one to a
** rank.  Each column block is made up of Num_procs smaller square
** blocks of order block_order.
*********************************************************************/

  Block_order    = order/Num_procs;
  colstart       = Block_order * my_ID;
  Colblock_size  = order * Block_order;
  Block_size     = Block_order * Block_order;

  /* debug message size effects */
  if (my_ID == root) {
    printf("Block_size           = %ld\n", Block_size);
  }

/*********************************************************************
** Create the column block of the test matrix, the row block of the
** transposed matrix, and workspace (workspace only if #procs>1)
*********************************************************************/
  A_p = (double *)prk_malloc(Colblock_size*sizeof(double));
  if (A_p == NULL){
    printf(" Error allocating space for original matrix on node %d\n",my_ID);
    error = 1;
  }
  bail_out(error);

  MPI_Info_create (&rma_winfo);
  MPI_Info_set (rma_winfo, "no locks", "true");
  B_p = (double *)prk_malloc(Colblock_size*sizeof(double));
  if (B_p == NULL){
    printf(" Error allocating space for transpose matrix on node %d\n",my_ID);
    error = 1;
  }
  bail_out(error);

  if (Num_procs>1) {
    Work_out_p = (double *) prk_malloc(Block_size*(Num_procs-1)*sizeof(double));
    if (Work_out_p == NULL){
      printf(" Error allocating space for work_out on node %d\n",my_ID);
      error = 1;
    }
    bail_out(error);

    PRK_Win_allocate(Block_size*(Num_procs-1)*sizeof(double), sizeof(double),
                     rma_winfo, MPI_COMM_WORLD, &Work_in_p, &rma_win);
    if (Work_in_p == NULL){
      printf(" Error allocating space for work on node %d\n",my_ID);
      error = 1;
    }
    bail_out(error);
  }

#if MPI_VERSION >= 3
  if (passive_target && Num_procs>1) {
    MPI_Win_lock_all(MPI_MODE_NOCHECK,rma_win);
  }
#endif

  /* Fill the original column matrix                                                */
  istart = 0;
  for (j=0;j<Block_order;j++) {
    for (i=0;i<order; i++) {
      A(i,j) = (double) (order*(j+colstart) + i);
      B(i,j) = 0.0;
    }
  }

  MPI_Barrier(MPI_COMM_WORLD);

  for (iter = 0; iter<=iterations; iter++) {

    /* start timer after a warmup iteration                                        */
    if (iter == 1) {
      MPI_Barrier(MPI_COMM_WORLD);
      local_trans_time = wtime();
    }

    /* do the local transpose                                                     */
    istart = colstart;
    if (!tiling) {
      for (i=0; i<Block_order; i++) {
        for (j=0; j<Block_order; j++) {
          B(j,i) += A(i,j);
          A(i,j) += 1.0;
        }
      }
    } else {
      for (i=0; i<Block_order; i+=Tile_order) {
        for (j=0; j<Block_order; j+=Tile_order) {
          for (it=i; it<MIN(Block_order,i+Tile_order); it++) {
            for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) {
              B(jt,it) += A(it,jt);
              A(it,jt) += 1.0;
            }
          }
        }
      }
    }

    if (!passive_target && Num_procs>1) {
      MPI_Win_fence(MPI_MODE_NOSTORE | MPI_MODE_NOPRECEDE, rma_win);
    }

    for (phase=1; phase<Num_procs; phase++){
      send_to = (my_ID - phase + Num_procs)%Num_procs;

      istart = send_to*Block_order;
      if (!tiling) {
        for (i=0; i<Block_order; i++) {
          for (j=0; j<Block_order; j++) {
            Work_out(phase-1,j,i) = A(i,j);
            A(i,j) += 1.0;
          }
        }
      } else {
        for (i=0; i<Block_order; i+=Tile_order) {
          for (j=0; j<Block_order; j+=Tile_order) {
            for (it=i; it<MIN(Block_order,i+Tile_order); it++) {
              for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) {
                Work_out(phase-1,jt,it) = A(it,jt);
                A(it,jt) += 1.0;
              }
            }
          }
        }
      }

#if MPI_VERSION < 3
      if (passive_target) {
          MPI_Win_lock(MPI_LOCK_SHARED, send_to, MPI_MODE_NOCHECK, rma_win);
      }
#endif
      MPI_Put(Work_out_p+Block_size*(phase-1), Block_size, MPI_DOUBLE, send_to,
              Block_size*(phase-1), Block_size, MPI_DOUBLE, rma_win);

      if (passive_target) {
#if MPI_VERSION < 3
        MPI_Win_unlock(send_to, rma_win);
#else
        if (flush_bundle==1) {
          if (flush_local==1) {
              MPI_Win_flush_local(send_to, rma_win);
          } else {
              MPI_Win_flush(send_to, rma_win);
          }
        } else if ( (phase%flush_bundle) == 0) {
          /* Too lazy to record all targets, so let MPI do it internally (hopefully) */
          if (flush_local==1) {
              MPI_Win_flush_local_all(rma_win);
          } else {
              MPI_Win_flush_all(rma_win);
          }
        }
#endif
      }
    }  /* end of phase loop for puts  */
    if (Num_procs>1) {
      if (passive_target) {
#if MPI_VERSION >= 3
          MPI_Win_flush_all(rma_win);
#endif
          MPI_Barrier(MPI_COMM_WORLD);
      } else {
          MPI_Win_fence(MPI_MODE_NOSTORE, rma_win);
      }
    }

    for (phase=1; phase<Num_procs; phase++) {
      recv_from = (my_ID + phase)%Num_procs;
      istart = recv_from*Block_order;
      /* scatter received block to transposed matrix; no need to tile */
      for (j=0; j<Block_order; j++) {
        for (i=0; i<Block_order; i++) {
          B(i,j) += Work_in(phase-1,i,j);
        }
      }
    } /* end of phase loop for scatters */

    /* for the flush case we need to make sure we have consumed Work_in
       before overwriting it in the next iteration                    */
    if (Num_procs>1 && passive_target) {
      MPI_Barrier(MPI_COMM_WORLD);
    }

  } /* end of iterations */

  local_trans_time = wtime() - local_trans_time;
  MPI_Reduce(&local_trans_time, &trans_time, 1, MPI_DOUBLE, MPI_MAX, root,
             MPI_COMM_WORLD);

  abserr = 0.0;
  istart = 0;
  double addit = ((double)(iterations+1) * (double) (iterations))/2.0;
  for (j=0;j<Block_order;j++) {
    for (i=0;i<order; i++) {
      abserr += ABS(B(i,j) - ((double)(order*i + j+colstart)*(iterations+1)+addit));
    }
  }

  MPI_Reduce(&abserr, &abserr_tot, 1, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);

  if (my_ID == root) {
    if (abserr_tot < epsilon) {
      printf("Solution validates\n");
      avgtime = trans_time/(double)iterations;
      printf("Rate (MB/s): %lf Avg time (s): %lf\n",1.0E-06*bytes/avgtime, avgtime);
    }
    else {
      printf("ERROR: Aggregate absolute error %lf exceeds threshold %e\n", abserr_tot, epsilon);
      error = 1;
    }
  }

  bail_out(error);

  if (rma_win!=MPI_WIN_NULL) {
#if MPI_VERSION >=3
    if (passive_target) {
      MPI_Win_unlock_all(rma_win);
    }
#endif
    PRK_Win_free(&rma_win);
  }

  MPI_Finalize();
  exit(EXIT_SUCCESS);

}  /* end of main */
Example 13
int main(int argc, char *argv[]) {
  MPI_Win counter, table;
  char commands[MAX_COMMANDS][MAX_COMMAND_LEN] = {{0}};
  int rank, comm_size, i;

  // setenv ("MPICH_ASYNC_PROGRESS", "1", 0);
  MPI_Init (&argc, &argv);

  MPI_Comm_rank (MPI_COMM_WORLD, &rank);
  MPI_Comm_size (MPI_COMM_WORLD, &comm_size);

  MPI_Win_create (&commands, MAX_COMMANDS * MAX_COMMAND_LEN, MAX_COMMAND_LEN,
                 MPI_INFO_NULL, MPI_COMM_WORLD, &table);

  MPI_Win_fence (0, table);

  // Distribute command lines to tasks, round-robin, start from task 1

  i = 0;
  if (rank == 0) {
    char line[MAX_COMMAND_LEN + 2];
    MPI_Win_lock_all (MPI_MODE_NOCHECK, table);
    while (fgets (line, MAX_COMMAND_LEN + 2, stdin) != NULL) {
      if (i >= MAX_COMMANDS * comm_size) {
        fprintf (stderr, "MAX_COMMANDS * comm_size (%d) exceeded.\n", i);
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
      if (strlen (line) > MAX_COMMAND_LEN) {
        fprintf (stderr, "MAX_COMMAND_LEN exceeded, line %d: %s\n",
                 i, line);
        MPI_Abort(MPI_COMM_WORLD, 1);
      }
      MPI_Aint disp = (i / comm_size);
      int target_rank = (i + 1) % comm_size;
      MPI_Put (line, MAX_COMMAND_LEN, MPI_CHAR, target_rank, disp,
               MAX_COMMAND_LEN, MPI_CHAR, table);
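      /* 'line' is overwritten by the next fgets, so the Put only needs local completion here */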
      MPI_Win_flush_local (target_rank, table);
      i++;
    }
    MPI_Win_unlock_all (table);
  }

  MPI_Win_fence (0, table);

  // Initialize next_command counter/pointer to the top of the command
  // line stack.

  int next_command = -1;   /* -1: no commands were assigned to this rank */

  MPI_Win_create (&next_command, sizeof(int), sizeof(int),
                  MPI_INFO_NULL, MPI_COMM_WORLD, &counter);

  for (i = MAX_COMMANDS - 1; i >= 0; i--) {
    if (commands[i][0]) {
      next_command = i;
      break;
    }
  }

  MPI_Barrier (MPI_COMM_WORLD);

  // Execute command lines
  //
  // Process commands from own rank + steal_increment

  const int dec = -1;
  int steal_increment = 0;
  int current_command;
  while (steal_increment < comm_size) {
    int current_rank = (rank + steal_increment) % comm_size;
    MPI_Win_lock (MPI_LOCK_SHARED, current_rank, 0, counter);
    MPI_Fetch_and_op (&dec, &current_command, MPI_INT, current_rank,
                      0, MPI_SUM, counter);
    MPI_Win_unlock (current_rank, counter);
    if (current_command < 0) {
      steal_increment++;
    } else {
      char command[MAX_COMMAND_LEN] = {0};
      MPI_Win_lock (MPI_LOCK_SHARED, current_rank, MPI_MODE_NOCHECK, table);
      MPI_Get (&command, MAX_COMMAND_LEN, MPI_CHAR, current_rank,
               current_command, MAX_COMMAND_LEN, MPI_CHAR, table);
      MPI_Win_unlock (current_rank, table);
      system (command);
    }
  }

  MPI_Win_free (&counter);
  MPI_Win_free (&table);
  MPI_Barrier (MPI_COMM_WORLD);
  MPI_Finalize ();
  exit (0);
}
Example 14
FORT_DLL_SPEC void FORT_CALL mpi_win_flush_local_ ( MPI_Fint *v1, MPI_Fint *v2, MPI_Fint *ierr ){
    *ierr = MPI_Win_flush_local( (int)*v1, (MPI_Win)*v2 );
}
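For reference, a minimal self-contained sketch of the pattern most of these examples share (not taken from any of the projects above; the rank numbers and buffer names are illustrative, and it assumes at least two ranks): issue a Put under a passive-target epoch, call MPI_Win_flush_local so the origin buffer may be reused immediately, and rely on MPI_Win_flush (or the unlock) for remote completion.

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rank, nproc, value = 0;
    MPI_Win win;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
    if (nproc < 2) MPI_Abort(MPI_COMM_WORLD, 1);

    /* every rank exposes a single int */
    MPI_Win_create(&value, sizeof(int), sizeof(int),
                   MPI_INFO_NULL, MPI_COMM_WORLD, &win);
    MPI_Win_lock_all(0, win);

    if (rank == 0) {
        int payload = 42;
        MPI_Put(&payload, 1, MPI_INT, 1, 0, 1, MPI_INT, win);
        MPI_Win_flush_local(1, win);  /* local completion: payload may be reused */
        payload = 0;                  /* does not change what rank 1 receives */
        MPI_Win_flush(1, win);        /* remote completion at rank 1 */
    }

    MPI_Barrier(MPI_COMM_WORLD);      /* order rank 0's flush before rank 1's read */

    if (rank == 1) {
        MPI_Win_sync(win);            /* make the public window copy visible locally */
        printf("received %d\n", value);
    }

    MPI_Win_unlock_all(win);
    MPI_Win_free(&win);
    MPI_Finalize();
    return 0;
}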