Code Example #1
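/* Benchmark comparing MPI_Bcast against a hand-rolled SMP_Bcast on the per-node
   communicator created with MPI_Comm_split_type; the first iteration also checks
   that both broadcasts delivered the root's data. SMP_Bcast is presumably a
   user-level shared-memory broadcast defined elsewhere in the original source. */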
int main(int argc, char* argv[])
{
    MPI_Init(&argc,&argv);

    MPI_Comm MPI_COMM_NODE;
    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &MPI_COMM_NODE);

    int n = (argc>1) ? atoi(argv[1]) : 1000;

    int wrank, wsize;
    MPI_Comm_rank(MPI_COMM_WORLD, &wrank);
    MPI_Comm_size(MPI_COMM_WORLD, &wsize);

    int nrank, nsize;
    /* rank and size within the node-local communicator */
    MPI_Comm_rank(MPI_COMM_NODE, &nrank);
    MPI_Comm_size(MPI_COMM_NODE, &nsize);

    char * buf1 = NULL;
    char * buf2 = NULL;
    MPI_Alloc_mem(n, MPI_INFO_NULL, &buf1);
    MPI_Alloc_mem(n, MPI_INFO_NULL, &buf2);

    memset(buf1, nrank==0 ? 'Z' : 'A', n);
    memset(buf2, nrank==0 ? 'Z' : 'A', n);

    double t0, t1, dt;
    for (int r=0; r<20; r++) {
        MPI_Barrier(MPI_COMM_WORLD);
        t0 = MPI_Wtime();
        MPI_Bcast(buf1, n, MPI_CHAR, 0, MPI_COMM_NODE);
        t1 = MPI_Wtime();
        dt = t1-t0;
        printf("%d: MPI_Bcast: %lf seconds, %lf MB/s \n", wrank, dt, n*(1.e-6/dt));
        fflush(stdout);

        MPI_Barrier(MPI_COMM_WORLD);
        t0 = MPI_Wtime();
        SMP_Bcast(buf2, n, MPI_CHAR, 0, MPI_COMM_NODE);
        t1 = MPI_Wtime();
        dt = t1-t0;
        printf("%d: SMP_Bcast: %lf seconds, %lf MB/s \n", wrank, dt, n*(1.e-6/dt));
        fflush(stdout);

        if (r==0) {
            char * tmp = malloc(n);
            memset(tmp, 'Z', n);
            int err1 = memcmp(tmp, buf1, n);
            int err2 = memcmp(tmp, buf2, n);
            if (err1>0 || err2>0) {
                printf("%d: errors: MPI (%d), SMP (%d) \n", wrank, err1, err2);
            }
        }
    }

    MPI_Free_mem(buf1);
    MPI_Free_mem(buf2);

    MPI_Comm_free(&MPI_COMM_NODE);

    MPI_Finalize();
    return 0;
}
Code Example #2
File: MashmIntraNodeComm.c  Project: NCAR/MASHM
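/* Initializes MASHM's intra-node communicator: splits the parent communicator into
   shared-memory (per-node) groups and gathers the parent ranks of all processes
   that share the node. */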
int MashmIntraNodeCommInit(MashmIntraNodeComm* intraComm, MPI_Comm in_comm) {
  int ierr;

  /* Store the parent communicator */
  intraComm->parentComm = in_comm;

  /* Create the shared memory subcommunicator groups */
  ierr = MPI_Comm_split_type(intraComm->parentComm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &(intraComm->comm));

  /* Determine the number of nodal comm groups */

  ierr = MPI_Comm_size(intraComm->comm, &(intraComm->size));
  ierr = MPI_Comm_rank(intraComm->comm, &(intraComm->rank));
  ierr = MPI_Comm_rank(intraComm->parentComm, &(intraComm->parentRank));

  /* Subcommunicator rank == 0 is the master process */
  if (intraComm->rank == 0) {
    intraComm->isMasterProc = true;
  }
  else {
    intraComm->isMasterProc = false;
  }

  intraComm->parentRanksOnNode = (int *) malloc(sizeof(int)*intraComm->size);

  ierr = MPI_Allgather(&(intraComm->parentRank), 1, MPI_INT, 
                       intraComm->parentRanksOnNode, 1, MPI_INT, intraComm->comm);

  return 0;
}
Code Example #3
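// Determines which ranks share memory: split the communicator by node, broadcast
// the world rank of each node's root, gather those on rank 0, and assign a
// contiguous hardware id to each shared-memory group.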
void
MemoryUsageReporter::sharedMemoryRanksBySplitCommunicator()
{
  // figure out which ranks share memory
  processor_id_type world_rank = 0;
#ifdef LIBMESH_HAVE_MPI
  // create a split communicator among shared memory ranks
  MPI_Comm shmem_raw_comm;
  MPI_Comm_split_type(
      _mur_communicator.get(), MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmem_raw_comm);
  Parallel::Communicator shmem_comm(shmem_raw_comm);

  // broadcast the world rank of the sub group root
  world_rank = _my_rank;
  shmem_comm.broadcast(world_rank, 0);
#endif
  std::vector<processor_id_type> world_ranks(_nrank);
  _mur_communicator.gather(0, world_rank, world_ranks);

  // assign a contiguous unique numerical id to each shared memory group on processor zero
  unsigned int id = 0;
  processor_id_type last = world_ranks[0];
  if (_my_rank == 0)
    for (std::size_t i = 0; i < _nrank; ++i)
    {
      if (world_ranks[i] != last)
      {
        last = world_ranks[i];
        id++;
      }
      _hardware_id[i] = id;
    }
}
Code Example #4
File: win_alloc_shared.c  Project: jeffhammond/oshmpi
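/* Allocates a shared-memory window on a node-local communicator with the
   "alloc_shared_noncontig" hint and touches the whole allocation. */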
int main(int argc, char* argv[])
{
    MPI_Init(&argc,&argv);

    MPI_Aint bytes = (argc>1) ? atol(argv[1]) : 128*1024*1024;
    printf("bytes = %zu\n", bytes);

    MPI_Comm comm_shared = MPI_COMM_NULL;
    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0 /* key */, MPI_INFO_NULL, &comm_shared);

    MPI_Info info_win = MPI_INFO_NULL;
    MPI_Info_create(&info_win);
    MPI_Info_set(info_win, "alloc_shared_noncontig", "true");

    MPI_Win win_shared = MPI_WIN_NULL;
    void * base_ptr = NULL;
    int rc = MPI_Win_allocate_shared(bytes, 1 /* disp_unit */, info_win, comm_shared, &base_ptr, &win_shared);

    memset(base_ptr,255,bytes);

    MPI_Info_free(&info_win);

    /* free the shared-memory window before the communicator and MPI_Finalize */
    MPI_Win_free(&win_shared);
    MPI_Comm_free(&comm_shared);
    MPI_Finalize();
    return 0;
}
Code Example #5
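/* MPICH-style error test: checks that MPI_Comm_split_type reports MPI_ERR_ARG
   when the output communicator pointer is NULL. */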
int main(int argc, char* argv[])
{
    MPI_Comm comm, newcomm, scomm;
    MPI_Group group;
    MPI_Info newinfo = MPI_INFO_NULL;
    int rank, size, color;
    int errs = 0, errclass, mpi_errno;

    MTest_Init(&argc, &argv);

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_dup(MPI_COMM_WORLD, &comm);
    MPI_Comm_group(comm, &group);

    MPI_Comm_create(comm, group, &newcomm);
    color = rank % 2;
    MPI_Comm_split(MPI_COMM_WORLD, color, rank, &scomm);
    MPI_Errhandler_set(MPI_COMM_WORLD, MPI_ERRORS_RETURN);

    /* test comm_split_type with a NULL output communicator pointer */
    mpi_errno = MPI_Comm_split_type(scomm, 2, 4, newinfo, NULL);
    MPI_Error_class(mpi_errno, &errclass);
    if (errclass != MPI_ERR_ARG)
        ++errs;

    MPI_Comm_free(&comm);
    MPI_Comm_free(&newcomm);
    MPI_Comm_free(&scomm);
    MPI_Group_free(&group);
    MTest_Finalize(errs);
    MPI_Finalize();
    return 0;
}
Code Example #6
File: DeviceSelectMPI.cpp  Project: lssfau/walberla
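// Selects a CUDA device for each process based on its rank within the node-local
// communicator returned by MPI_Comm_split_type.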
void selectDeviceBasedOnMpiRank()
{
#ifdef WALBERLA_BUILD_WITH_MPI
   int deviceCount;
   WALBERLA_CUDA_CHECK( cudaGetDeviceCount( &deviceCount ));
   WALBERLA_LOG_INFO_ON_ROOT( "Selecting CUDA device depending on MPI Rank" );

   MPI_Info info;
   MPI_Info_create( &info );
   MPI_Comm newCommunicator;
   MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, info, &newCommunicator );

   int processesOnNode;
   int rankOnNode;
   MPI_Comm_size( newCommunicator, &processesOnNode );
   MPI_Comm_rank( newCommunicator, &rankOnNode );

   if ( deviceCount == processesOnNode )
   {
      WALBERLA_CUDA_CHECK( cudaSetDevice( rankOnNode ));
   }
   else if ( deviceCount > processesOnNode )
   {
      WALBERLA_LOG_WARNING( "Not using all available GPUs on node. Processes on node "
                               << processesOnNode << " available GPUs on node " << deviceCount );
      WALBERLA_CUDA_CHECK( cudaSetDevice( rankOnNode ));
   }
   else
   {
      WALBERLA_LOG_WARNING( "Too many processes started per node - should be one per GPU. Number of processes per node "
                               << processesOnNode << ", available GPUs on node " << deviceCount );
      WALBERLA_CUDA_CHECK( cudaSetDevice( rankOnNode % deviceCount ));
   }
#endif
}
Code Example #7
File: test-mpi.c  Project: mihaic/icc-travis
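/* Smoke test: initializes MPI with threads, verifies an allreduce, creates a
   node-local communicator, and allocates/frees a shared-memory window on it. */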
int main(int argc, char *argv[])
{
    int requested=MPI_THREAD_MULTIPLE, provided;
    MPI_Init_thread(&argc, &argv, requested, &provided);
    if (provided<requested) MPI_Abort(MPI_COMM_WORLD, 1);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    int sum = 0;
    MPI_Allreduce(&rank, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    if (sum != (size*(size-1)/2) ) MPI_Abort(MPI_COMM_WORLD,2);

    MPI_Comm nodecomm = MPI_COMM_NULL;
    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &nodecomm);

    int bytes = 1024;
    MPI_Win nodewin = MPI_WIN_NULL;
    void * nodeptr = NULL;
    MPI_Win_allocate_shared((MPI_Aint)bytes, 1, MPI_INFO_NULL, nodecomm, &nodeptr, &nodewin);

    int noderank;
    MPI_Comm_rank(nodecomm, &noderank);
    if (noderank==0) memset(nodeptr,1,(size_t)bytes);

    MPI_Win_free(&nodewin);
    MPI_Comm_free(&nodecomm);

    if (rank==0) printf("Success\n");

    MPI_Finalize();
    return 0;
}
Code Example #8
File: mpi_shmem.hpp  Project: hpc4cmb/toast
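            // Constructor of toast's mpi_shmem helper: splits the given communicator
            // into per-node (shared-memory) groups and caches the node rank and size.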
            mpi_shmem ( MPI_Comm comm=MPI_COMM_WORLD ) : comm_( comm ) {

                // Split the provided communicator into groups that share
                // memory (are on the same node).

                if ( MPI_Comm_split_type( comm, MPI_COMM_TYPE_SHARED, 0,
                                          MPI_INFO_NULL, &shmcomm_ ) )
                    throw std::runtime_error( "Failed to split communicator by node." );

                if ( MPI_Comm_size( shmcomm_, &ntasks_ ) )
                    throw std::runtime_error( "Failed to get node communicator size" );

                if ( MPI_Comm_rank( shmcomm_, &rank_ ) )
                    throw std::runtime_error( "Failed to get node communicator rank" );
            }
Code Example #9
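/* Allocates a noncontiguous shared-memory window on a node communicator and uses
   MPI_Win_shared_query to print every rank's base pointer, size, and displacement
   unit. */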
int main(int argc, char * argv[])
{
    MPI_Init(&argc, &argv);

    int wrank, wsize;
    MPI_Comm_rank(MPI_COMM_WORLD, &wrank);
    MPI_Comm_size(MPI_COMM_WORLD, &wsize);

    int nrank, nsize;
    MPI_Comm MPI_COMM_NODE;
    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0 /* key */, MPI_INFO_NULL, &MPI_COMM_NODE);
    MPI_Comm_rank(MPI_COMM_NODE, &nrank);
    MPI_Comm_size(MPI_COMM_NODE, &nsize);

    int *   shptr = NULL;
    MPI_Win shwin;
    MPI_Info win_info;
    MPI_Info_create(&win_info);
    MPI_Info_set(win_info, "alloc_shared_noncontig", "true");
    MPI_Win_allocate_shared(sizeof(int), sizeof(int), win_info, MPI_COMM_NODE, &shptr, &shwin);
    MPI_Info_free(&win_info);

    MPI_Win_lock_all(0 /* assertion */, shwin);

    MPI_Win_sync(shwin);
    MPI_Barrier(MPI_COMM_NODE);

    MPI_Aint rsize[nsize];
    int rdisp[nsize];
    int * rptr[nsize];
    for (int i=0; i<nsize; i++) {
        MPI_Win_shared_query(shwin, i, &(rsize[i]), &(rdisp[i]), &(rptr[i]));
        printf("rank=%d target=%d rptr=%p rsize=%zu rdisp=%d \n", nrank, i, rptr[i], (size_t)rsize[i], rdisp[i]);
    }

    MPI_Win_unlock_all(shwin);

    MPI_Win_free(&shwin);

    MPI_Comm_free(&MPI_COMM_NODE);
    MPI_Finalize();

    return 0;
}
Code Example #10
File: stencil_mpi_shmem.c  Project: paulie88/skola
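/* 2D heat-diffusion stencil in which all ranks must share one memory domain: halo
   exchange reads directly from the neighbors' segments of a single
   MPI_Win_allocate_shared window, synchronized with MPI_Win_sync plus a barrier.
   The helpers setup, init_sources, update_grid, printarr_par and the ind() macro
   are defined elsewhere in the original file. */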
int main(int argc, char **argv)
{
    int r,p;
    int n, energy, niters, px, py;

    int rx, ry;
    int north, south, west, east;
    int bx, by, offx, offy;

    /* three heat sources */
    const int nsources = 3;
    int sources[nsources][2];
    int locnsources;             /* number of sources in my area */
    int locsources[nsources][2]; /* sources local to my rank */

    double t1, t2;

    int iter, i, j;

    double heat, rheat;

    int final_flag;

    /* initialize MPI environment */
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &r);
    MPI_Comm_size(MPI_COMM_WORLD, &p);

    /* create shared memory communicator */
    MPI_Comm shmcomm;
    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmcomm);

    int sr, sp; // rank and size in shmem comm
    MPI_Comm_size(shmcomm, &sp);
    MPI_Comm_rank(shmcomm, &sr);

    // this code works only on comm world!
    if(sp != p) MPI_Abort(MPI_COMM_WORLD, 1);

    /* argument checking and setting */
    setup(r, p, argc, argv,
          &n, &energy, &niters, &px, &py, &final_flag);

    if (final_flag == 1) {
        MPI_Finalize();
        exit(0);
    }

    /* determine my coordinates (x,y) -- r=x*a+y in the 2d processor array */
    rx = r % px;
    ry = r / px;

    /* determine my four neighbors */
    north = (ry - 1) * px + rx; if (ry-1 < 0)   north = MPI_PROC_NULL;
    south = (ry + 1) * px + rx; if (ry+1 >= py) south = MPI_PROC_NULL;
    west = ry * px + rx - 1;    if (rx-1 < 0)   west = MPI_PROC_NULL;
    east = ry * px + rx + 1;    if (rx+1 >= px) east = MPI_PROC_NULL;

    /* decompose the domain */
    bx = n / px;    /* block size in x */
    by = n / py;    /* block size in y */
    offx = rx * bx; /* offset in x */
    offy = ry * by; /* offset in y */

    /* printf("%i (%i,%i) - w: %i, e: %i, n: %i, s: %i\n", r, ry,rx,west,east,north,south); */

    int size = (bx+2)*(by+2); /* process-local grid (including halos (thus +2)) */
    double *mem;
    MPI_Win win;
    MPI_Win_allocate_shared(2*size*sizeof(double), 1, MPI_INFO_NULL, shmcomm, &mem, &win);

    double *tmp;
    double *anew=mem; /* each rank's offset */
    double *aold=mem+size; /* second half is aold! */

    double *northptr, *southptr, *eastptr, *westptr;
    double *northptr2, *southptr2, *eastptr2, *westptr2;
    MPI_Aint sz;
    int dsp_unit;
    /* locate the shared memory region for each neighbor */
    MPI_Win_shared_query(win, north, &sz, &dsp_unit, &northptr);
    MPI_Win_shared_query(win, south, &sz, &dsp_unit, &southptr);
    MPI_Win_shared_query(win, east, &sz, &dsp_unit, &eastptr);
    MPI_Win_shared_query(win, west, &sz, &dsp_unit, &westptr);
    northptr2 = northptr+size;
    southptr2 = southptr+size;
    eastptr2 = eastptr+size;
    westptr2 = westptr+size;

    /* initialize three heat sources */
    init_sources(bx, by, offx, offy, n,
                 nsources, sources, &locnsources, locsources);

    t1 = MPI_Wtime(); /* take time */

    MPI_Win_lock_all(0, win);    
    for (iter = 0; iter < niters; ++iter) {
        /* refresh heat sources */
        for (i = 0; i < locnsources; ++i) {
            aold[ind(locsources[i][0],locsources[i][1])] += energy; /* heat source */
        }

	MPI_Win_sync(win);
	MPI_Barrier(shmcomm);

	/* exchange data with neighbors */
	if(north != MPI_PROC_NULL) {
	  for(i=0; i<bx; ++i) aold[ind(i+1,0)] = northptr2[ind(i+1,by)]; /* pack loop - last valid region */
	}
	if(south != MPI_PROC_NULL) {
	  for(i=0; i<bx; ++i) aold[ind(i+1,by+1)] = southptr2[ind(i+1,1)]; /* pack loop */
	}
	if(east != MPI_PROC_NULL) {
	  for(i=0; i<by; ++i) aold[ind(bx+1,i+1)] = eastptr2[ind(1,i+1)]; /* pack loop */
	}
	if(west != MPI_PROC_NULL) {
	  for(i=0; i<by; ++i) aold[ind(0,i+1)] = westptr2[ind(bx,i+1)]; /* pack loop */
	}

        /* update grid points */
        update_grid(bx, by, aold, anew, &heat);

        /* swap working arrays */
        tmp = anew; anew = aold; aold = tmp;

        /* optional - print image */
        if (iter == niters-1) printarr_par(iter, anew, n, px, py, rx, ry, bx, by, offx, offy, shmcomm);
    }

    MPI_Win_unlock_all(win);
    t2 = MPI_Wtime();

    /* get final heat in the system */
    MPI_Allreduce(&heat, &rheat, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    if (!r) printf("[%i] last heat: %f time: %f\n", r, rheat, t2-t1);

    /* free working arrays and communication buffers */
    MPI_Win_free(&win);
    MPI_Comm_free(&shmcomm);

    MPI_Finalize();
}
Code Example #11
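/* MPICH Fortran binding: thin wrapper that converts the Fortran handles and
   integer arguments and forwards them to the C MPI_Comm_split_type. */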
FORT_DLL_SPEC void FORT_CALL mpi_comm_split_type_ ( MPI_Fint *v1, MPI_Fint *v2, MPI_Fint *v3, MPI_Fint *v4, MPI_Fint *v5, MPI_Fint *ierr ){
    *ierr = MPI_Comm_split_type( (MPI_Comm)(*v1), (int)*v2, (int)*v3, (MPI_Info)(*v4), (MPI_Comm *)(v5) );
}
Code Example #12
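/* RMA test (4 processes, 2 per node): checks that MPI_Win_flush completes puts at
   the target by putting to one process and then reading the result back through
   another process that shares memory with it. BUF_CNT, ITER, local_buf, check_buf,
   and verbose come from the original test harness. */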
int main(int argc, char *argv[])
{
    int rank, nproc, i, x;
    int errors = 0, all_errors = 0;
    MPI_Win win = MPI_WIN_NULL;

    MPI_Comm shm_comm = MPI_COMM_NULL;
    int shm_nproc, shm_rank;
    double **shm_bases = NULL, *my_base;
    MPI_Win shm_win = MPI_WIN_NULL;
    MPI_Group shm_group = MPI_GROUP_NULL, world_group = MPI_GROUP_NULL;
    int *shm_ranks = NULL, *shm_ranks_in_world = NULL;
    MPI_Aint get_target_base_offsets = 0;

    int win_size = sizeof(double) * BUF_CNT;
    int new_win_size = win_size;
    int win_unit = sizeof(double);
    int shm_root_rank_in_world;
    int origin = -1, put_target, get_target;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
    MPI_Comm_group(MPI_COMM_WORLD, &world_group);

    if (nproc != 4) {
        if (rank == 0)
            printf("Error: must be run with four processes\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL, &shm_comm);
    MPI_Comm_rank(shm_comm, &shm_rank);
    MPI_Comm_size(shm_comm, &shm_nproc);
    MPI_Comm_group(shm_comm, &shm_group);

    /* Platform does not support shared memory or wrong host file, just return. */
    if (shm_nproc != 2) {
        goto exit;
    }

    shm_bases = (double **) calloc(shm_nproc, sizeof(double *));
    shm_ranks = (int *) calloc(shm_nproc, sizeof(int));
    shm_ranks_in_world = (int *) calloc(shm_nproc, sizeof(int));

    if (shm_rank == 0)
        shm_root_rank_in_world = rank;
    MPI_Bcast(&shm_root_rank_in_world, 1, MPI_INT, 0, shm_comm);

    /* Identify ranks of target processes which are located on node 0 */
    if (rank == 0) {
        for (i = 0; i < shm_nproc; i++) {
            shm_ranks[i] = i;
        }
        MPI_Group_translate_ranks(shm_group, shm_nproc, shm_ranks, world_group, shm_ranks_in_world);
    }
    MPI_Bcast(shm_ranks_in_world, shm_nproc, MPI_INT, 0, MPI_COMM_WORLD);

    put_target = shm_ranks_in_world[shm_nproc - 1];
    get_target = shm_ranks_in_world[0];

    /* Identify the rank of origin process which are located on node 1 */
    if (shm_root_rank_in_world == 1 && shm_rank == 0) {
        origin = rank;
        if (verbose) {
            printf("----   I am origin = %d, get_target = %d, put_target = %d\n",
                   origin, get_target, put_target);
        }
    }

    /* Allocate shared memory among local processes */
    MPI_Win_allocate_shared(win_size, win_unit, MPI_INFO_NULL, shm_comm, &my_base, &shm_win);

    if (shm_root_rank_in_world == 0 && verbose) {
        MPI_Aint size;
        int disp_unit;
        for (i = 0; i < shm_nproc; i++) {
            MPI_Win_shared_query(shm_win, i, &size, &disp_unit, &shm_bases[i]);
            printf("%d --    shared query: base[%d]=%p, size %zd, "
                   "unit %d\n", rank, i, shm_bases[i], size, disp_unit);
        }
    }

    /* Get offset of put target(1) on get target(0) */
    get_target_base_offsets = (shm_nproc - 1) * win_size / win_unit;

    if (origin == rank && verbose)
        printf("%d --    base_offset of put_target %d on get_target %d: %zd\n",
               rank, put_target, get_target, get_target_base_offsets);

    /* Create using MPI_Win_create(). Note that new window size of get_target(0)
     * is equal to the total size of shm segments on this node, thus get_target
     * process can read the byte located on put_target process.*/
    for (i = 0; i < BUF_CNT; i++) {
        local_buf[i] = (i + 1) * 1.0;
        my_base[i] = 0.0;
    }

    if (get_target == rank)
        new_win_size = win_size * shm_nproc;

    MPI_Win_create(my_base, new_win_size, win_unit, MPI_INFO_NULL, MPI_COMM_WORLD, &win);

    if (verbose)
        printf("%d --    new window my_base %p, size %d\n", rank, my_base, new_win_size);

    MPI_Barrier(MPI_COMM_WORLD);

    /* Check if flush guarantees the completion of put operations on target side.
     *
     * P exclusively locks 2 processes whose windows are shared with each other.
     * P first put and flush to a process, then get the updated data from another process.
     * If flush returns before operations are done on the target side, the data may be
     * incorrect.*/
    for (x = 0; x < ITER; x++) {
        for (i = 0; i < BUF_CNT; i++) {
            local_buf[i] += x;
            check_buf[i] = 0;
        }

        if (rank == origin) {
            MPI_Win_lock(MPI_LOCK_EXCLUSIVE, put_target, 0, win);
            MPI_Win_lock(MPI_LOCK_EXCLUSIVE, get_target, 0, win);

            for (i = 0; i < BUF_CNT; i++) {
                MPI_Put(&local_buf[i], 1, MPI_DOUBLE, put_target, i, 1, MPI_DOUBLE, win);
            }
            MPI_Win_flush(put_target, win);

            MPI_Get(check_buf, BUF_CNT, MPI_DOUBLE, get_target,
                    get_target_base_offsets, BUF_CNT, MPI_DOUBLE, win);
            MPI_Win_flush(get_target, win);

            for (i = 0; i < BUF_CNT; i++) {
                if (check_buf[i] != local_buf[i]) {
                    printf("%d(iter %d) - Got check_buf[%d] = %.1lf, expected %.1lf\n",
                           rank, x, i, check_buf[i], local_buf[i]);
                    errors++;
                }
            }

            MPI_Win_unlock(put_target, win);
            MPI_Win_unlock(get_target, win);
        }
    }

    MPI_Barrier(MPI_COMM_WORLD);

    MPI_Reduce(&errors, &all_errors, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);

  exit:

    if (rank == 0 && all_errors == 0)
        printf(" No Errors\n");

    if (shm_bases)
        free(shm_bases);
    if (shm_ranks)
        free(shm_ranks);
    if (shm_ranks_in_world)
        free(shm_ranks_in_world);

    if (shm_win != MPI_WIN_NULL)
        MPI_Win_free(&shm_win);

    if (win != MPI_WIN_NULL)
        MPI_Win_free(&win);

    if (shm_comm != MPI_COMM_NULL)
        MPI_Comm_free(&shm_comm);

    if (shm_group != MPI_GROUP_NULL)
        MPI_Group_free(&shm_group);

    if (world_group != MPI_GROUP_NULL)
        MPI_Group_free(&world_group);

    MPI_Finalize();

    return 0;
}
Code Example #13
File: win_info.c  Project: NexMirror/MPICH
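/* MPICH window-info test: sets and reads back window info keys (no_locks, an
   invalid key, accumulate_ordering, accumulate_ops, same_size, same_disp_unit,
   and alloc_shared_noncontig on a shared-memory window). rank, nproc, and the
   helpers win_info_set/check_win_info_get are defined elsewhere in the file. */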
int main(int argc, char **argv)
{
    MPI_Info info_in, info_out;
    int errors = 0, all_errors = 0;
    MPI_Win win;
    void *base;
    char invalid_key[] = "invalid_test_key";
    char buf[MPI_MAX_INFO_VAL];
    int flag;
    MPI_Comm shm_comm = MPI_COMM_NULL;

    MPI_Init(&argc, &argv);

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);

    /* Test#1: setting a valid key at window-create time */

    MPI_Info_create(&info_in);
    MPI_Info_set(info_in, "no_locks", "true");

    MPI_Win_allocate(sizeof(int), sizeof(int), info_in, MPI_COMM_WORLD, &base, &win);
    errors += check_win_info_get(win, "no_locks", "true");

    MPI_Info_free(&info_in);

    /* We create a new window with no info argument for the next test to ensure that we have the
     * default settings */

    MPI_Win_free(&win);
    MPI_Win_allocate(sizeof(int), sizeof(int), MPI_INFO_NULL, MPI_COMM_WORLD, &base, &win);

    /* Test#2: setting and getting invalid key */

    win_info_set(win, invalid_key, "true");

    MPI_Win_get_info(win, &info_out);
    MPI_Info_get(info_out, invalid_key, MPI_MAX_INFO_VAL, buf, &flag);
#ifndef USE_STRICT_MPI
    /* Check if our invalid key was ignored.  Note, this check's MPICH's
     * behavior, but this behavior may not be required for a standard
     * conforming MPI implementation. */
    if (flag) {
        printf("%d: %s was not ignored\n", rank, invalid_key);
        errors++;
    }
#endif

    MPI_Info_free(&info_out);


    /* Test#3: setting info key "no_lock" (no default value) */
    win_info_set(win, "no_locks", "false");
    errors += check_win_info_get(win, "no_locks", "false");

    win_info_set(win, "no_locks", "true");
    errors += check_win_info_get(win, "no_locks", "true");


    /* Test#4: getting/setting "accumulate_ordering" */
    /*   #4.1: is the default "rar,raw,war,waw" as stated in the standard? */
    errors += check_win_info_get(win, "accumulate_ordering", "rar,raw,war,waw");

    /*   #4.2: setting "accumulate_ordering" to "none" */
    win_info_set(win, "accumulate_ordering", "none");
    errors += check_win_info_get(win, "accumulate_ordering", "none");

    /*   #4.3: setting "accumulate_ordering" to "rar,waw" */
    win_info_set(win, "accumulate_ordering", "rar,waw");
    errors += check_win_info_get(win, "accumulate_ordering", "rar,waw");


    /* Test#5: getting/setting "accumulate_ops" */
    /*   #5.1: is the default "same_op_no_op" as stated in the standard? */
    errors += check_win_info_get(win, "accumulate_ops", "same_op_no_op");

    /*   #5.2: setting "accumulate_ops" to "same_op" */
    win_info_set(win, "accumulate_ops", "same_op");
    errors += check_win_info_get(win, "accumulate_ops", "same_op");


    /* Test#6: setting "same_size" (no default value) */
    win_info_set(win, "same_size", "false");
    errors += check_win_info_get(win, "same_size", "false");

    win_info_set(win, "same_size", "true");
    errors += check_win_info_get(win, "same_size", "true");


    /* Test#7: setting "same_disp_unit" (no default value) */
    win_info_set(win, "same_disp_unit", "false");
    errors += check_win_info_get(win, "same_disp_unit", "false");

    win_info_set(win, "same_disp_unit", "true");
    errors += check_win_info_get(win, "same_disp_unit", "true");

    /* TODO: check alloc_shm as implementation-specific test */

    /* Test#8: setting "alloc_shared_noncontig" (no default value) in shared window. */
    MPI_Win_free(&win);

    /*   #8.1: setting at window allocation */
    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL, &shm_comm);

    MPI_Info_create(&info_in);
    MPI_Info_set(info_in, "alloc_shared_noncontig", "true");
    MPI_Win_allocate_shared(sizeof(int), sizeof(int), info_in, shm_comm, &base, &win);
    errors += check_win_info_get(win, "alloc_shared_noncontig", "true");
    MPI_Info_free(&info_in);

    /*   #8.2: setting info */
    win_info_set(win, "alloc_shared_noncontig", "false");
    errors += check_win_info_get(win, "alloc_shared_noncontig", "false");
    MPI_Comm_free(&shm_comm);

    MPI_Win_free(&win);

    MPI_Reduce(&errors, &all_errors, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);

    if (rank == 0 && all_errors == 0)
        printf(" No Errors\n");

    MPI_Finalize();

    return 0;
}
Code Example #14
File: win_shared.c  Project: glesserd/simgrid
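/* Shared-memory window test: each process writes its own block, then reads and
   verifies every other block through the base pointer returned by
   MPI_Win_shared_query with MPI_PROC_NULL. ELEM_PER_PROC and verbose are defined
   elsewhere in the original file. */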
int main(int argc, char **argv) {
    int      i, j, rank, nproc;
    int      shm_rank, shm_nproc;
    MPI_Aint size;
    int      errors = 0, all_errors = 0;
    int     *base, *my_base;
    int      disp_unit;
    MPI_Win  shm_win;
    MPI_Comm shm_comm;

    MPI_Init(&argc, &argv);

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);

    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL, &shm_comm);

    MPI_Comm_rank(shm_comm, &shm_rank);
    MPI_Comm_size(shm_comm, &shm_nproc);

    /* Allocate ELEM_PER_PROC integers for each process */
    MPI_Win_allocate_shared(sizeof(int)*ELEM_PER_PROC, sizeof(int), MPI_INFO_NULL,
                            shm_comm, &my_base, &shm_win);

    /* Locate absolute base */
    MPI_Win_shared_query(shm_win, MPI_PROC_NULL, &size, &disp_unit, &base);

    /* make sure the query returned the right values */
    if (disp_unit != sizeof(int))
        errors++;
    if (size != ELEM_PER_PROC * sizeof(int))
        errors++;
    if ((shm_rank == 0) && (base != my_base))
        errors++;
    if (shm_rank && (base == my_base))
        errors++;

    if (verbose) printf("%d -- size = %d baseptr = %p my_baseptr = %p\n", shm_rank,
                            (int) size, (void*) base, (void*) my_base);

    MPI_Win_lock_all(MPI_MODE_NOCHECK, shm_win);

    /* Write to all my data */
    for (i = 0; i < ELEM_PER_PROC; i++) {
        my_base[i] = i;
    }

    MPI_Win_sync(shm_win);
    MPI_Barrier(shm_comm);
    MPI_Win_sync(shm_win);

    /* Read and verify everyone's data */
    for (i = 0; i < shm_nproc; i++) {
        for (j = 0; j < ELEM_PER_PROC; j++) {
            if ( base[i*ELEM_PER_PROC + j] != j ) {
                errors++;
                printf("%d -- Got %d at rank %d index %d, expected %d\n", shm_rank,
                       base[i*ELEM_PER_PROC + j], i, j, j);
            }
        }
    }

    MPI_Win_unlock_all(shm_win);
    MPI_Win_free(&shm_win);
    MPI_Comm_free(&shm_comm);

    MPI_Reduce(&errors, &all_errors, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);

    if (rank == 0 && all_errors == 0)
        printf(" No Errors\n");

    MPI_Finalize();

    return 0;
}
Code Example #15
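/* Combines MPI_Win_allocate_shared with a regular MPI_Win_create window over the
   same buffers: puts go through the created window and the result is verified by
   reading the target's memory through the shared window. */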
int main(int argc, char **argv)
{
    int i, rank, nproc;
    int shm_rank, shm_nproc;
    MPI_Aint size;
    int errors = 0, all_errors = 0;
    int **bases = NULL, *my_base = NULL;
    int disp_unit;
    MPI_Win shm_win = MPI_WIN_NULL, win = MPI_WIN_NULL;
    MPI_Comm shm_comm = MPI_COMM_NULL;
    MPI_Group shm_group = MPI_GROUP_NULL, world_group = MPI_GROUP_NULL;
    int dst_shm_rank, dst_world_rank;
    MPI_Info create_info = MPI_INFO_NULL;

    MPI_Init(&argc, &argv);

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);

    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL, &shm_comm);

    MPI_Comm_rank(shm_comm, &shm_rank);
    MPI_Comm_size(shm_comm, &shm_nproc);

    /* Platform does not support shared memory, just return. */
    if (shm_nproc < 2) {
        goto exit;
    }

    /* Specify the last process in the node as the target process */
    dst_shm_rank = shm_nproc - 1;
    MPI_Comm_group(shm_comm, &shm_group);
    MPI_Comm_group(MPI_COMM_WORLD, &world_group);
    MPI_Group_translate_ranks(shm_group, 1, &dst_shm_rank, world_group, &dst_world_rank);

    bases = calloc(shm_nproc, sizeof(int *));

    /* Allocate shm window among local processes, then create a global window with
     * those shm window buffers */
    MPI_Win_allocate_shared(sizeof(int) * ELEM_PER_PROC, sizeof(int), MPI_INFO_NULL,
                            shm_comm, &my_base, &shm_win);
    if (verbose)
        printf("%d -- allocate shared: my_base = %p, absolute base\n", shm_rank, my_base);

    for (i = 0; i < shm_nproc; i++) {
        MPI_Win_shared_query(shm_win, i, &size, &disp_unit, &bases[i]);
        if (verbose)
            printf("%d --    shared query: base[%d]=%p, size %ld, unit %d\n",
                   shm_rank, i, bases[i], size, disp_unit);
    }

#ifdef USE_INFO_ALLOC_SHM
    MPI_Info_create(&create_info);
    MPI_Info_set(create_info, "alloc_shm", "true");
#else
    create_info = MPI_INFO_NULL;
#endif

    MPI_Win_create(my_base, sizeof(int) * ELEM_PER_PROC, sizeof(int), create_info, MPI_COMM_WORLD,
                   &win);

    /* Reset data */
    for (i = 0; i < ELEM_PER_PROC; i++) {
        my_base[i] = 0;
        local_buf[i] = i + 1;
    }

    /* Do RMA through global window, then check value through shared window */
    MPI_Win_lock_all(0, win);
    MPI_Win_lock_all(0, shm_win);

    if (shm_rank == 0) {
        MPI_Put(&local_buf[0], 1, MPI_INT, dst_world_rank, 0, 1, MPI_INT, win);
        MPI_Put(&local_buf[ELEM_PER_PROC - 1], 1, MPI_INT, dst_world_rank, ELEM_PER_PROC - 1, 1,
                MPI_INT, win);
        MPI_Win_flush(dst_world_rank, win);
    }

    MPI_Win_sync(shm_win);
    MPI_Barrier(shm_comm);
    MPI_Win_sync(shm_win);

    if (bases[dst_shm_rank][0] != local_buf[0]) {
        errors++;
        printf("%d -- Got %d at rank %d index %d, expected %d\n", rank,
               bases[dst_shm_rank][0], dst_shm_rank, 0, local_buf[0]);
    }
    if (bases[dst_shm_rank][ELEM_PER_PROC - 1] != local_buf[ELEM_PER_PROC - 1]) {
        errors++;
        printf("%d -- Got %d at rank %d index %d, expected %d\n", rank,
               bases[dst_shm_rank][ELEM_PER_PROC - 1], dst_shm_rank,
               ELEM_PER_PROC - 1, local_buf[ELEM_PER_PROC - 1]);
    }

    MPI_Win_unlock_all(shm_win);
    MPI_Win_unlock_all(win);

    MPI_Reduce(&errors, &all_errors, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);

    MPI_Win_free(&win);
    MPI_Win_free(&shm_win);

  exit:
    if (rank == 0 && all_errors == 0)
        printf(" No Errors\n");

    if (create_info != MPI_INFO_NULL)
        MPI_Info_free(&create_info);
    if (shm_comm != MPI_COMM_NULL)
        MPI_Comm_free(&shm_comm);
    if (shm_group != MPI_GROUP_NULL)
        MPI_Group_free(&shm_group);
    if (world_group != MPI_GROUP_NULL)
        MPI_Group_free(&world_group);

    MPI_Finalize();

    if (bases)
        free(bases);

    return 0;
}
Code Example #16
File: dart_team_group.c  Project: dash-project/dash
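/* DASH runtime: creates a sub-team communicator from a group, attaches a dynamic
   RMA window to it, derives a per-node shared-memory communicator, and builds a
   table mapping global unit ids to ranks within the shared-memory node. */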
/**
 *  TODO: Differentiate units belonging to team_id and that not
 *  belonging to team_id
 *  within the function or outside it?
 *  FIX: Outside it.
 *
 *  The teamid stands for a superteam related to the new generated
 *  newteam.
 */
dart_ret_t dart_team_create(
  dart_team_t teamid,
  const dart_group_t* group,
  dart_team_t *newteam)
{
  MPI_Comm comm;
  MPI_Comm subcomm;
  MPI_Win win;
  uint16_t index, unique_id;
  size_t size;
  dart_team_t max_teamid = -1;
  dart_unit_t sub_unit, unit;

  dart_myid (&unit);
  dart_size (&size);
  dart_team_myid (teamid, &sub_unit);

  int result = dart_adapt_teamlist_convert (teamid, &unique_id);
  if (result == -1) {
    return DART_ERR_INVAL;
  }
  comm = dart_teams[unique_id];
  subcomm = MPI_COMM_NULL;

  MPI_Comm_create (comm, group -> mpi_group, &subcomm);

  *newteam = DART_TEAM_NULL;

  /* Get the maximum next_availteamid among all the units belonging to
   * the parent team specified by 'teamid'. */
  MPI_Allreduce(
    &dart_next_availteamid,
    &max_teamid,
    1,
    MPI_INT32_T,
    MPI_MAX,
    comm);
  dart_next_availteamid = max_teamid + 1;

  if (subcomm != MPI_COMM_NULL) {
    int result = dart_adapt_teamlist_alloc(max_teamid, &index);
    if (result == -1) {
      return DART_ERR_OTHER;
    }
    /* max_teamid is thought to be the new created team ID. */
    *newteam = max_teamid;
    dart_teams[index] = subcomm;
    MPI_Win_create_dynamic(MPI_INFO_NULL, subcomm, &win);
    dart_win_lists[index] = win;
  }
#if 0
  /* Another way of generating the available teamID for the newly created team. */
  if (subcomm != MPI_COMM_NULL)
  {
    /* Get the maximum next_availteamid among all the units belonging to the
     * created sub-communicator. */
    MPI_Allreduce (&next_availteamid, &max_teamid, 1, MPI_INT, MPI_MAX, subcomm);
    int result = dart_adapt_teamlist_alloc (max_teamid, &index);

    if (result == -1)
    {
      return DART_ERR_OTHER;
    }

    *newteam = max_teamid;
    teams[index] = subcomm;
    MPI_Comm_rank (subcomm, &rank);

    if (rank == 0)
    {
      root = sub_unit;
      if (sub_unit != 0)
      {
        MPI_Send (&root, 1, MPI_INT, 0, 0, comm);
      }
    }

    next_availteamid = max_teamid + 1;
  }

  if (sub_unit == 0)
  {
    if (root == -1)
    {
      MPI_Recv (&root, 1, MPI_INT, MPI_ANY_SOURCE, 0, comm, MPI_STATUS_IGNORE);
    }
  }

  MPI_Bcast (&root, 1, MPI_INT, 0, comm);

  /* Broadcast the calculated max_teamid to all the units not belonging to the
   * sub-communicator. */
  MPI_Bcast (&max_teamid, 1, MPI_INT, root, comm);
  if (subcomm == MPI_COMM_NULL)
  {
    /* 'Next_availteamid' is changed iff it is smaller than 'max_teamid + 1' */
    if (max_teamid + 1 > next_availteamid)
    {
      next_availteamid = max_teamid + 1;
    }
  }
#endif

  if (subcomm != MPI_COMM_NULL) {
#if !defined(DART_MPI_DISABLE_SHARED_WINDOWS)
    int    i;
    size_t n;

    MPI_Comm sharedmem_comm;
    MPI_Group sharedmem_group, group_all;
    MPI_Comm_split_type(
      subcomm,
      MPI_COMM_TYPE_SHARED,
      1,
      MPI_INFO_NULL,
      &sharedmem_comm);
    dart_sharedmem_comm_list[index] = sharedmem_comm;
    if (sharedmem_comm != MPI_COMM_NULL) {
      MPI_Comm_size(
        sharedmem_comm,
        &(dart_sharedmemnode_size[index]));

  //    dart_unit_mapping[index] = (int*)malloc (
  //      dart_sharedmem_size[index] * sizeof (int));

      MPI_Comm_group(sharedmem_comm, &sharedmem_group);
      MPI_Comm_group(MPI_COMM_WORLD, &group_all);

      int* dart_unit_mapping = (int *)malloc (
        dart_sharedmemnode_size[index] * sizeof (int));
      int* sharedmem_ranks = (int*)malloc (
        dart_sharedmemnode_size[index] * sizeof (int));

      dart_sharedmem_table[index] = (int*)malloc(size * sizeof(int));

      for (i = 0; i < dart_sharedmemnode_size[index]; i++) {
        sharedmem_ranks[i] = i;
      }

  //    MPI_Group_translate_ranks (sharedmem_group, dart_sharedmem_size[index],
  //        sharedmem_ranks, group_all, dart_unit_mapping[index]);
      MPI_Group_translate_ranks(
        sharedmem_group,
        dart_sharedmemnode_size[index],
        sharedmem_ranks,
        group_all,
        dart_unit_mapping);

      for (n = 0; n < size; n++) {
        dart_sharedmem_table[index][n] = -1;
      }
      for (i = 0; i < dart_sharedmemnode_size[index]; i++) {
        dart_sharedmem_table[index][dart_unit_mapping[i]] = i;
      }
      free (sharedmem_ranks);
      free (dart_unit_mapping);
    }

#endif
    MPI_Win_lock_all(0, win);
    DART_LOG_DEBUG ("%2d: TEAMCREATE  - create team %d out of parent team %d",
           unit, *newteam, teamid);
  }
  return DART_OK;
}
Code Example #17
File: main.cpp  Project: BoxLib-Codes/BoxLib
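// Splits MPI_COMM_WORLD either manually into two teams (MPI_Group_incl +
// MPI_Comm_create, when MANUAL_SPLIT is defined) or by shared memory with
// MPI_Comm_split_type; only rank 0 backs the shared window with memory, rank 1
// writes the data, and every rank prints what it sees.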
int main(int argc, char* argv[])
{
    MPI_Init(&argc, &argv);

    int nprocs, rank;
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    MPI_Comm team_comm;
    MPI_Group team_grp;

#ifdef MANUAL_SPLIT

    int team_size = nprocs/2;  // There will be two teams.
    assert(team_size*2 == nprocs);

    MPI_Group grp;
    MPI_Comm_group(MPI_COMM_WORLD, &grp);

    int team_ranks[team_size];
    if (rank < team_size) {  // Team 0
	for (int i = 0; i < team_size; ++i) {
	    team_ranks[i] = i;
	}
    } else {                 // Team 1
	for (int i = 0; i < team_size; ++i) {
	    team_ranks[i] = i + team_size;
	}
    }

    MPI_Group_incl(grp, team_size, team_ranks, &team_grp);

    MPI_Comm_create(MPI_COMM_WORLD, team_grp, &team_comm);

#else

    int r = MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, 
                                MPI_INFO_NULL, &team_comm);

    if (r != MPI_SUCCESS) std::cout << "MPI_Comm_split_type failed" << std::endl;
#endif

    int real_team_size;
    int real_team_rank;
    MPI_Comm_size(team_comm, &real_team_size);
    MPI_Comm_rank(team_comm, &real_team_rank);

#if 0
    for (int i = 0; i < nprocs; ++i) {
        if ( i == rank) {
            std::cout << "rank " << rank << ", team_comm size " << real_team_size
		      << ", rank in team " << real_team_rank << std::endl;
        }
	MPI_Barrier(MPI_COMM_WORLD);
    }
#endif

    const int N = 8;
    const int NN = (rank == 0) ? N : 0;
    MPI_Win win_shared;
    double* p;
    MPI_Win_allocate_shared(NN*sizeof(double), sizeof(double),
			    MPI_INFO_NULL, team_comm, &p, &win_shared);

    if (rank != 0) {
	MPI_Aint sz;
	int disp;
	MPI_Win_shared_query(win_shared, MPI_PROC_NULL, &sz, &disp, &p);
    }

    std::atomic_thread_fence(std::memory_order_release);
    MPI_Barrier(team_comm);
    std::atomic_thread_fence(std::memory_order_acquire);

    if (rank == 1) {
	for (int i = 0; i < N; ++i) {
	    p[i] = (double) (i*(rank+1));
	}
    }

    std::atomic_thread_fence(std::memory_order_release);
    MPI_Barrier(team_comm);
    std::atomic_thread_fence(std::memory_order_acquire);    

    for (int i = 0; i < nprocs; ++i) {
        if ( i == rank) {
            std::cout << "rank " << rank << ", data =";
	    for (int j = 0; j < N; ++j) {
		std::cout << " " << p[j];
	    }
	    std::cout << std::endl;
        }
	MPI_Barrier(MPI_COMM_WORLD);
    }

    MPI_Win_free(&win_shared);
#ifdef MANUAL_SPLIT
    MPI_Group_free(&team_grp);
#endif
    MPI_Comm_free(&team_comm);
    MPI_Finalize();
}
Code Example #18
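/* Strided MPI_Get_accumulate test over a window allocated with
   MPI_Win_allocate_shared on a node communicator. XDIM, YDIM, SUB_XDIM, SUB_YDIM,
   ITERATIONS, and SQUELCH come from the original test harness. */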
int main(int argc, char **argv) {
    int rank, nranks, rank_world, nranks_world;
    int i, j, peer, bufsize, errors;
    double *win_buf, *src_buf, *dst_buf;
    MPI_Win buf_win;
    MPI_Comm shr_comm;

    MTest_Init(&argc, &argv);

    MPI_Comm_rank(MPI_COMM_WORLD, &rank_world);
    MPI_Comm_size(MPI_COMM_WORLD, &nranks_world);

    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank_world, MPI_INFO_NULL, &shr_comm);

    MPI_Comm_rank(shr_comm, &rank);
    MPI_Comm_size(shr_comm, &nranks);

    bufsize = XDIM * YDIM * sizeof(double);
    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &src_buf);
    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &dst_buf);

    MPI_Win_allocate_shared(bufsize, 1, MPI_INFO_NULL, shr_comm, &win_buf, &buf_win);

    MPI_Win_fence(0, buf_win);

    for (i = 0; i < XDIM*YDIM; i++) {
        *(win_buf + i) = -1.0;
        *(src_buf + i) =  1.0 + rank;
    }

    MPI_Win_fence(0, buf_win);

    peer = (rank+1) % nranks;

    /* Perform ITERATIONS strided accumulate operations */

    for (i = 0; i < ITERATIONS; i++) {
        int idx_rem[SUB_YDIM];
        int blk_len[SUB_YDIM];
        MPI_Datatype src_type, dst_type;

        for (j = 0; j < SUB_YDIM; j++) {
            idx_rem[j] = j*XDIM;
            blk_len[j] = SUB_XDIM;
        }

        MPI_Type_indexed(SUB_YDIM, blk_len, idx_rem, MPI_DOUBLE, &src_type);
        MPI_Type_indexed(SUB_YDIM, blk_len, idx_rem, MPI_DOUBLE, &dst_type);

        MPI_Type_commit(&src_type);
        MPI_Type_commit(&dst_type);

        /* PUT */
        MPI_Win_lock(MPI_LOCK_EXCLUSIVE, peer, 0, buf_win);
        MPI_Get_accumulate(src_buf, 1, src_type, dst_buf, 1, src_type, peer, 0,
                           1, dst_type, MPI_REPLACE, buf_win);
        MPI_Win_unlock(peer, buf_win);

        /* GET */
        MPI_Win_lock(MPI_LOCK_EXCLUSIVE, peer, 0, buf_win);
        MPI_Get_accumulate(src_buf, 1, src_type, dst_buf, 1, src_type, peer, 0,
                           1, dst_type, MPI_NO_OP, buf_win);
        MPI_Win_unlock(peer, buf_win);

        MPI_Type_free(&src_type);
        MPI_Type_free(&dst_type);
    }

    MPI_Barrier(MPI_COMM_WORLD);

    /* Verify that the results are correct */

    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, buf_win);
    errors = 0;
    for (i = 0; i < SUB_XDIM; i++) {
        for (j = 0; j < SUB_YDIM; j++) {
            const double actual   = *(win_buf + i + j*XDIM);
            const double expected = (1.0 + ((rank+nranks-1)%nranks));
            if (fabs(actual - expected) > 1.0e-10) {
                SQUELCH( printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
                                rank, j, i, expected, actual); );
                errors++;
                fflush(stdout);
            }
        }
Code Example #19
File: win_large_shm.c  Project: jeffhammond/mpich
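/* MPICH test: allocates large (1 GB) windows with MPI_Win_allocate and
   MPI_Win_allocate_shared, mixing equal, zero, and half-size allocations across
   ranks, both with and without the "alloc_shm" info hint. */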
int main(int argc, char **argv)
{
    int my_rank, shared_rank;
    void *mybase = NULL;
    MPI_Win win;
    MPI_Info win_info;
    MPI_Comm shared_comm;
    int i;
    int shm_win_size = 1024 * 1024 * 1024 * sizeof(char);       /* 1GB */

    MTest_Init(&argc, &argv);

    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    for (i = 0; i < 2; i++) {
        if (i == 0) {
            MPI_Info_create(&win_info);
            MPI_Info_set(win_info, (char *) "alloc_shm", (char *) "true");
        } else {
            win_info = MPI_INFO_NULL;
        }

        MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, my_rank, MPI_INFO_NULL,
                            &shared_comm);

        MPI_Comm_rank(shared_comm, &shared_rank);

        /* every processes allocate 1GB window memory */
        MPI_Win_allocate(shm_win_size, sizeof(char), win_info, MPI_COMM_WORLD, &mybase, &win);

        MPI_Win_free(&win);

        MPI_Win_allocate_shared(shm_win_size, sizeof(char), win_info, shared_comm, &mybase, &win);

        MPI_Win_free(&win);

        /* some processes allocate 1GB and some processes allocate zero bytes */
        if (my_rank % 2 == 0)
            MPI_Win_allocate(shm_win_size, sizeof(char), win_info, MPI_COMM_WORLD, &mybase, &win);
        else
            MPI_Win_allocate(0, sizeof(char), win_info, MPI_COMM_WORLD, &mybase, &win);

        MPI_Win_free(&win);

        if (shared_rank % 2 == 0)
            MPI_Win_allocate_shared(shm_win_size, sizeof(char), win_info, shared_comm, &mybase,
                                    &win);
        else
            MPI_Win_allocate_shared(0, sizeof(char), win_info, shared_comm, &mybase, &win);

        MPI_Win_free(&win);

        /* some processes allocate 1GB and some processes allocate smaller bytes */
        if (my_rank % 2 == 0)
            MPI_Win_allocate(shm_win_size, sizeof(char), win_info, MPI_COMM_WORLD, &mybase, &win);
        else
            MPI_Win_allocate(shm_win_size / 2, sizeof(char), win_info, MPI_COMM_WORLD, &mybase,
                             &win);

        MPI_Win_free(&win);

        /* some processes allocate 1GB and some processes allocate smaller bytes */
        if (shared_rank % 2 == 0)
            MPI_Win_allocate_shared(shm_win_size, sizeof(char), win_info, shared_comm, &mybase,
                                    &win);
        else
            MPI_Win_allocate_shared(shm_win_size / 2, sizeof(char), win_info, shared_comm, &mybase,
                                    &win);

        MPI_Win_free(&win);

        MPI_Comm_free(&shared_comm);

        if (i == 0)
            MPI_Info_free(&win_info);
    }

    MTest_Finalize(0);

    return 0;
}
Code Example #20
File: main.c  Project: sirianray/lblc
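/* Main driver that creates the node-local communicator shmcomm immediately after
   MPI_Init; shmcomm, numprocs, and myid are globals, and the solver routines
   (read_param, evol_Q, evol_f, ...) are defined elsewhere in the project. */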
int main(int argc, char *argv[]){
	MPI_Init(&argc, &argv);
        MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmcomm);
        MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
        MPI_Comm_rank(MPI_COMM_WORLD, &myid);

	double t_begin;
	int i, j, k, ii, id, id1;

	t_begin = MPI_Wtime();		// record the begining CPU time
	
	read_param();
	lattice_vec();
	allocate();
	p_allocate();

	if (flow_on!=0) {
		build_stream();
	}

	if (Q_on!=0) {
		build_neighbor();		
	}

	init_surf();
	add_patch();
	p_init();
	p_iden();
	init();

	if (flow_on!=0) cal_fequ(f);	
	if (Q_on!=0) cal_dQ();

	if (Q_on!=0 && flow_on!=0 && newrun_on!=0) {
		while(qconverge==0) {
			t_current++;
			for (ii=0; ii<n_evol_Q; ii++) {
                                cal_dQ();
                                evol_Q();
                        }
			if (t_current%t_print==0) monitor();
		}
		e_tot     =-1;
		k_eng     =-1;
		qconverge = 0;	
		uconverge = 0;
		t_current =-1;
	}

	if (Q_on!=0 && flow_on!=0) {
		cal_W();
		cal_stress();
		cal_sigma_p();
	}
        MPI_Barrier(MPI_COMM_WORLD);

	output1(1,'z',Nx/2,Ny/2);
	output3(1);
	if(myid==0) printf("Q initialized\n");
	MPI_Barrier(MPI_COMM_WORLD);

	while (t_current<t_max && uconverge*qconverge==0) {
		e_toto=e_tot;
		if (Q_on!=0 && qconverge==0) {
			if (flow_on!=0 && uconverge==0) cal_W();
			for (ii=0; ii<n_evol_Q; ii++) {
				cal_dQ();
				evol_Q();
			}			
		}

		if (flow_on!=0 && uconverge==0) {
			if (Q_on!=0 && qconverge==0) {
				cal_stress();
				cal_sigma_p();
			}
			evol_f(f,f2);
		}

		if (Q_on!=0 && qconverge==0) {
			if (flow_on!=0 && uconverge==0) cal_W();
			for (ii=0; ii<n_evol_Q; ii++) {
				cal_dQ();
				evol_Q();
			}			
		}

		if (flow_on!=0 && uconverge==0) {
			if (Q_on!=0 && qconverge==0) {
				cal_stress();
				cal_sigma_p();
			}
			evol_f(f2,f);
		}
		
		if (t_current%t_print==0) monitor();
		if (t_current%t_write==0) {
			output1(0,'z',Nx/2,Ny/2);
			output3(0);
			fflush(stdout);
		}
		t_current++;
	}
	
	
	output_time(t_begin);
	output1(0,'z',Nx/2,Ny/2);
	output3(0);
	
	write_restart();

	p_deallocate();
	deallocate();

	return 0;
}
Code Example #21
File: stencil.c  Project: elliottslaughter/Kernels
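/* Parallel Research Kernels MPI+SHM stencil: MPI_COMM_WORLD is first split into
   groups of group_size ranks, each group is then turned into a shared-memory
   communicator with MPI_Comm_split_type, and the in/out grids live in shared
   windows whose memory is provided only by each group's root. */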
int main(int argc, char ** argv) {

  int    Num_procs;       /* number of ranks                                     */
  int    Num_procsx,
         Num_procsy;      /* number of ranks in each coord direction             */
  int    Num_groupsx,
         Num_groupsy;     /* number of blocks in each coord direction            */
  int    my_group;        /* sequence number of shared memory block              */
  int    my_group_IDx,
         my_group_IDy;    /* coordinates of block within block grid              */
  int    group_size;      /* number of ranks in shared memory group              */
  int    group_sizex,
         group_sizey;     /* number of ranks in block in each coord direction    */
  int    my_ID;           /* MPI rank                                            */
  int    my_global_IDx,
         my_global_IDy;   /* coordinates of rank in overall rank grid            */
  int    my_local_IDx,
         my_local_IDy;    /* coordinates of rank within shared memory block      */
  int    right_nbr;       /* global rank of right neighboring tile               */
  int    left_nbr;        /* global rank of left neighboring tile                */
  int    top_nbr;         /* global rank of top neighboring tile                 */
  int    bottom_nbr;      /* global rank of bottom neighboring tile              */
  int    local_nbr[4];    /* list of synchronizing local neighbors               */
  int    num_local_nbrs;  /* number of synchronizing local neighbors             */
  int    dummy;
  DTYPE *top_buf_out;     /* communication buffer                                */
  DTYPE *top_buf_in;      /*       "         "                                   */
  DTYPE *bottom_buf_out;  /*       "         "                                   */
  DTYPE *bottom_buf_in;   /*       "         "                                   */
  DTYPE *right_buf_out;   /*       "         "                                   */
  DTYPE *right_buf_in;    /*       "         "                                   */
  DTYPE *left_buf_out;    /*       "         "                                   */
  DTYPE *left_buf_in;     /*       "         "                                   */
  int    root = 0;
  long   n, width, height;/* linear global and block grid dimension              */
  int    width_rank,
         height_rank;     /* linear local dimension                              */
  int    iter, leftover;  /* dummies                   */
  int    istart_rank,
         iend_rank;       /* bounds of grid tile assigned to calling rank        */
  int    jstart_rank,
         jend_rank;       /* bounds of grid tile assigned to calling rank        */
  int    istart, iend;    /* bounds of grid block containing tile                */
  int    jstart, jend;    /* bounds of grid block containing tile                */
  DTYPE  norm,            /* L1 norm of solution                                 */
         local_norm,      /* contribution of calling rank to L1 norm             */
         reference_norm;  /* value to be matched by computed norm                */
  DTYPE  f_active_points; /* interior of grid with respect to stencil            */
  DTYPE  flops;           /* floating point ops per iteration                    */
  int    iterations;      /* number of times to run the algorithm                */
  double local_stencil_time,/* timing parameters                                 */
         stencil_time,
         avgtime;
  int    stencil_size;    /* number of points in stencil                         */
  DTYPE  * RESTRICT in;   /* input grid values                                   */
  DTYPE  * RESTRICT out;  /* output grid values                                  */
  long   total_length_in; /* total required length to store input array          */
  long   total_length_out;/* total required length to store output array         */
  int    error=0;         /* error flag                                          */
  DTYPE  weight[2*RADIUS+1][2*RADIUS+1]; /* weights of points in the stencil     */
  MPI_Request request[8]; /* requests for sends & receives in 4 coord directions */
  MPI_Win shm_win_in;     /* shared memory window object for IN array            */
  MPI_Win shm_win_out;    /* shared memory window object for OUT array           */
  MPI_Comm shm_comm_prep; /* preparatory shared memory communicator              */
  MPI_Comm shm_comm;      /* Shared Memory Communicator                          */
  int shm_procs;          /* # of ranks in shared domain                         */
  int shm_ID;             /* MPI rank in shared memory domain                    */
  MPI_Aint size_in;       /* size of the IN array in shared memory window        */
  MPI_Aint size_out;      /* size of the OUT array in shared memory window       */
  int size_mul;           /* one for shm_comm root, zero for the other ranks     */
  int disp_unit;          /* ignored                                             */

  /*******************************************************************************
  ** Initialize the MPI environment
  ********************************************************************************/
  MPI_Init(&argc,&argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_ID);
  MPI_Comm_size(MPI_COMM_WORLD, &Num_procs);

  /*******************************************************************************
  ** process, test, and broadcast input parameters
  ********************************************************************************/

  if (my_ID == root) {
    printf("Parallel Research Kernels version %s\n", PRKVERSION);
    printf("MPI+SHM stencil execution on 2D grid\n");

#if !STAR
      printf("ERROR: Compact stencil not supported\n");
      error = 1;
      goto ENDOFTESTS;
#endif

    if (argc != 4){
      printf("Usage: %s  <#ranks per coherence domain><# iterations> <array dimension> \n",
             *argv);
      error = 1;
      goto ENDOFTESTS;
    }

    group_size = atoi(*++argv);
    if (group_size < 1) {
      printf("ERROR: # ranks per coherence domain must be >= 1 : %d \n",group_size);
      error = 1;
      goto ENDOFTESTS;
    }
    if (Num_procs%group_size) {
      printf("ERROR: total # %d ranks not divisible by ranks per coherence domain %d\n",
	     Num_procs, group_size);
      error = 1;
      goto ENDOFTESTS;
    }

    iterations  = atoi(*++argv);
    if (iterations < 0){
      printf("ERROR: iterations must be >= 0 : %d \n",iterations);
      error = 1;
      goto ENDOFTESTS;
    }

    n  = atol(*++argv);
    long nsquare = n * n;
    if (nsquare < Num_procs){
      printf("ERROR: grid size must be at least # ranks: %ld\n", nsquare);
      error = 1;
      goto ENDOFTESTS;
    }

    if (RADIUS < 0) {
      printf("ERROR: Stencil radius %d should be non-negative\n", RADIUS);
      error = 1;
      goto ENDOFTESTS;
    }

    if (2*RADIUS +1 > n) {
      printf("ERROR: Stencil radius %d exceeds grid size %ld\n", RADIUS, n);
      error = 1;
      goto ENDOFTESTS;
    }

    ENDOFTESTS:;
  }
  bail_out(error);

  MPI_Bcast(&n,          1, MPI_LONG, root, MPI_COMM_WORLD);
  MPI_Bcast(&iterations, 1, MPI_INT, root, MPI_COMM_WORLD);
  MPI_Bcast(&group_size, 1, MPI_INT, root, MPI_COMM_WORLD);

  /* determine best way to create a 2D grid of ranks (closest to square, for
     best surface/volume ratio); we do this brute force for now. The
     decomposition needs to be such that shared memory groups can evenly
     tessellate the rank grid
  */
  for (Num_procsx=(int) (sqrt(Num_procs+1)); Num_procsx>0; Num_procsx--) {
    if (!(Num_procs%Num_procsx)) {
      Num_procsy = Num_procs/Num_procsx;
      for (group_sizex=(int)(sqrt(group_size+1)); group_sizex>0; group_sizex--) {
        if (!(group_size%group_sizex) && !(Num_procsx%group_sizex)) {
          group_sizey=group_size/group_sizex;
          break;
        }
      }
      if (!(Num_procsy%group_sizey)) break;
    }
  }


  if (my_ID == root) {
    printf("Number of ranks                 = %d\n", Num_procs);
    printf("Grid size                       = %ld\n", n);
    printf("Radius of stencil               = %d\n", RADIUS);
    printf("Tiles in x/y-direction          = %d/%d\n", Num_procsx, Num_procsy);
    printf("Tiles per shared memory domain  = %d\n", group_size);
    printf("Tiles in x/y-direction in group = %d/%d\n", group_sizex,  group_sizey);
    printf("Type of stencil                 = star\n");
#if LOCAL_BARRIER_SYNCH
    printf("Local synchronization           = barrier\n");
#else
    printf("Local synchronization           = point to point\n");
#endif
#if DOUBLE
    printf("Data type                       = double precision\n");
#else
    printf("Data type                       = single precision\n");
#endif
#if LOOPGEN
    printf("Script used to expand stencil loop body\n");
#else
    printf("Compact representation of stencil loop body\n");
#endif
    printf("Number of iterations            = %d\n", iterations);
  }

  /* Setup for Shared memory regions */

  /* first divide WORLD in groups of size group_size */
  MPI_Comm_split(MPI_COMM_WORLD, my_ID/group_size, my_ID%group_size, &shm_comm_prep);
  /* derive from that an SHM communicator */
  MPI_Comm_split_type(shm_comm_prep, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shm_comm);
  MPI_Comm_rank(shm_comm, &shm_ID);
  MPI_Comm_size(shm_comm, &shm_procs);
  /* do sanity check, making sure groups did not shrink in second comm split */
  if (shm_procs != group_size) MPI_Abort(MPI_COMM_WORLD, 666);

  Num_groupsx = Num_procsx/group_sizex;
  Num_groupsy = Num_procsy/group_sizey;

  my_group = my_ID/group_size;
  my_group_IDx = my_group%Num_groupsx;
  my_group_IDy = my_group/Num_groupsx;
  my_local_IDx = my_ID%group_sizex;
  my_local_IDy = (my_ID%group_size)/group_sizex;
  my_global_IDx = my_group_IDx*group_sizex+my_local_IDx;
  my_global_IDy = my_group_IDy*group_sizey+my_local_IDy;

  /* set all neighboring ranks to -1 (no communication with those ranks) */
  left_nbr = right_nbr = top_nbr = bottom_nbr = -1;
  /* keep track of local neighbors for local synchronization             */
  num_local_nbrs = 0;

  if (my_local_IDx == group_sizex-1 && my_group_IDx != (Num_groupsx-1)) {
    right_nbr = (my_group+1)*group_size+shm_ID-group_sizex+1;
  }
  if (my_local_IDx != group_sizex-1) {
    local_nbr[num_local_nbrs++] = shm_ID + 1;
  }

  if (my_local_IDx == 0 && my_group_IDx != 0) {
    left_nbr = (my_group-1)*group_size+shm_ID+group_sizex-1;
  }
  if (my_local_IDx != 0) {
    local_nbr[num_local_nbrs++] = shm_ID - 1;
  }

  if (my_local_IDy == group_sizey-1 && my_group_IDy != (Num_groupsy-1)) {
    top_nbr = (my_group+Num_groupsx)*group_size + my_local_IDx;
  }
  if (my_local_IDy != group_sizey-1) {
    local_nbr[num_local_nbrs++] = shm_ID + group_sizex;
  }

  if (my_local_IDy == 0 && my_group_IDy != 0) {
    bottom_nbr = (my_group-Num_groupsx)*group_size + group_sizex*(group_sizey-1)+my_local_IDx;
  }
  if (my_local_IDy != 0) {
    local_nbr[num_local_nbrs++] = shm_ID - group_sizex;
  }

  /* compute amount of space required for input and solution arrays for the block,
     and also compute index sets                                                  */

  width = n/Num_groupsx;
  leftover = n%Num_groupsx;
  if (my_group_IDx<leftover) {
    istart = (width+1) * my_group_IDx;
    iend = istart + width;
  }
  else {
    istart = (width+1) * leftover + width * (my_group_IDx-leftover);
    iend = istart + width - 1;
  }

  width = iend - istart + 1;
  if (width == 0) {
    printf("ERROR: rank %d has no work to do\n", my_ID);
    error = 1;
  }
  bail_out(error);

  height = n/Num_groupsy;
  leftover = n%Num_groupsy;
  if (my_group_IDy<leftover) {
    jstart = (height+1) * my_group_IDy;
    jend = jstart + height;
  }
  else {
    jstart = (height+1) * leftover + height * (my_group_IDy-leftover);
    jend = jstart + height - 1;
  }

  height = jend - jstart + 1;
  if (height == 0) {
    printf("ERROR: rank %d has no work to do\n", my_ID);
    error = 1;
  }
  bail_out(error);

  if (width < RADIUS || height < RADIUS) {
    printf("ERROR: rank %d has work tile smaller then stencil radius; w=%ld,h=%ld\n",
           my_ID, width, height);
    error = 1;
  }
  bail_out(error);

  total_length_in = (width+2*RADIUS)*(height+2*RADIUS)*sizeof(DTYPE);
  total_length_out = width*height*sizeof(DTYPE);

  /* only the root of each SHM domain specifies window of nonzero size */
  size_mul = (shm_ID==0);
  size_in= total_length_in*size_mul;
  MPI_Win_allocate_shared(size_in, sizeof(double), MPI_INFO_NULL, shm_comm,
                          (void *) &in, &shm_win_in);
  MPI_Win_lock_all(MPI_MODE_NOCHECK, shm_win_in);
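  /* querying rank MPI_PROC_NULL returns the base address of the segment of the
     lowest rank that allocated a nonzero size, i.e. the SHM-domain root, so all
     ranks in the group end up sharing the root's input array                    */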
  MPI_Win_shared_query(shm_win_in, MPI_PROC_NULL, &size_in, &disp_unit, (void *)&in);
  if (in == NULL){
    printf("Error allocating space for input array by group %d\n",my_group);
    error = 1;
  }
  bail_out(error);

  size_out= total_length_out*size_mul;
  MPI_Win_allocate_shared(size_out, sizeof(double), MPI_INFO_NULL, shm_comm,
                          (void *) &out, &shm_win_out);
  MPI_Win_lock_all(MPI_MODE_NOCHECK, shm_win_out);
  MPI_Win_shared_query(shm_win_out, MPI_PROC_NULL, &size_out, &disp_unit, (void *)&out);
  if (out == NULL){
    printf("Error allocating space for output array by group %d\n", my_group);
    error = 1;
  }
  bail_out(error);

  /* determine index set assigned to each rank                         */

  width_rank = width/group_sizex;
  leftover = width%group_sizex;
  if (my_local_IDx<leftover) {
    istart_rank = (width_rank+1) * my_local_IDx;
    iend_rank = istart_rank + width_rank;
  }
  else {
    istart_rank = (width_rank+1) * leftover + width_rank * (my_local_IDx-leftover);
    iend_rank = istart_rank + width_rank - 1;
  }
  istart_rank += istart;
  iend_rank += istart;
  width_rank = iend_rank - istart_rank + 1;

  height_rank = height/group_sizey;
  leftover = height%group_sizey;
  if (my_local_IDy<leftover) {
    jstart_rank = (height_rank+1) * my_local_IDy;
    jend_rank = jstart_rank + height_rank;
  }
  else {
    jstart_rank = (height_rank+1) * leftover + height_rank * (my_local_IDy-leftover);
    jend_rank = jstart_rank + height_rank - 1;
  }
  jstart_rank+=jstart;
  jend_rank+=jstart;
  height_rank = jend_rank - jstart_rank + 1;

  if (height_rank*width_rank==0) {
    error = 1;
    printf("Rank %d has no work to do\n", my_ID);
  }
  bail_out(error);

  /* allocate communication buffers for halo values                            */
  top_buf_out = (DTYPE *) prk_malloc(4*sizeof(DTYPE)*RADIUS*width_rank);
  if (!top_buf_out) {
    printf("ERROR: Rank %d could not allocated comm buffers for y-direction\n", my_ID);
    error = 1;
  }
  bail_out(error);
  top_buf_in     = top_buf_out +   RADIUS*width_rank;
  bottom_buf_out = top_buf_out + 2*RADIUS*width_rank;
  bottom_buf_in  = top_buf_out + 3*RADIUS*width_rank;

  right_buf_out = (DTYPE *) prk_malloc(4*sizeof(DTYPE)*RADIUS*height_rank);
  if (!right_buf_out) {
    printf("ERROR: Rank %d could not allocated comm buffers for x-direction\n", my_ID);
    error = 1;
  }
  bail_out(error);
  right_buf_in   = right_buf_out +   RADIUS*height_rank;
  left_buf_out   = right_buf_out + 2*RADIUS*height_rank;
  left_buf_in    = right_buf_out + 3*RADIUS*height_rank;

  /* fill the stencil weights to reflect a discrete divergence operator         */
  for (int jj=-RADIUS; jj<=RADIUS; jj++) for (int ii=-RADIUS; ii<=RADIUS; ii++)
    WEIGHT(ii,jj) = (DTYPE) 0.0;
  stencil_size = 4*RADIUS+1;
  for (int ii=1; ii<=RADIUS; ii++) {
    WEIGHT(0, ii) = WEIGHT( ii,0) =  (DTYPE) (1.0/(2.0*ii*RADIUS));
    WEIGHT(0,-ii) = WEIGHT(-ii,0) = -(DTYPE) (1.0/(2.0*ii*RADIUS));
  }

  norm = (DTYPE) 0.0;
  f_active_points = (DTYPE) (n-2*RADIUS)*(DTYPE) (n-2*RADIUS);
  /* initialize the input and output arrays                                     */
  for (int j=jstart_rank; j<=jend_rank; j++) for (int i=istart_rank; i<=iend_rank; i++) {
    IN(i,j)  = COEFX*i+COEFY*j;
    OUT(i,j) = (DTYPE)0.0;
  }

  /* LOAD/STORE FENCE */
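  /* MPI_Win_sync acts as a processor memory barrier for the shared window; the
     barrier on shm_comm then orders these stores with the other ranks' loads   */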
  MPI_Win_sync(shm_win_in);
  MPI_Win_sync(shm_win_out);
  MPI_Barrier(shm_comm);

  for (iter = 0; iter<=iterations; iter++){

    /* start timer after a warmup iteration */
    if (iter == 1) {
      MPI_Barrier(MPI_COMM_WORLD);
      local_stencil_time = wtime();
    }

    /* need to fetch ghost point data from neighbors in y-direction                 */
    if (top_nbr != -1) {
      MPI_Irecv(top_buf_in, RADIUS*width_rank, MPI_DTYPE, top_nbr, 101,
                MPI_COMM_WORLD, &(request[1]));
      for (int kk=0,j=jend_rank-RADIUS+1; j<=jend_rank; j++)
      for (int i=istart_rank; i<=iend_rank; i++) {
        top_buf_out[kk++]= IN(i,j);
      }
      MPI_Isend(top_buf_out, RADIUS*width_rank,MPI_DTYPE, top_nbr, 99,
                MPI_COMM_WORLD, &(request[0]));
    }

    if (bottom_nbr != -1) {
      MPI_Irecv(bottom_buf_in,RADIUS*width_rank, MPI_DTYPE, bottom_nbr, 99,
                MPI_COMM_WORLD, &(request[3]));
      for (int kk=0,j=jstart_rank; j<=jstart_rank+RADIUS-1; j++)
      for (int i=istart_rank; i<=iend_rank; i++) {
        bottom_buf_out[kk++]= IN(i,j);
      }
      MPI_Isend(bottom_buf_out, RADIUS*width_rank, MPI_DTYPE, bottom_nbr, 101,
                MPI_COMM_WORLD, &(request[2]));
    }

    if (top_nbr != -1) {
      MPI_Wait(&(request[0]), MPI_STATUS_IGNORE);
      MPI_Wait(&(request[1]), MPI_STATUS_IGNORE);
      for (int kk=0,j=jend_rank+1; j<=jend_rank+RADIUS; j++)
      for (int i=istart_rank; i<=iend_rank; i++) {
        IN(i,j) = top_buf_in[kk++];
      }
    }

    if (bottom_nbr != -1) {
      MPI_Wait(&(request[2]), MPI_STATUS_IGNORE);
      MPI_Wait(&(request[3]), MPI_STATUS_IGNORE);
      for (int kk=0,j=jstart_rank-RADIUS; j<=jstart_rank-1; j++)
      for (int i=istart_rank; i<=iend_rank; i++) {
        IN(i,j) = bottom_buf_in[kk++];
      }
    }

    /* LOAD/STORE FENCE */
    MPI_Win_sync(shm_win_in);

    /* need to fetch ghost point data from neighbors in x-direction                 */
    if (right_nbr != -1) {
      MPI_Irecv(right_buf_in, RADIUS*height_rank, MPI_DTYPE, right_nbr, 1010,
                MPI_COMM_WORLD, &(request[1+4]));
      for (int kk=0,j=jstart_rank; j<=jend_rank; j++)
      for (int i=iend_rank-RADIUS+1; i<=iend_rank; i++) {
        right_buf_out[kk++]= IN(i,j);
      }
      MPI_Isend(right_buf_out, RADIUS*height_rank, MPI_DTYPE, right_nbr, 990,
                MPI_COMM_WORLD, &(request[0+4]));
    }

    if (left_nbr != -1) {
      MPI_Irecv(left_buf_in, RADIUS*height_rank, MPI_DTYPE, left_nbr, 990,
                MPI_COMM_WORLD, &(request[3+4]));
      for (int kk=0,j=jstart_rank; j<=jend_rank; j++)
      for (int i=istart_rank; i<=istart_rank+RADIUS-1; i++) {
        left_buf_out[kk++]= IN(i,j);
      }
      MPI_Isend(left_buf_out, RADIUS*height_rank, MPI_DTYPE, left_nbr, 1010,
                MPI_COMM_WORLD, &(request[2+4]));
    }

    if (right_nbr != -1) {
      MPI_Wait(&(request[0+4]), MPI_STATUS_IGNORE);
      MPI_Wait(&(request[1+4]), MPI_STATUS_IGNORE);
      for (int kk=0,j=jstart_rank; j<=jend_rank; j++)
      for (int i=iend_rank+1; i<=iend_rank+RADIUS; i++) {
        IN(i,j) = right_buf_in[kk++];
      }
    }

    if (left_nbr != -1) {
      MPI_Wait(&(request[2+4]), MPI_STATUS_IGNORE);
      MPI_Wait(&(request[3+4]), MPI_STATUS_IGNORE);
      for (int kk=0,j=jstart_rank; j<=jend_rank; j++)
      for (int i=istart_rank-RADIUS; i<=istart_rank-1; i++) {
        IN(i,j) = left_buf_in[kk++];
      }
    }

    /* LOAD/STORE FENCE */
    MPI_Win_sync(shm_win_in);

    /* Apply the stencil operator */
    for (int j=MAX(jstart_rank,RADIUS); j<=MIN(n-RADIUS-1,jend_rank); j++) {
      for (int i=MAX(istart_rank,RADIUS); i<=MIN(n-RADIUS-1,iend_rank); i++) {
        #if LOOPGEN
          #include "loop_body_star.incl"
        #else
          for (int jj=-RADIUS; jj<=RADIUS; jj++) OUT(i,j) += WEIGHT(0,jj)*IN(i,j+jj);
          for (int ii=-RADIUS; ii<0; ii++)       OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j);
          for (int ii=1; ii<=RADIUS; ii++)       OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j);
        #endif
      }
    }

    /* LOAD/STORE FENCE */
    MPI_Win_sync(shm_win_out);

#if LOCAL_BARRIER_SYNCH
    MPI_Barrier(shm_comm); // needed to avoid writing IN while other ranks are reading it
#else
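    /* point-to-point alternative: exchange zero-byte messages with each
       intra-node neighbor as a lightweight pairwise synchronization            */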
    for (int i=0; i<num_local_nbrs; i++) {
      MPI_Irecv(&dummy, 0, MPI_INT, local_nbr[i], 666, shm_comm, &(request[i]));
      MPI_Send(&dummy, 0, MPI_INT, local_nbr[i], 666, shm_comm);
    }
    MPI_Waitall(num_local_nbrs, request, MPI_STATUSES_IGNORE);
#endif

    /* add constant to solution to force refresh of neighbor data, if any */
    for (int j=jstart_rank; j<=jend_rank; j++)
    for (int i=istart_rank; i<=iend_rank; i++) IN(i,j)+= 1.0;

    /* LOAD/STORE FENCE */
    MPI_Win_sync(shm_win_in);

#if LOCAL_BARRIER_SYNCH
    MPI_Barrier(shm_comm); // needed to avoid reading IN while other ranks are writing it
#else
    for (int i=0; i<num_local_nbrs; i++) {
      MPI_Irecv(&dummy, 0, MPI_INT, local_nbr[i], 666, shm_comm, &(request[i]));
      MPI_Send(&dummy, 0, MPI_INT, local_nbr[i], 666, shm_comm);
    }
    MPI_Waitall(num_local_nbrs, request, MPI_STATUSES_IGNORE);
#endif

  } /* end of iterations                                                   */

  local_stencil_time = wtime() - local_stencil_time;
  MPI_Reduce(&local_stencil_time, &stencil_time, 1, MPI_DOUBLE, MPI_MAX, root,
             MPI_COMM_WORLD);

  /* compute L1 norm in parallel                                                */
  local_norm = (DTYPE) 0.0;
  for (int j=MAX(jstart_rank,RADIUS); j<=MIN(n-RADIUS-1,jend_rank); j++) {
    for (int i=MAX(istart_rank,RADIUS); i<=MIN(n-RADIUS-1,iend_rank); i++) {
      local_norm += (DTYPE)ABS(OUT(i,j));
    }
  }

  MPI_Reduce(&local_norm, &norm, 1, MPI_DTYPE, MPI_SUM, root, MPI_COMM_WORLD);

  /*******************************************************************************
  ** Analyze and output results.
  ********************************************************************************/

/* verify correctness                                                            */
  if (my_ID == root) {
    norm /= f_active_points;
    if (RADIUS > 0) {
      reference_norm = (DTYPE) (iterations+1) * (COEFX + COEFY);
    }
    else {
      reference_norm = (DTYPE) 0.0;
    }
    if (ABS(norm-reference_norm) > EPSILON) {
      printf("ERROR: L1 norm = "FSTR", Reference L1 norm = "FSTR"\n",
             norm, reference_norm);
      error = 1;
    }
    else {
      printf("Solution validates\n");
#if VERBOSE
      printf("Reference L1 norm = "FSTR", L1 norm = "FSTR"\n",
             reference_norm, norm);
#endif
    }
  }
  bail_out(error);

  MPI_Win_unlock_all(shm_win_in);
  MPI_Win_unlock_all(shm_win_out);
  MPI_Win_free(&shm_win_in);
  MPI_Win_free(&shm_win_out);

  if (my_ID == root) {
    /* flops/stencil: 2 flops (fma) for each point in the stencil,
       plus one flop for the update of the input array                */
    flops = (DTYPE) (2*stencil_size+1) * f_active_points;
    avgtime = stencil_time/iterations;
    printf("Rate (MFlops/s): "FSTR"  Avg time (s): %lf\n",
           1.0E-06 * flops/avgtime, avgtime);
  }

  MPI_Finalize();
  exit(EXIT_SUCCESS);
}
コード例 #22
0
int OSPU_Comm_split_node(MPI_Comm oldcomm, MPI_Comm * newcomm)
{
    int rc = MPI_SUCCESS;

#if MPI_VERSION >= 3

    rc = MPI_Comm_split_type(oldcomm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, newcomm);
    if (rc!=MPI_SUCCESS) return rc;

#elif defined(MPICH2) && (MPICH2_NUMVERSION>10500000)

    rc = MPIX_Comm_split_type(oldcomm, MPIX_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, newcomm);
    if (rc!=MPI_SUCCESS) return rc;

#else

    /* This code was authored by Jim Dinan */

    char my_name[MPI_MAX_PROCESSOR_NAME];
    MPI_Comm node_comm = MPI_COMM_NULL;
    MPI_Comm parent_comm;
    int len;

    /* Dup so we don't leak communicators */
    rc = MPI_Comm_dup(oldcomm, &parent_comm);
    if (rc!=MPI_SUCCESS) return rc;

    rc = MPI_Get_processor_name(my_name, &len);
    if (rc!=MPI_SUCCESS) return rc;

    while (node_comm == MPI_COMM_NULL)
    {
        char root_name[MPI_MAX_PROCESSOR_NAME];
        int  rank;
        MPI_Comm old_parent;

        rc = MPI_Comm_rank(parent_comm, &rank);
        if (rc!=MPI_SUCCESS) return rc;

        if (rank == 0)
        {
            rc = MPI_Bcast(my_name, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, 0, parent_comm);
            if (rc!=MPI_SUCCESS) return rc;
            strncpy(root_name, my_name, MPI_MAX_PROCESSOR_NAME);
        } 
        else 
        {
            rc = MPI_Bcast(root_name, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, 0, parent_comm);
            if (rc!=MPI_SUCCESS) return rc;
        }

        old_parent = parent_comm;

        if (strncmp(my_name, root_name, MPI_MAX_PROCESSOR_NAME) == 0)
        {
            /* My group splits off, I'm done after this */
            rc = MPI_Comm_split(parent_comm, 1, rank, &node_comm);
            if (rc!=MPI_SUCCESS) return rc;
        }
        else
        {
            /* My group keeps going, separate from the others */
            rc = MPI_Comm_split(parent_comm, 0, rank, &parent_comm);
            if (rc!=MPI_SUCCESS) return rc;
        }

        /* Old parent is no longer needed */
        rc = MPI_Comm_free(&old_parent);
        if (rc!=MPI_SUCCESS) return rc;
    }

    *newcomm = node_comm;

#endif

    return MPI_SUCCESS;
}
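A minimal caller sketch for the routine above (a hypothetical driver, not part of the original snippet): it splits MPI_COMM_WORLD into per-node communicators via OSPU_Comm_split_node and reports each rank's position within its node.
#include <mpi.h>
#include <stdio.h>

int OSPU_Comm_split_node(MPI_Comm oldcomm, MPI_Comm * newcomm);

int main(int argc, char * argv[])
{
    MPI_Comm node_comm;
    int wrank, nrank, nsize;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &wrank);

    /* split the world communicator by shared-memory node */
    if (OSPU_Comm_split_node(MPI_COMM_WORLD, &node_comm) != MPI_SUCCESS)
        MPI_Abort(MPI_COMM_WORLD, 1);

    MPI_Comm_rank(node_comm, &nrank);
    MPI_Comm_size(node_comm, &nsize);
    printf("world rank %d is node-local rank %d of %d\n", wrank, nrank, nsize);

    MPI_Comm_free(&node_comm);
    MPI_Finalize();
    return 0;
}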
コード例 #23
0
ファイル: mutex_bench.c プロジェクト: NexMirror/MPICH
int main(int argc, char **argv)
{
    int rank, nproc, i;
    double t_mcs_mtx;
    MPI_Comm mtx_comm;
    MCS_Mutex mcs_mtx;

    MPI_Init(&argc, &argv);

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);

#ifdef USE_WIN_SHARED
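    /* Using the world rank as the split key keeps rank ordering inside the
     * shared-memory communicator consistent with MPI_COMM_WORLD. */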
    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL, &mtx_comm);
#else
    mtx_comm = MPI_COMM_WORLD;
#endif

    MCS_Mutex_create(0, mtx_comm, &mcs_mtx);

    MPI_Barrier(MPI_COMM_WORLD);
    t_mcs_mtx = MPI_Wtime();

    for (i = 0; i < NUM_ITER; i++) {
        /* Combining trylock and lock here is helpful for testing because it makes
         * CAS and Fetch-and-op contend for the tail pointer. */
#ifdef USE_CONTIGUOUS_RANK
        if (rank < nproc / 2) {
#else
        if (rank % 2) {
#endif
            int success = 0;
            while (!success) {
                MCS_Mutex_trylock(mcs_mtx, &success);
            }
        }
        else {
            MCS_Mutex_lock(mcs_mtx);
        }
        MCS_Mutex_unlock(mcs_mtx);
    }

    MPI_Barrier(MPI_COMM_WORLD);
    t_mcs_mtx = MPI_Wtime() - t_mcs_mtx;

    MCS_Mutex_free(&mcs_mtx);

    if (rank == 0) {
        if (verbose) {
            printf("Nproc %d, MCS Mtx = %f us\n", nproc, t_mcs_mtx / NUM_ITER * 1.0e6);
        }
    }

    if (mtx_comm != MPI_COMM_WORLD)
        MPI_Comm_free(&mtx_comm);

    MTest_Finalize(0);
    MPI_Finalize();

    return 0;
}
コード例 #24
0
  /*        MAIN    */
  int
  main (int argc, char *argv[])
  {
    /* to be used for hello world exchanges */
    int rank, numtasks, namelen;
    char name[MPI_MAX_PROCESSOR_NAME];

    /* related to MPI-3 shm*/
    MPI_Comm shmcomm; /* shm communicator  */ 
    MPI_Win win;      /* shm window object */ 
    int *mem;         /* shm memory to be allocated on each node */
    int i0, i1; int* i2; /* for reading back from shm */

    /* current rank exchanges hello world info with partners */
    int partners[n_partners];
    int *partners_map;   /* mapping in shm communicator */
    int **partners_ptrs; /* ptrs to shared mem window for each partner*/
    int j, partner, alloc_len;
    int n_node_partners=0, n_inter_partners=0;

     /* non-blocking inter-node */
    MPI_Request *reqs, *rq;
    int rbuf[n_partners]; /* recv buffer */
    int req_num = 2;      /* each inter-node exchange needs a pair of MPI_Irecv and MPI_Isend */

    if (getenv("1DRING_VERBOSE")) verbose = 1; /* Switch on/off printfs thru env */

    MPI_Init (&argc, &argv); 
    MPI_Comm_size (MPI_COMM_WORLD, &numtasks);
    MPI_Comm_rank (MPI_COMM_WORLD, &rank);
    MPI_Get_processor_name (name, &namelen);
    /* if (verbose) printf ("Hello world from COMM_WORLD: rank %d of %d is running on %s\n", rank, numtasks, name); */
    

    /* The 1D ring is defined in the partners array; it can easily be extended to higher-order stencils.
       The current rank has 2 neighbours: previous and next, i.e., prev-rank-next */
    partners[0] = rank-1; /* prev */
    partners[1] = rank+1; /* next */
    /* We will use periodic boundary conditions here */
    if (rank == 0)  partners[0] = numtasks - 1;
    if (rank == (numtasks - 1))  partners[1] = 0;

    /* MPI-3 SHM collective creates shm communicator  */
    MPI_Comm_split_type (MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmcomm); 

   /*  mapping: global rank  -> shmcomm rank is in partners_map */
    partners_map = (int*)malloc(n_partners*sizeof(int)); /* allocate partners_map */
    translate_ranks(shmcomm, partners, partners_map);

   /* number of inter and intra node partners */
    get_n_partners (rank, partners, partners_map,  &n_node_partners, &n_inter_partners); 
    if (verbose) print_n_partners (rank, partners, partners_map,  n_node_partners, n_inter_partners); 

    
    alloc_len = 2*sizeof(int) + namelen+1; /* size of the hello world info: 2 ints and a string; +1 for the terminating '\0' */
    if (n_node_partners > 0)
    {
     /* allocate shared memory windows on each node for intra-node partners  */
     MPI_Win_allocate_shared (alloc_len, 1, MPI_INFO_NULL, shmcomm, /* inputs to MPI-3 SHM collective */
                              &mem, &win);  /* outputs: mem - initial address of window; win - window object */

     /* pointers to mem windows */
     partners_ptrs = (int **)malloc(n_partners*sizeof(int*));  
     get_partners_ptrs (win, partners, partners_map,  partners_ptrs );

    }
    else
    {
       mem = (int *)malloc(alloc_len); 
    }    

    /* allocate MPI Request resources for inter-node comm. */
    if(n_inter_partners > 0)
    {
        reqs = (MPI_Request*)malloc(req_num*n_inter_partners*sizeof(MPI_Request));
        rq = reqs;
    }

    /* start halo exchange */

    if (n_node_partners > 0)
    {    
        /* Entering MPI-3 RMA access epoch required for MPI-3 shm */
        MPI_Win_lock_all (MPI_MODE_NOCHECK, win); 
        /*  alternatively, MPI_Win_lock_all, MPI_Win_sync and MPI_Barrier can be replaced with
        2 MPI_Win_fence calls surrounding update of shared memory.  */
        /* MPI_Win_fence(0, win); */ /* -- alternative */
    }    

    /* update MPI-3 shared memory (or local memory in case of lack of node partners) 
     * by writing hello_world info into mem */
    mem[0] = rank; 
    mem[1] = numtasks;
    memcpy(mem+2, name, namelen+1); /* copy the terminating '\0' as well so partners can print the string */

    if (n_node_partners > 0)
    {    
        /* MPI_Win_fence (0, win); */ /* -- alternative end */

        MPI_Win_sync (win);     /* memory fence to sync node exchanges */ 
        MPI_Barrier (shmcomm);  /* time barrier to make sure all ranks have updated their info */
    }


    for (j=0; j<n_partners; j++) 
    {
        if(partners_map[j] != MPI_UNDEFINED) /* partner j is on the same node  */ 
        {
            i0 = partners_ptrs[j][0]; /* load from MPI-3/SHM ops! */
            i1 = partners_ptrs[j][1];
            i2 = partners_ptrs[j]+2; 
            if(verbose) printf ("load MPI/SHM values from neighbour => rank %d, numtasks %d on %s\n", i0, i1, i2);
        }
        else /* inter-node non-blocking MPI-1 */
        {
            MPI_Irecv (&rbuf[j], 1, MPI_INT, partners[j], 1 , MPI_COMM_WORLD, rq++);
            MPI_Isend (&rank, 1, MPI_INT, partners[j], 1 , MPI_COMM_WORLD, rq++);
        }
    } 

   /* sync inter-node exchanges and print out receive buffer rbuf*/
   if(n_inter_partners > 0)
   {
       MPI_Waitall (req_num*n_inter_partners, reqs, MPI_STATUSES_IGNORE); 
       if(verbose){
           for (j =0; j< n_partners;j++)
               if (partners_map[j] == MPI_UNDEFINED) printf("Recieved from my inter-node partner %d\n", rbuf[j]);
       }
   }    
   
   if (n_node_partners > 0)
   {    
       MPI_Win_unlock_all (win);  /* close RMA epoch */
       /* free resources */
       MPI_Win_free (&win);
       free (partners_ptrs); 
   } 
   
   if (n_inter_partners) free (reqs);
   
   free (partners_map);  
   
   MPI_Finalize ();

   return (0);
 }
コード例 #25
0
ファイル: transpose.c プロジェクト: nchaimov/ParResKernels
int main(int argc, char ** argv)
{
  int Block_order;
  size_t Block_size;
  size_t Colblock_size;
  int Tile_order=32;
  int tiling;
  int Num_procs;     /* Number of ranks                                          */
  int order;         /* overall matrix order                                     */
  int send_to, recv_from; /* communicating ranks                                 */
  size_t bytes;      /* total amount of data to be moved                         */
  int my_ID;         /* rank                                                     */
  int root=0;        /* root rank of a communicator                              */
  int iterations;    /* number of times to run the pipeline algorithm            */
  int i, j, it, jt, ID;/* dummies                                                */
  int iter;          /* index of iteration                                       */
  int phase;         /* phase in the staged communication                        */
  size_t colstart;   /* sequence number of first column owned by calling rank    */
  int error=0;       /* error flag                                               */
  double *A_p;       /* original matrix column block                             */
  double *B_p;       /* transposed matrix column block                           */
  double *Work_in_p; /* workspace for the transpose function                     */
  double *Work_out_p;/* workspace for the transpose function                     */
  double abserr, abserr_tot; /* computed error                                   */
  double epsilon = 1.e-8; /* error tolerance                                     */
  double local_trans_time, /* timing parameters                                  */
         trans_time,
         avgtime;
  MPI_Status status; /* completion status of message                             */
  MPI_Win shm_win_A; /* Shared Memory window object                              */
  MPI_Win shm_win_B; /* Shared Memory window object                              */
  MPI_Win shm_win_Work_in; /* Shared Memory window object                        */
  MPI_Win shm_win_Work_out; /* Shared Memory window object                       */
  MPI_Info rma_winfo;/* info for window                                          */
  MPI_Comm shm_comm_prep;/* Shared Memory prep Communicator                      */
  MPI_Comm shm_comm; /* Shared Memory Communicator                               */
  int shm_procs;     /* # of ranks in shared domain                              */
  int shm_ID;        /* MPI rank within coherence domain                         */
  int group_size;    /* number of ranks per shared memory group                  */
  int Num_groups;    /* number of shared memory group                            */
  int group_ID;      /* sequence number of shared memory group                   */
  int size_mul;      /* size multiplier; 0 for non-root ranks in coherence domain*/
  int istart;
  MPI_Request send_req, recv_req;

/*********************************************************************************
** Initialize the MPI environment
**********************************************************************************/
  MPI_Init(&argc,&argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_ID);
  MPI_Comm_size(MPI_COMM_WORLD, &Num_procs);

  root = 0;

/*********************************************************************
** process, test and broadcast input parameter
*********************************************************************/

  if (my_ID == root){
    if (argc != 4 && argc !=5){
      printf("Usage: %s  <#ranks per coherence domain> <# iterations> <matrix order> [tile size]\n", 
             *argv);
      error = 1;
      goto ENDOFTESTS;
    }

    group_size = atoi(*++argv);
    if (group_size < 1) {
      printf("ERROR: # ranks per coherence domain must be >= 1 : %d \n",group_size);
      error = 1;
      goto ENDOFTESTS;
    } 
    if (Num_procs%group_size) {
      printf("ERROR: toal # %d ranks not divisible by ranks per coherence domain %d\n",
	     Num_procs, group_size);
      error = 1;
      goto ENDOFTESTS;
    } 

    iterations = atoi(*++argv);
    if (iterations < 1){
      printf("ERROR: iterations must be >= 1 : %d \n",iterations);
      error = 1;
      goto ENDOFTESTS;
    } 

    order = atoi(*++argv);
    if (order < Num_procs) {
      printf("ERROR: matrix order %d should at least # procs %d\n", 
             order, Num_procs);
      error = 1; goto ENDOFTESTS;
    }

    if (order%Num_procs) {
      printf("ERROR: matrix order %d should be divisible by # procs %d\n",
             order, Num_procs);
      error = 1; goto ENDOFTESTS;
    }

    if (argc == 5) Tile_order = atoi(*++argv);

    ENDOFTESTS:;
  }
  bail_out(error); 

  /*  Broadcast input data to all ranks */
  MPI_Bcast(&order,      1, MPI_INT, root, MPI_COMM_WORLD);
  MPI_Bcast(&iterations, 1, MPI_INT, root, MPI_COMM_WORLD);
  MPI_Bcast(&Tile_order, 1, MPI_INT, root, MPI_COMM_WORLD);
  MPI_Bcast(&group_size, 1, MPI_INT, root, MPI_COMM_WORLD);

  if (my_ID == root) {
    printf("Parallel Research Kernels version %s\n", PRKVERSION);
    printf("MPI+SHM Matrix transpose: B = A^T\n");
    printf("Number of ranks      = %d\n", Num_procs);
    printf("Rank group size      = %d\n", group_size);
    printf("Matrix order         = %d\n", order);
    printf("Number of iterations = %d\n", iterations);
    if ((Tile_order > 0) && (Tile_order < order))
       printf("Tile size            = %d\n", Tile_order);
    else  printf("Untiled\n");
#ifndef SYNCHRONOUS
    printf("Non-");
#endif
    printf("Blocking messages\n");
  }

  /* Setup for Shared memory regions */

  /* first divide WORLD in groups of size group_size */
  MPI_Comm_split(MPI_COMM_WORLD, my_ID/group_size, my_ID%group_size, &shm_comm_prep);
  /* derive from that a SHM communicator */
  MPI_Comm_split_type(shm_comm_prep, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shm_comm);
  MPI_Comm_rank(shm_comm, &shm_ID);
  MPI_Comm_size(shm_comm, &shm_procs);
  /* do sanity check, making sure groups did not shrink in second comm split */
  if (shm_procs != group_size) MPI_Abort(MPI_COMM_WORLD, 666);

  /* a non-positive tile size means no tiling of the local transpose */
  tiling = (Tile_order > 0) && (Tile_order < order);
  bytes = 2 * sizeof(double) * order * order;

/*********************************************************************
** The matrix is broken up into column blocks that are mapped one to a 
** rank.  Each column block is made up of Num_procs smaller square 
** blocks of order block_order.
*********************************************************************/

  Num_groups     = Num_procs/group_size;
  Block_order    = order/Num_groups;

  group_ID       = my_ID/group_size;
  colstart       = Block_order * group_ID;
  Colblock_size  = order * Block_order;
  Block_size     = Block_order * Block_order;

/*********************************************************************
** Create the column block of the test matrix, the column block of the 
** transposed matrix, and workspace (workspace only if #procs>1)
*********************************************************************/

  /* RMA win info */
  MPI_Info_create(&rma_winfo);
  /* This key indicates that passive target RMA will not be used.
   * It is the one info key that MPICH actually uses for optimization. */
  MPI_Info_set(rma_winfo, "no_locks", "true");

  /* only the root of each SHM domain specifies window of nonzero size */
  size_mul = (shm_ID==0);  
  int offset = 32;
  MPI_Aint size = (Colblock_size+offset)*sizeof(double)*size_mul;
  int disp_unit;
  MPI_Win_allocate_shared(size, sizeof(double), rma_winfo, shm_comm, 
                          (void *) &A_p, &shm_win_A);
  MPI_Win_lock_all(MPI_MODE_NOCHECK,shm_win_A);
  MPI_Win_shared_query(shm_win_A, MPI_PROC_NULL, &size, &disp_unit, (void *)&A_p);

  if (A_p == NULL){
    printf(" Error allocating space for original matrix on node %d\n",my_ID);
    error = 1;
  }
  bail_out(error);
  A_p += offset;

  /* recompute memory size (overwritten by prior query)                          */
  size= (Colblock_size+offset)*sizeof(double)*size_mul; 
  MPI_Win_allocate_shared(size, sizeof(double), rma_winfo, shm_comm, 
                          (void *) &B_p, &shm_win_B);
  MPI_Win_lock_all(MPI_MODE_NOCHECK,shm_win_B);
  MPI_Win_shared_query(shm_win_B, MPI_PROC_NULL, &size, &disp_unit, (void *)&B_p);
  if (B_p == NULL){
    printf(" Error allocating space for transposed matrix by group %d\n",group_ID);
    error = 1;
  }
  bail_out(error);
  B_p += offset;

  if (Num_groups>1) {

    size = Block_size*sizeof(double)*size_mul;
    MPI_Win_allocate_shared(size, sizeof(double),rma_winfo, shm_comm, 
                           (void *) &Work_in_p, &shm_win_Work_in);
    MPI_Win_lock_all(MPI_MODE_NOCHECK,shm_win_Work_in);
    MPI_Win_shared_query(shm_win_Work_in, MPI_PROC_NULL, &size, &disp_unit, 
                         (void *)&Work_in_p);
    if (Work_in_p == NULL){
      printf(" Error allocating space for in block by group %d\n",group_ID);
      error = 1;
    }
    bail_out(error);

    /* recompute memory size (overwritten by prior query)                        */
    size = Block_size*sizeof(double)*size_mul;
    MPI_Win_allocate_shared(size, sizeof(double), rma_winfo, 
                            shm_comm, (void *) &Work_out_p, &shm_win_Work_out);
    MPI_Win_lock_all(MPI_MODE_NOCHECK,shm_win_Work_out);
    MPI_Win_shared_query(shm_win_Work_out, MPI_PROC_NULL, &size, &disp_unit, 
                         (void *)&Work_out_p);
    if (Work_out_p == NULL){
      printf(" Error allocating space for out block by group %d\n",group_ID);
      error = 1;
    }
    bail_out(error);
  }

  /* Fill the original column matrix                                             */
  istart = 0;  
  int chunk_size = Block_order/group_size;
  if (tiling) {
      for (j=shm_ID*chunk_size;j<(shm_ID+1)*chunk_size;j+=Tile_order) {
      for (i=0;i<order; i+=Tile_order) 
        for (jt=j; jt<MIN((shm_ID+1)*chunk_size,j+Tile_order); jt++)
          for (it=i; it<MIN(order,i+Tile_order); it++) {
            A(it,jt) = (double) ((double)order*(jt+colstart) + it);
            B(it,jt) = -1.0;
          }
    }
  }
  else {
    for (j=shm_ID*chunk_size;j<(shm_ID+1)*chunk_size;j++) 
      for (i=0;i<order; i++) {
        A(i,j) = (double)((double)order*(j+colstart) + i);
        B(i,j) = -1.0;
      }
  }
  /* NEED A STORE FENCE HERE                                                     */
  MPI_Win_sync(shm_win_A);
  MPI_Win_sync(shm_win_B);
  MPI_Barrier(shm_comm);

  for (iter=0; iter<=iterations; iter++) {

    /* start timer after a warmup iteration */
    if (iter == 1) { 
      MPI_Barrier(MPI_COMM_WORLD);
      local_trans_time = wtime();
    }

    /* do the local transpose                                                    */
    istart = colstart; 
    if (!tiling) {
      for (i=shm_ID*chunk_size; i<(shm_ID+1)*chunk_size; i++) {
        for (j=0; j<Block_order; j++)
          B(j,i) = A(i,j);
      }
    }
    else {
      for (i=shm_ID*chunk_size; i<(shm_ID+1)*chunk_size; i+=Tile_order) {
        for (j=0; j<Block_order; j+=Tile_order) 
          for (it=i; it<MIN(Block_order,i+Tile_order); it++)
            for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) {
              B(jt,it) = A(it,jt);
            }
      }
    }

    for (phase=1; phase<Num_groups; phase++){
      recv_from = ((group_ID + phase             )%Num_groups);
      send_to   = ((group_ID - phase + Num_groups)%Num_groups);

      istart = send_to*Block_order; 
      if (!tiling) {
        for (i=shm_ID*chunk_size; i<(shm_ID+1)*chunk_size; i++) 
          for (j=0; j<Block_order; j++) {
            Work_out(j,i) = A(i,j);
          }
      }
      else {
        for (i=shm_ID*chunk_size; i<(shm_ID+1)*chunk_size; i+=Tile_order)
          for (j=0; j<Block_order; j+=Tile_order) 
            for (it=i; it<MIN(Block_order,i+Tile_order); it++)
              for (jt=j; jt<MIN(Block_order,j+Tile_order);jt++) {
                Work_out(jt,it) = A(it,jt);
              }
      }

      /* NEED A LOAD/STORE FENCE HERE                                            */
      MPI_Win_sync(shm_win_Work_in);
      MPI_Win_sync(shm_win_Work_out);
      MPI_Barrier(shm_comm);
      if (shm_ID==0) {
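        /* only the leader of each shared-memory group exchanges blocks across
           groups; its peers filled Work_out through the shared window and will
           read Work_in from it after the following barrier                     */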
#ifndef SYNCHRONOUS  
        /* if we place the Irecv outside this block, it would not be
           protected by a local barrier, which creates a race                    */
        MPI_Irecv(Work_in_p, Block_size, MPI_DOUBLE, 
                  recv_from*group_size, phase, MPI_COMM_WORLD, &recv_req);  
        MPI_Isend(Work_out_p, Block_size, MPI_DOUBLE, send_to*group_size,
                  phase, MPI_COMM_WORLD, &send_req);
        MPI_Wait(&recv_req, &status);
        MPI_Wait(&send_req, &status);
#else
        MPI_Sendrecv(Work_out_p, Block_size, MPI_DOUBLE, send_to*group_size,
                     phase, Work_in_p, Block_size, MPI_DOUBLE,
                     recv_from*group_size, phase, MPI_COMM_WORLD, &status);
#endif
      }
      /* NEED A LOAD FENCE HERE                                                  */ 
      MPI_Win_sync(shm_win_Work_in);
      MPI_Win_sync(shm_win_Work_out);
      MPI_Barrier(shm_comm);

      istart = recv_from*Block_order; 
      /* scatter received block to transposed matrix; no need to tile */
      for (j=shm_ID*chunk_size; j<(shm_ID+1)*chunk_size; j++)
        for (i=0; i<Block_order; i++) 
          B(i,j) = Work_in(i,j);

    }  /* end of phase loop  */
  } /* end of iterations */

  local_trans_time = wtime() - local_trans_time;
  MPI_Reduce(&local_trans_time, &trans_time, 1, MPI_DOUBLE, MPI_MAX, root,
             MPI_COMM_WORLD);

  abserr = 0.0;
  istart = 0;
  /*  for (j=shm_ID;j<Block_order;j+=group_size) for (i=0;i<order; i++) { */
  for (j=shm_ID*chunk_size; j<(shm_ID+1)*chunk_size; j++)
    for (i=0;i<order; i++) { 
      abserr += ABS(B(i,j) - (double)((double)order*i + j+colstart));
    }

  MPI_Reduce(&abserr, &abserr_tot, 1, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);

  if (my_ID == root) {
    if (abserr_tot < epsilon) {
      printf("Solution validates\n");
      avgtime = trans_time/(double)iterations;
      printf("Rate (MB/s): %lf Avg time (s): %lf\n",1.0E-06*bytes/avgtime, avgtime);
#ifdef VERBOSE
      printf("Summed errors: %f \n", abserr_tot);
#endif
    }
    else {
      printf("ERROR: Aggregate squared error %e exceeds threshold %e\n", abserr_tot, epsilon);
      error = 1;
    }
  }

  bail_out(error);

  MPI_Win_unlock_all(shm_win_A);
  MPI_Win_unlock_all(shm_win_B);

  MPI_Win_free(&shm_win_A);
  MPI_Win_free(&shm_win_B);
  if (Num_groups>1) {
      MPI_Win_unlock_all(shm_win_Work_in);
      MPI_Win_unlock_all(shm_win_Work_out);
      MPI_Win_free(&shm_win_Work_in);
      MPI_Win_free(&shm_win_Work_out);
  }

  MPI_Info_free(&rma_winfo);

  MPI_Finalize();
  exit(EXIT_SUCCESS);

}  /* end of main */
コード例 #26
0
ファイル: main.c プロジェクト: edging1218/channel1L
int main(int argc, char *argv[]){
        MPI_Init(&argc, &argv);
        MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shmcomm);
        MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
        MPI_Comm_rank(MPI_COMM_WORLD, &myid);

	int i, j, k, n, l, indx, signal;
	bool flag;
	double deltat;

	double time_spend;
	double begin, end;

	FILE* energy;
	FILE* grid;

	begin = MPI_Wtime();
	//read in parameters
	if(!read_param()){
		MPI_Comm_free(&shmcomm);
		MPI_Finalize();
		return 1;
	}
	
	flag = true;
	dE = 1;
	el_old = 1;
	cycle = 0;
	deltat = (0.1 - dt) / increment;

	S = 0.25 * (1 + 3 * sqrt(1 - 8 / (3 * U)));
	//	printf("Theoretical value is %lf.\n", third * (1 - third * U) * S * S - 2 * third * third * third * S * S * S * U + U / 9 * S * S * S * S );

	if(myid == root){
		
		printf("S is %lf.\n\n", S);
	
		//define droplet and boundary, introduce particles, initialize qtensor;
		if(!initial()){
			flag = false;
		}		
		energy = fopen("energy.out", "w");
		fprintf(energy,"cycle\tEnergy_diff\tEnergy_ldg\tEnergy_el\tEnergy_ch\tEnergy_surf\tEnergy_tot\n");
		fclose(energy);

		grid = fopen("grid.bin", "wb");
		l = 0;
		for(l = 0; l < tot; l++){
			if(boundary[l] || nboundary[l])	signal = 1;
			else if(drop[l]) 	signal = 0;			
			else signal = -1;
			fwrite(&signal, sizeof(int), 1, grid);
		}
		fclose(grid);	
		
	}
	
	//For all information initialized in the root processor, scatter it to all other processors.
	if(!scatter()){
		flag = false;
		return 1;
	}		

	//Evolution
	while(flag){
		//Every 1000 steps calculate energies.
		if(cycle % 1000 == 0){
			free_energy();
			if(fabs(dE) < accuracy){
				flag = false;
			}
		}

		if(cycle % 10000 == 0){ 
			//Every 10000 steps check the trace of Qtensor
			if(myid == root){	
				for(i = 0; i < droplet; i++){
					if(!checktr(&q[i * 6])){
						flag = false;
						printf("Error in the trace of q.\n");
					}
					if(flag == false)	break;
				}
				//print output file
				output();
			}	
			MPI_Bcast(&flag, 1, MPI_BYTE, root, MPI_COMM_WORLD);	
		}

		//Wait until all the processors are ready and relax the system, first bulk and then boundary.
		MPI_Barrier(MPI_COMM_WORLD);
		MPI_Win_fence(0, win);
		if(flag) relax_bulk();

		MPI_Barrier(MPI_COMM_WORLD);
		MPI_Win_fence(0, win);
		if(flag)	relax_surf();

		if(dt < 0.1){
			dt += deltat;
			if(dt >= 0.1)	dt = 0.1;
		}

		cycle ++;
		MPI_Barrier(MPI_COMM_WORLD);
		MPI_Win_fence(0, win);
		
	}

	free_energy();

	end = MPI_Wtime();

	if(myid == root){
		output();

		//Calculate time used.
		time_spend = (double)(end - begin) / 60.0;

		energy = fopen("energy.out", "a");
		if(time_spend < 60){
			fprintf(energy, "\nTime used:	%lf min.\n", time_spend);
			printf("\nTime used:	%lf min.\n", time_spend);
		}
		else{
			fprintf(energy, "\nTime used:	%lf h.\n", time_spend / 60.0);
			printf("\nTime used:	%lf h.\n", time_spend / 60.0);
		}
		fclose(energy);	
	}

	//deallocate dynamic arrays
	free_q();
        MPI_Win_free(&win);
        MPI_Comm_free(&shmcomm);
        MPI_Finalize();

	return 0;
}
コード例 #27
0
ファイル: p2p.c プロジェクト: LaHaine/ohpc
int main(int argc, char ** argv)
{
  int    my_ID;         /* rank                                                  */
  int    root;
  int    m, n;          /* grid dimensions                                       */
  double local_pipeline_time, /* timing parameters                               */
         pipeline_time,
         avgtime;
  double epsilon = 1.e-8; /* error tolerance                                     */
  double corner_val;    /* verification value at top right corner of grid        */
  int    i, j, iter, ID;/* dummies                                               */
  int    iterations;    /* number of times to run the pipeline algorithm         */
  int    *start, *end;  /* starts and ends of grid slices                        */
  int    segment_size;
  int    error=0;       /* error flag                                            */
  int    Num_procs;     /* Number of ranks                                       */
  double *vector;       /* array holding grid values                             */
  long   total_length;  /* total required length to store grid values            */
  MPI_Status status;    /* completion status of message                          */
  MPI_Group shm_group, origin_group, target_group;
  int origin_ranks[1], target_ranks[1];
  MPI_Aint nbr_segment_size;
  MPI_Win shm_win;      /* Shared Memory window object                           */
  MPI_Info rma_winfo;   /* info for window                                       */
  MPI_Comm shm_comm;    /* Shared Memory Communicator                            */
  int shm_procs;        /* # of ranks in shared domain                           */
  int shm_ID;           /* MPI rank                                              */
  int source_disp;
  double *source_ptr;
  int p2pbuf;
  int width, nbr_width;

/*********************************************************************************
** Initialize the MPI environment
**********************************************************************************/
  MPI_Init(&argc,&argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_ID);
  MPI_Comm_size(MPI_COMM_WORLD, &Num_procs);

/* we set root equal to highest rank, because this is also the rank that reports 
   on the verification value                                                     */
  root = Num_procs-1;

  /* Setup for Shared memory regions */
  MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shm_comm);
  MPI_Comm_rank(shm_comm, &shm_ID);
  MPI_Comm_size(shm_comm, &shm_procs);

/*********************************************************************
** process, test and broadcast input parameter
*********************************************************************/

  if (my_ID == root){
    if (argc != 4){
      printf("Usage: %s  <#iterations> <1st array dimension> <2nd array dimension>\n",
             *argv);
      error = 1;
      goto ENDOFTESTS;
    }

    iterations = atoi(*++argv);
    if (iterations < 1){
      printf("ERROR: iterations must be >= 1 : %d \n",iterations);
      error = 1;
      goto ENDOFTESTS;
    }

    m = atoi(*++argv);
    n = atoi(*++argv);
    if (m < 1 || n < 1){
      printf("ERROR: grid dimensions must be positive: %d, %d \n", m, n);
      error = 1;
      goto ENDOFTESTS;
    }

    if (m<Num_procs) {
      printf("ERROR: First grid dimension %d smaller than number of ranks %d\n",
             m, Num_procs);
      error = 1;
      goto ENDOFTESTS;
    }

    ENDOFTESTS:;
  }
  bail_out(error);

  if (my_ID == root) {
    printf("MPI+SHM pipeline execution on 2D grid\n");
    printf("Number of ranks                = %i\n",Num_procs);
    printf("Grid sizes                     = %d, %d\n", m, n);
    printf("Number of iterations           = %d\n", iterations);
#ifdef VERBOSE
    printf("Synchronizations/iteration     = %d\n", (Num_procs-1)*(n-1));
#endif
  }

  /* Broadcast benchmark data to all ranks */
  MPI_Bcast(&m, 1, MPI_INT, root, MPI_COMM_WORLD);
  MPI_Bcast(&n, 1, MPI_INT, root, MPI_COMM_WORLD);
  MPI_Bcast(&iterations, 1, MPI_INT, root, MPI_COMM_WORLD);

  start = (int *) malloc(2*Num_procs*sizeof(int));
  if (!start) {
    printf("ERROR: Could not allocate space for array of slice boundaries\n");
    exit(EXIT_FAILURE);
  }
  end = start + Num_procs;
  start[0] = 0;
  for (ID=0; ID<Num_procs; ID++) {
    segment_size = m/Num_procs;
    if (ID < (m%Num_procs)) segment_size++;
    if (ID>0) start[ID] = end[ID-1]+1;
    end[ID] = start[ID]+segment_size-1;
  }

  /* now set segment_size to the value needed by the calling rank               */
  segment_size = end[my_ID] - start[my_ID] + 1;

  /* RMA win info */
  MPI_Info_create(&rma_winfo);
  /* This key indicates that passive target RMA will not be used.
   * It is the one info key that MPICH actually uses for optimization. */
  MPI_Info_set(rma_winfo, "no_locks", "true");

  /* total_length takes into account one ghost cell on left side of segment     */
  if (shm_ID == 0) {
    total_length = ((end[my_ID]-start[my_ID]+1)+1)*n;
    width = segment_size+1;
  } else {
    total_length = (end[my_ID]-start[my_ID]+1)*n;
    width = segment_size;
  }

  MPI_Win_allocate_shared(total_length*sizeof(double), sizeof(double), 
                          rma_winfo, shm_comm, (void *) &vector, &shm_win);
  if (vector == NULL) {
    printf("Could not allocate space for grid slice of %d by %d points",
           segment_size, n);
    printf(" on rank %d\n", my_ID);
    error = 1;
  }
  bail_out(error);

  /* Get left neighbor base address */
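  /* for a shared window the query yields a directly dereferenceable pointer
     into the neighbor's segment, so its boundary values can be read with plain
     loads instead of additional MPI calls                                       */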
  if (shm_ID > 0) {
    MPI_Win_shared_query(shm_win, shm_ID-1, &nbr_segment_size, &source_disp, &source_ptr);
    nbr_segment_size = end[my_ID-1] - start[my_ID-1] + 1;
    nbr_width = nbr_segment_size;
  }

  /* clear the array                                                             */
  for (j=0; j<n; j++) for (i=start[my_ID]-1; i<=end[my_ID]; i++) {
    ARRAY(i-start[my_ID],j) = 0.0;
  }
  /* set boundary values (bottom and left side of grid                           */
  if (my_ID==0) for (j=0; j<n; j++) ARRAY(0,j) = (double) j;
  for (i=start[my_ID]-1; i<=end[my_ID]; i++) ARRAY(i-start[my_ID],0) = (double) i;

  /* redefine start and end for calling rank to reflect local indices            */
  if (my_ID==0) start[my_ID] = 1;
  else          start[my_ID] = 0;
  end[my_ID] = segment_size-1;

  for (iter=0; iter<=iterations; iter++) {

    /* start timer after a warmup iteration */
    if (iter == 1) {
      MPI_Barrier(MPI_COMM_WORLD);
      local_pipeline_time = wtime();
    }

    /* execute pipeline algorithm for grid lines 1 through n-1 (skip bottom line) */
    for (j=1; j<n; j++) {

      /* if I am not at the left boundary, I need to wait for my left neighbor to
         send data                                                                */
      if (my_ID > 0) {
	if (shm_ID > 0) {
	  MPI_Recv(&p2pbuf, 0, MPI_INT, shm_ID-1, 1, shm_comm, &status);
	} else {
	  MPI_Recv(&(ARRAY(start[my_ID]-1,j)), 1, MPI_DOUBLE,
		   my_ID-1, j, MPI_COMM_WORLD, &status);
	}
      }

      i = start[my_ID];

      if (shm_ID != 0) {
	ARRAY(i,j) = source_ptr[NBR_INDEX(end[my_ID],j)] + ARRAY(i,j-1) - source_ptr[NBR_INDEX(end[my_ID],j-1)];
	i++;
      }

      for (; i<= end[my_ID]; i++) {
	ARRAY(i,j) = ARRAY(i-1,j) + ARRAY(i,j-1) - ARRAY(i-1,j-1);
      }

      /* if I am not on the right boundary, send data to my right neighbor        */
      if (my_ID != Num_procs-1) {
	if (shm_ID != shm_procs-1) {
	  MPI_Send(&p2pbuf, 0, MPI_INT, shm_ID+1, 1, shm_comm);
	} else {
	  MPI_Send(&(ARRAY(end[my_ID],j)), 1, MPI_DOUBLE,
		   my_ID+1, j, MPI_COMM_WORLD);
	}
      }
    }

    /* copy top right corner value to bottom left corner to create dependency      */
    if (Num_procs >1) {
      if (my_ID==root) {
        corner_val = -ARRAY(end[my_ID],n-1);
        MPI_Send(&corner_val,1,MPI_DOUBLE,0,888,MPI_COMM_WORLD);
      }
      if (my_ID==0) {
        MPI_Recv(&(ARRAY(0,0)),1,MPI_DOUBLE,root,888,MPI_COMM_WORLD,&status);
      }
    }
    else ARRAY(0,0)= -ARRAY(end[my_ID],n-1);

  }

  local_pipeline_time = wtime() - local_pipeline_time;
  MPI_Reduce(&local_pipeline_time, &pipeline_time, 1, MPI_DOUBLE, MPI_MAX, root,
             MPI_COMM_WORLD);

  /*******************************************************************************
  ** Analyze and output results.
  ********************************************************************************/

  /* verify correctness, using top right value                                     */
  corner_val = (double) ((iterations+1)*(m+n-2));
  if (my_ID == root) {
    if (fabs(ARRAY(end[my_ID],n-1)-corner_val)/corner_val >= epsilon) {
      printf("ERROR: checksum %lf does not match verification value %lf\n",
             ARRAY(end[my_ID],n-1), corner_val);
      error = 1;
    }
  }
  bail_out(error);

  if (my_ID == root) {
    avgtime = pipeline_time/iterations;
#ifdef VERBOSE
    printf("Solution validates; verification value = %lf\n", corner_val);
    printf("Point-to-point synchronizations/s: %lf\n",
           ((float)((n-1)*(Num_procs-1)))/(avgtime));
#else
    printf("Solution validates\n");
#endif
    printf("Rate (MFlops/s): %lf Avg time (s): %lf\n",
           1.0E-06 * 2 * ((double)((m-1)*(n-1)))/avgtime, avgtime);
  }

  MPI_Finalize();
  exit(EXIT_SUCCESS);

}  /* end of main */
コード例 #28
0
/// Set up MPI communication. This should be called in all processes
/// before doing anything with this object, either directly or through
/// the constructor that calls it.
void MPIConnection::init( int * argc_p, char ** argv_p[] ) {
  int initialized = 0;
  MPI_CHECK( MPI_Initialized( &initialized ) );

  if( !initialized ) {
    //
    // MPI Boilerplate derived from Grappa
    //
  
    MPI_CHECK( MPI_Init( argc_p, argv_p ) ); 

    // get locale-local MPI communicator
    MPI_CHECK( MPI_Comm_split_type( MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &locale_communicator_ ) );
    MPI_CHECK( MPI_Comm_set_errhandler( locale_communicator_, MPI_ERRORS_RETURN ) );
    MPI_CHECK( MPI_Comm_rank( locale_communicator_, &locale_rank_ ) );
    MPI_CHECK( MPI_Comm_size( locale_communicator_, &locale_size_ ) );

    // get count of locales
    int32_t localesint = locale_rank_ == 0; // each node's rank 0 contributes one; the allreduce sums them
    MPI_CHECK( MPI_Allreduce( MPI_IN_PLACE, &localesint, 1, MPI_INT32_T,
                              MPI_SUM, MPI_COMM_WORLD ) );
    locales_ = localesint;

    // get my locale
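    // Inclusive prefix sum over the per-node indicator gives each node's rank 0
    // a 1-based node index; the broadcast below shares it within the node and
    // the later "-= 1" makes it zero-based.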
    int32_t mylocaleint = locale_rank_ == 0;  // count one per locale and sum
    MPI_CHECK( MPI_Scan( MPI_IN_PLACE, &mylocaleint, 1, MPI_INT32_T,
                         MPI_SUM, MPI_COMM_WORLD ) );
    // copy to all cores in locale
    MPI_CHECK( MPI_Bcast( &mylocaleint, 1, MPI_INT32_T,
                          0, locale_communicator_ ) );
    mylocaleint -= 1; // make zero-indexed
    locale_ = mylocaleint;
    
    // make new communicator with ranks laid out so that nodes hold adjacent ranks
    MPI_CHECK( MPI_Comm_split( MPI_COMM_WORLD, 0, mylocaleint, &main_communicator_ ) );
    MPI_CHECK( MPI_Comm_set_errhandler( main_communicator_, MPI_ERRORS_RETURN ) );
    int main_mycoreint = -1;
    int main_coresint = -1;
    MPI_CHECK( MPI_Comm_rank( main_communicator_, &main_mycoreint ) );
    MPI_CHECK( MPI_Comm_size( main_communicator_, &main_coresint ) );
    rank_ = main_mycoreint;
    size_ = main_coresint;
    
    // verify locale numbering is consistent with locales
    int32_t localemin = std::numeric_limits<int32_t>::max();
    int32_t localemax = std::numeric_limits<int32_t>::min();
    MPI_CHECK( MPI_Reduce( &mylocaleint, &localemin, 1, MPI_INT32_T,
                           MPI_MIN, 0, locale_communicator_ ) );
    MPI_CHECK( MPI_Reduce( &mylocaleint, &localemax, 1, MPI_INT32_T,
                           MPI_MAX, 0, locale_communicator_ ) );
    if( (0 == locale_rank_) && (localemin != localemax) ) {
      std::cerr << "Locale ID is not consistent across locale!\n";
      exit(1);
    }

    // verify locale core count is the same across job
    int32_t locale_coresmin = std::numeric_limits<int32_t>::max();
    int32_t locale_coresmax = std::numeric_limits<int32_t>::min();
    MPI_CHECK( MPI_Reduce( &locale_size_, &locale_coresmin, 1, MPI_INT32_T,
                           MPI_MIN, 0, main_communicator_ ) );
    MPI_CHECK( MPI_Reduce( &locale_size_, &locale_coresmax, 1, MPI_INT32_T,
                           MPI_MAX, 0, main_communicator_ ) );
    if( 0 == rank_ && ( locale_coresmin != locale_coresmax ) ) {
      std::cerr << "Number of cores per locale is not the same across job!\n";
      exit(1);
    }
  
    barrier();
  }
}