Example #1
void
mmm2d_comm_init(mmm2d_comm_struct *comm, MPI_Comm communicator) {
  /* store the original communicator */
  comm->mpicomm_orig = communicator;
  MPI_Comm_size(communicator, &comm->size);

  /* Test whether the communicator is cartesian and correct dimensionality */
  int comm_is_cart = 0;
  int status;
  MPI_Topo_test(communicator, &status);
  if (status == MPI_CART) {
    /* Communicator is cartesian, so test dimensionality */
    int ndims;
    MPI_Cartdim_get(communicator, &ndims);
    if (ndims == 3) {
      /* Correct dimensionality, so get grid and test periodicity */
      int periodicity[3];
      MPI_Cart_get(communicator, 3, comm->node_grid, periodicity, comm->node_pos);
      if (periodicity[0] && periodicity[1] && periodicity[2]) {
        /* If periodicity is correct, we can just use this communicator */
        comm->mpicomm = communicator;
        /* get the rank */
        MPI_Comm_rank(communicator, &comm->rank);
        comm_is_cart = 1;
      }
    }
  }

  /* otherwise, we have to set up the cartesian communicator */
  if (!comm_is_cart) {

    comm->node_grid[0] = 0;
    comm->node_grid[1] = 0;
    comm->node_grid[2] = 0;

    /* compute node grid */
    MPI_Dims_create(comm->size, 3, comm->node_grid);

    /* create communicator */
    int periodicity[3] = {1, 1, 0};
    MPI_Cart_create(comm->mpicomm_orig, 3, comm->node_grid, periodicity, 1, &comm->mpicomm);

    /* get the rank in the new (possibly reordered) communicator */
    MPI_Comm_rank(comm->mpicomm, &comm->rank);
    /* get node pos */
    MPI_Cart_coords(comm->mpicomm, comm->rank, 3, comm->node_pos);
  }
}
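
The reuse-or-create pattern above also appears in Examples #14 and #16. As a stand-alone illustration, here is a minimal sketch of the same logic against the plain MPI API; all names are illustrative and, unlike MMM2D (which deliberately leaves the third direction non-periodic), the freshly created grid here is periodic in all three directions.

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);

  int size, status;
  MPI_Comm cart = MPI_COMM_NULL;
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  /* Reuse the communicator only if it is already a fully periodic 3d grid */
  MPI_Topo_test(MPI_COMM_WORLD, &status);
  if (status == MPI_CART) {
    int ndims;
    MPI_Cartdim_get(MPI_COMM_WORLD, &ndims);
    if (ndims == 3) {
      int dims[3], periods[3], coords[3];
      MPI_Cart_get(MPI_COMM_WORLD, 3, dims, periods, coords);
      if (periods[0] && periods[1] && periods[2])
        cart = MPI_COMM_WORLD;
    }
  }

  /* Otherwise build a fresh periodic 3d Cartesian communicator */
  if (cart == MPI_COMM_NULL) {
    int dims[3] = {0, 0, 0}, periods[3] = {1, 1, 1};
    MPI_Dims_create(size, 3, dims);
    MPI_Cart_create(MPI_COMM_WORLD, 3, dims, periods, 1, &cart);
  }

  int rank, coords[3];
  MPI_Comm_rank(cart, &rank);
  MPI_Cart_coords(cart, rank, 3, coords);
  printf("rank %d sits at (%d, %d, %d)\n", rank, coords[0], coords[1], coords[2]);

  if (cart != MPI_COMM_WORLD)
    MPI_Comm_free(&cart);
  MPI_Finalize();
  return 0;
}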
Example #2
void initialize(field * temperature1, field * temperature2,
                parallel_data * parallel)
{
    int i, j;

    int dims[2], coords[2], periods[2];

    // Allocate also ghost layers
    temperature1->data =
        malloc_2d(temperature1->nx + 2, temperature1->ny + 2);
    temperature2->data =
        malloc_2d(temperature2->nx + 2, temperature2->ny + 2);

    // Initialize to zero
    memset(temperature1->data[0], 0,
           (temperature1->nx + 2) * (temperature1->ny + 2)
           * sizeof(double));

    MPI_Cart_get(parallel->comm, 2, dims, periods, coords);

    // Left boundary
    if (coords[1] == 0)
        for (i = 0; i < temperature1->nx + 2; i++)
            temperature1->data[i][0] = 30.0;

    // Upper boundary
    if (coords[0] == 0)
        for (j = 0; j < temperature1->ny + 2; j++)
            temperature1->data[0][j] = 15.0;

    // Right boundary
    if (coords[1] == dims[1] - 1)
        for (i = 0; i < temperature1->nx + 2; i++)
            temperature1->data[i][temperature1->ny + 1] = -10.0;

    // Lower boundary
    if (coords[0] == dims[0] - 1)
        for (j = 0; j < temperature1->ny + 2; j++)
            temperature1->data[temperature1->nx + 1][j] = -25.0;

    copy_field(temperature1, temperature2);

}
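
As a stand-alone sketch of how the dims/coords pair returned by MPI_Cart_get identifies the boundary-owning ranks: the field, parallel_data, malloc_2d and copy_field types above belong to the surrounding heat-equation code and are not reproduced; everything below is illustrative.

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv) {
    MPI_Init(&argc, &argv);

    int size, rank;
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    int dims[2] = {0, 0}, periods[2] = {0, 0}, coords[2];
    MPI_Comm comm2d;
    MPI_Dims_create(size, 2, dims);
    MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 0, &comm2d);
    MPI_Comm_rank(comm2d, &rank);
    MPI_Cart_get(comm2d, 2, dims, periods, coords);

    /* Same tests as in initialize(): first index = row, second = column */
    printf("rank %d (%d,%d): upper=%d lower=%d left=%d right=%d\n",
           rank, coords[0], coords[1],
           coords[0] == 0, coords[0] == dims[0] - 1,
           coords[1] == 0, coords[1] == dims[1] - 1);

    MPI_Comm_free(&comm2d);
    MPI_Finalize();
    return 0;
}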
Example #3
File: init.c, Project: fweik/scafacos
static void comm_get_periodicity(
    MPI_Comm comm, fcs_int *periodicity
    )
{
  int dims[3], periods[3], coords[3];
 
  /* default: no periodicity given */ 
  for(int t=0; t<3; t++)
    periodicity[t] = -1;

  if( !comm_is_cart_3d(comm) )
    return;
  
  /* for a 3d Cartesian communicator, use the periodicity of comm */
  MPI_Cart_get(comm, 3,
      dims, periods, coords);
  for(int t=0; t<3; t++)
    periodicity[t] = periods[t];
}
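
comm_get_periodicity() relies on a comm_is_cart_3d() helper that is not shown here (it is also used by ifcs_p2nfft_init() in Example #11). A minimal reconstruction of what such a check could look like, based only on the standard topology queries and offered as an assumption rather than the project's actual code:

static int comm_is_cart_3d(MPI_Comm comm)
{
  int status, ndims;

  /* Cartesian topology attached? */
  MPI_Topo_test(comm, &status);
  if (status != MPI_CART)
    return 0;

  /* ...and exactly three dimensions? */
  MPI_Cartdim_get(comm, &ndims);
  return ndims == 3;
}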
Example #4
template <typename T>
int dfft_get_local_size_t(int N0, int N1, int tuple, int * isize, int * istart,
		MPI_Comm c_comm) {
	int nprocs, procid;
	MPI_Comm_rank(c_comm, &procid);

	int coords[2], np[2], periods[2];
	MPI_Cart_get(c_comm, 2, np, periods, coords);
	isize[2] = tuple;
	isize[0] = ceil(N0 / (double) np[0]);
	isize[1] = ceil(N1 / (double) np[1]);

	istart[0] = isize[0] * (coords[0]);
	istart[1] = isize[1] * (coords[1]);
	istart[2] = 0;

	if ((N0 - isize[0] * coords[0]) < isize[0]) {
		isize[0] = N0 - isize[0] * coords[0];
		isize[0] *= (int) isize[0] > 0;
		istart[0] = N0 - isize[0];
	}
	if ((N1 - isize[1] * coords[1]) < isize[1]) {
		isize[1] = N1 - isize[1] * coords[1];
		isize[1] *= (int) isize[1] > 0;
		istart[1] = N1 - isize[1];
	}
#ifdef VERBOSE2
	if (VERBOSE >= 2) {
		for (int r = 0; r < np[0]; r++)
			for (int c = 0; c < np[1]; c++) {
				if ((coords[0] == r) && (coords[1] == c))
					std::cout << coords[0] << "," << coords[1] << " isize[0]= "
							<< isize[0] << " isize[1]= " << isize[1]
							<< " isize[2]= " << isize[2] << " istart[0]= "
							<< istart[0] << " istart[1]= " << istart[1]
							<< " istart[2]= " << istart[2] << std::endl;
			}
	}
#endif
	int alloc_local = isize[0] * isize[1] * isize[2] * sizeof(T);

	return alloc_local;
}
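
The per-dimension arithmetic above is a plain block decomposition in which the last block absorbs the remainder (or becomes empty). A one-dimensional sketch of the same formula, with illustrative names that are not part of the original library:

/* Size and start offset of block 'coord' out of 'np' blocks covering N points;
 * the last block is shortened (possibly to zero) so the blocks exactly tile [0, N). */
static void block_decompose_1d(int N, int np, int coord, int *size, int *start)
{
	int chunk = (N + np - 1) / np;   /* same as ceil(N / (double) np) */
	*size  = chunk;
	*start = chunk * coord;
	if (N - chunk * coord < chunk) { /* this is the last (short or empty) block */
		*size = N - chunk * coord;
		if (*size < 0)
			*size = 0;
		*start = N - *size;
	}
}

For example, N = 10 split over np = 4 blocks gives sizes 3, 3, 3, 1 with starts 0, 3, 6, 9.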
Example #5
static void set_data_2D ( double **data, int rank, int *dim, int hwidth, MPI_Comm cart_comm ) 
{
    int i, j;
    int coords[2];
    int dims[2]; //size of each dimension
    int period[2];


    //Get the coordinate of the process
    //MPI_Cart_coords(cart_comm, rank, 2, coords);
    MPI_Cart_get(cart_comm, 2, dims, period, coords);

    for (i=0; i<dim[0]; i++ ) {
        for (j=0; j<hwidth; j++ ){
            data[i][j] =-1;
        }
        for (j=dim[1]-hwidth; j<dim[1]; j++ ) {
            data[i][j] =-1;
        }
    }

    for ( j=0; j<dim[1]; j++ ) {
        for ( i=0; i<hwidth; i++ ) {
            data[i][j]=-1;
        }
        for ( i=dim[0]-hwidth; i<dim[0]; i++ ) {
            data[i][j]=-1;
        }
    }

    for ( i=hwidth; i<dim[0]-hwidth; i++) {
        for (j=hwidth; j<dim[1]-hwidth; j++ ){
            data[i][j] = (coords[0] * (dim[0]-hwidth*2) + (i-hwidth) ) + (dims[0] * (dim[0] - hwidth*2)) * ((dim[1] - hwidth*2) * coords[1] + (j-hwidth));

        }
    }


    return;
}
Example #6
static int check_data_2D ( double **data, int rank, int *dim, int hwidth, 
        int *neighbors, MPI_Comm cart_comm)
{
    int i, j, lres=1, gres, cumres = 1;
    double should_be;
    int coords[2], n_coords[2], c_coords[2];
    int cart_dims[2]; //size of each dimension
    int period[2];

    MPI_Cart_get(cart_comm, 2, cart_dims, period, coords); 

    //Up HALO Cell
    if(neighbors[0] != MPI_PROC_NULL){
        MPI_Cart_coords (cart_comm, neighbors[0], 2, n_coords);
    }
    for ( j = hwidth; j < dim[1] - hwidth; j++ ) {
        for ( i= dim[0] - hwidth*2; i < dim[0]-hwidth ; i++ ) {
            should_be = calc_entry (i, j, neighbors[0] != MPI_PROC_NULL, dim, cart_dims, n_coords, hwidth ); 
            check_entry2D ( data, i-(dim[0]-hwidth*2), j, should_be, "out1", &lres );
            if ( lres == 0 ) cumres = 0;                 
        }
    }

    //Down HALO Cell 
    if(neighbors[1] != MPI_PROC_NULL){
        MPI_Cart_coords (cart_comm, neighbors[1], 2, n_coords);
    }
    for ( j = hwidth; j < dim[1] - hwidth; j++ ) {
        for ( i= hwidth; i < hwidth * 2 ; i++ ) {
            should_be = calc_entry (i, j, neighbors[1] != MPI_PROC_NULL, dim, cart_dims, n_coords, hwidth ); 
            check_entry2D ( data, i+(dim[0]-hwidth*2), j, should_be, "out1", &lres );
            if ( lres == 0 ) cumres = 0;                 
        }
    }

    //Left HALO Cell  
    if(neighbors[2]!=MPI_PROC_NULL){
        MPI_Cart_coords (cart_comm, neighbors[2], 2, n_coords);
    }
    for ( i = hwidth; i < dim[0] - hwidth; i++ ) {
        for ( j= dim[1] - hwidth*2; j < dim[1]-hwidth ; j++ ) {
            should_be = calc_entry (i, j, neighbors[2] != MPI_PROC_NULL, dim, cart_dims, n_coords, hwidth ); 
            check_entry2D ( data, i, j-(dim[1]-hwidth*2), should_be, "out2", &lres );
            if ( lres == 0 ) cumres = 0;                 
        }
    }

    //Right HALO Cell
    if(neighbors[3]!=MPI_PROC_NULL){
        MPI_Cart_coords (cart_comm, neighbors[3], 2, n_coords);
    }
    for ( i = hwidth; i < dim[0] - hwidth; i++ ) {
        for ( j= hwidth; j < hwidth * 2 ; j++ ) { 
            should_be = calc_entry (i, j, neighbors[3] != MPI_PROC_NULL, dim, cart_dims, n_coords, hwidth ); 
            check_entry2D ( data, i, j+(dim[1]-hwidth*2), should_be, "out2", &lres );
            if ( lres == 0 ) cumres = 0;                 
        }
    }

    //Inside
    for ( i=hwidth; i<dim[0]-hwidth; i++) {
        for (j=hwidth; j<dim[1]-hwidth; j++ ){ 
            should_be = calc_entry (i, j, 1, dim, cart_dims, coords, hwidth ); 
            check_entry2D ( data, i, j, should_be, "inside", &lres );
            if ( lres == 0 ) cumres = 0;                 
        }
    }

    //Up-Left Corner
    if((neighbors[0]!=MPI_PROC_NULL) && (neighbors[2]!=MPI_PROC_NULL)){
        MPI_Cart_coords (cart_comm, neighbors[0], 2, n_coords);
        c_coords[0] = n_coords[0];
        MPI_Cart_coords (cart_comm, neighbors[2], 2, n_coords);
        c_coords[1] = n_coords[1];
    }
    for ( i= dim[0] - hwidth*2; i < dim[0]-hwidth ; i++ ) {
        for ( j= dim[1] - hwidth*2; j < dim[1]-hwidth ; j++ ) {
            should_be = calc_entry (i, j, (neighbors[0]!=MPI_PROC_NULL) && (neighbors[2]!=MPI_PROC_NULL), 
                    dim, cart_dims, c_coords, hwidth ); 
            check_entry2D ( data, i-(dim[0]-hwidth*2), j-(dim[1]-hwidth*2), should_be, "corner1", &lres );
            if ( lres == 0 ) cumres = 0;                 
        }
    }

    //Up-right Corner
    if ((neighbors[0]!=MPI_PROC_NULL) && (neighbors[3]!=MPI_PROC_NULL)){
        MPI_Cart_coords (cart_comm, neighbors[0], 2, n_coords);
        c_coords[0] = n_coords[0];
        MPI_Cart_coords (cart_comm, neighbors[3], 2, n_coords);
        c_coords[1] = n_coords[1];
    }
    for ( i= dim[0] - hwidth*2; i < dim[0]-hwidth ; i++ ) {
        for ( j= hwidth ; j < hwidth * 2 ; j++ ) {
            should_be = calc_entry (i, j, (neighbors[0]!=MPI_PROC_NULL) && (neighbors[3]!=MPI_PROC_NULL), 
                    dim, cart_dims, c_coords, hwidth ); 
            check_entry2D ( data, i-(dim[0]-hwidth*2), j+(dim[1]-hwidth*2), should_be, "corner2", &lres );
            if ( lres == 0 ) cumres = 0;                 
        }
    }

    //Down-left Corner
    if((neighbors[1]!=MPI_PROC_NULL) && (neighbors[2]!=MPI_PROC_NULL)){
        MPI_Cart_coords (cart_comm, neighbors[1], 2, n_coords);
        c_coords[0] = n_coords[0];
        MPI_Cart_coords (cart_comm, neighbors[2], 2, n_coords);
        c_coords[1] = n_coords[1];
    }
    for ( i= hwidth; i < hwidth * 2; i++ ) {
        for ( j= dim[1] - hwidth*2; j < dim[1]-hwidth ; j++ ) {
            should_be = calc_entry (i, j, (neighbors[1]!=MPI_PROC_NULL) && (neighbors[2]!=MPI_PROC_NULL), 
                    dim, cart_dims, c_coords, hwidth ); 
            check_entry2D ( data, i+(dim[0]-hwidth*2), j-(dim[1]-hwidth*2), should_be, "corner3", &lres );
            if ( lres == 0 ) cumres = 0;                 
        }
    }

    //Down-right Corner
    if((neighbors[1]!=MPI_PROC_NULL) && (neighbors[3]!=MPI_PROC_NULL)){
        MPI_Cart_coords (cart_comm, neighbors[1], 2, n_coords);
        c_coords[0] = n_coords[0];
        MPI_Cart_coords (cart_comm, neighbors[3], 2, n_coords);
        c_coords[1] = n_coords[1];
    }
    for ( i= hwidth; i < hwidth * 2; i++ ) {
        for ( j= hwidth ; j < hwidth * 2 ; j++ ) {
            should_be = calc_entry (i, j, (neighbors[1]!=MPI_PROC_NULL) && (neighbors[3]!=MPI_PROC_NULL), 
                    dim, cart_dims, c_coords, hwidth ); 
            check_entry2D ( data, i+(dim[0]-hwidth*2), j+(dim[1]-hwidth*2), should_be, "corner4", &lres );
            if ( lres == 0 ) cumres = 0;                 
        }
    }

    MPI_Allreduce ( &cumres, &gres, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD );
    if ( gres != 1 ) {
        return 0; 
    }

    return 1;
}
Example #7
File: cart.c, Project: Shurakai/SimGrid
int main( int argc, char **argv )
{
    int              rank, size, i;
    int              errors=0;
    int              dims[NUM_DIMS];
    int              periods[NUM_DIMS];
    int              coords[NUM_DIMS];
    int              new_coords[NUM_DIMS];
    int              reorder = 1;
    MPI_Comm         comm_temp, comm_cart, new_comm;
    int              topo_status;
    int              ndims;
    int              new_rank;
    int              remain_dims[NUM_DIMS];
    int              newnewrank;

    MPI_Init( &argc, &argv );

    MPI_Comm_rank( MPI_COMM_WORLD, &rank );
    MPI_Comm_size( MPI_COMM_WORLD, &size );

    /* Clear dims array and get dims for topology */
    for(i=0;i<NUM_DIMS;i++) { dims[i] = 0; periods[i] = 0; }
    MPI_Dims_create ( size, NUM_DIMS, dims );

    /* Make a new communicator with a topology */
    MPI_Cart_create ( MPI_COMM_WORLD, 2, dims, periods, reorder, &comm_temp );
    MPI_Comm_dup ( comm_temp, &comm_cart );

    /* Determine the status of the new communicator */
    MPI_Topo_test ( comm_cart, &topo_status );
    if (topo_status != MPI_CART) {
	printf( "topo_status of duped comm is not MPI_CART\n" );
	errors++;
    }

    /* How many dims do we have? */
    MPI_Cartdim_get( comm_cart, &ndims );
    if ( ndims != NUM_DIMS ) {
	printf( "Number of dims of duped comm (%d) should be %d\n", 
		ndims, NUM_DIMS );
	errors++;
    }

    /* Get the topology, does it agree with what we put in? */
    for(i=0;i<NUM_DIMS;i++) { dims[i] = 0; periods[i] = 0; }
    MPI_Cart_get ( comm_cart, NUM_DIMS, dims, periods, coords );

    /* Does the mapping from coords to rank work? */
    MPI_Cart_rank ( comm_cart, coords, &new_rank );
    if ( new_rank != rank ) {
	printf( "New rank of duped comm (%d) != old rank (%d)\n", 
		new_rank, rank );
	errors++;
    }

    /* Does the mapping from rank to coords work */
    MPI_Cart_coords ( comm_cart, rank, NUM_DIMS, new_coords );
    for (i=0;i<NUM_DIMS;i++) 
	if ( coords[i] != new_coords[i] ) {
	    printf( "Old coords[%d] of duped comm (%d) != new_coords (%d)\n", 
		    i, coords[i], new_coords[i] );
	    errors++;
	}

    /* Let's shift in each dimension and see how it works!   */
    /* Because it's late and I'm tired, I'm not making this  */
    /* automatically test itself.                            */
    for (i=0;i<NUM_DIMS;i++) {
      int source, dest;
      MPI_Cart_shift(comm_cart, i, 1, &source, &dest);
#ifdef VERBOSE      
      printf ("[%d] Shifting %d in the %d dimension\n",rank,1,i);
      printf ("[%d]    source = %d  dest = %d\n",rank,source,dest); 
#endif
    }

    /* Subdivide */
    remain_dims[0] = 0; 
    for (i=1; i<NUM_DIMS; i++) remain_dims[i] = 1;
    MPI_Cart_sub ( comm_cart, remain_dims, &new_comm );

    /* Determine the status of the new communicator */
    MPI_Topo_test ( new_comm, &topo_status );
    if (topo_status != MPI_CART) {
	printf( "topo_status of cartsub comm is not MPI_CART\n" );
	errors++;
    }

    /* How many dims do we have? */
    MPI_Cartdim_get( new_comm, &ndims );
    if ( ndims != NUM_DIMS-1 ) {
	printf( "Number of dims of cartsub comm (%d) should be %d\n", 
		ndims, NUM_DIMS-1 );
	errors++;
    }

    /* Get the topology, does it agree with what we put in? */
    for(i=0;i<NUM_DIMS-1;i++) { dims[i] = 0; periods[i] = 0; }
    MPI_Cart_get ( new_comm, ndims, dims, periods, coords );
    
    /* Does the mapping from coords to rank work? */
    MPI_Comm_rank ( new_comm, &newnewrank );
    MPI_Cart_rank ( new_comm, coords, &new_rank );
    if ( new_rank != newnewrank ) {
	printf( "New rank of cartsub comm (%d) != old rank (%d)\n", 
		new_rank, newnewrank );
	errors++;
    }

    /* Does the mapping from rank to coords work */
    MPI_Cart_coords ( new_comm, new_rank, NUM_DIMS -1, new_coords );
    for (i=0;i<NUM_DIMS-1;i++) 
	if ( coords[i] != new_coords[i] ) {
	    printf( "Old coords[%d] of cartsub comm (%d) != new_coords (%d)\n", 
		    i, coords[i], new_coords[i] );
	    errors++;
	}

    /* We're at the end */
    MPI_Comm_free( &new_comm );
    MPI_Comm_free( &comm_temp );
    MPI_Comm_free( &comm_cart );
    Test_Waitforall( );
    if (errors) printf( "[%d] done with %d ERRORS!\n", rank,errors );
    MPI_Finalize();
    return 0;
}
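
The test above depends on a couple of definitions supplied by the surrounding MPI test harness that are not shown. A plausible reconstruction (NUM_DIMS must be 2 for the ndims check after MPI_Cart_create to pass; Test_Waitforall is assumed to be the harness's end-of-test synchronization routine):

#include <stdio.h>
#include "mpi.h"

#define NUM_DIMS 2            /* matches the 2 passed to MPI_Cart_create */

void Test_Waitforall(void);   /* provided by the test harness, not by MPI */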
Example #8
File: cart.c, Project: Katetc/cime
FC_FUNC( mpi_cart_get , MPI_CART_GET )
         (int * comm, int * maxdims, int * dims,
          int * periods, int * coords, int * ierr)
{
  *ierr = MPI_Cart_get(*comm, *maxdims, dims, periods, coords);
}
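
FC_FUNC is the usual autoconf-style Fortran name-mangling macro, so the wrapper above is exported under whatever symbol name the Fortran compiler expects. With lower-case-plus-underscore mangling it would expand roughly as sketched below (illustrative only). Passing the dereferenced int straight to MPI_Cart_get suggests MPI_Comm is an integer handle in this build, as in a serial MPI stub; with a full MPI implementation one would convert the handle via MPI_Comm_f2c instead.

void mpi_cart_get_(int * comm, int * maxdims, int * dims,
                   int * periods, int * coords, int * ierr)
{
  *ierr = MPI_Cart_get(*comm, *maxdims, dims, periods, coords);
}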
Example #9
File: scatterv.c, Project: NexMirror/MPICH
int main(int argc, char **argv)
{
    int rank, size, myrow, mycol, nx, ny, stride, cnt, i, j, errs, errs_in_place, tot_errs;
    double *sendbuf, *recvbuf;
    MPI_Datatype vec, block, types[2];
    MPI_Aint displs[2];
    int *scdispls;
    int blens[2];
    MPI_Comm comm2d;
    int dims[2], periods[2], coords[2], lcoords[2];
    int *sendcounts;


    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* Get a 2-d decomposition of the processes */
    dims[0] = 0;
    dims[1] = 0;
    MPI_Dims_create(size, 2, dims);
    periods[0] = 0;
    periods[1] = 0;
    MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 0, &comm2d);
    MPI_Cart_get(comm2d, 2, dims, periods, coords);
    myrow = coords[0];
    mycol = coords[1];
/*
    if (rank == 0)
        printf("Decomposition is [%d x %d]\n", dims[0], dims[1]);
*/

    /* Get the size of the matrix */
    nx = 10;
    ny = 8;
    stride = nx * dims[0];

    recvbuf = (double *) malloc(nx * ny * sizeof(double));
    if (!recvbuf) {
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
    sendbuf = 0;
    if (myrow == 0 && mycol == 0) {
        sendbuf = (double *) malloc(nx * ny * size * sizeof(double));
        if (!sendbuf) {
            MPI_Abort(MPI_COMM_WORLD, 1);
        }
    }
    sendcounts = (int *) malloc(size * sizeof(int));
    scdispls = (int *) malloc(size * sizeof(int));

    MPI_Type_vector(ny, nx, stride, MPI_DOUBLE, &vec);
    blens[0] = 1;
    blens[1] = 1;
    types[0] = vec;
    types[1] = MPI_UB;
    displs[0] = 0;
    displs[1] = nx * sizeof(double);

    MPI_Type_struct(2, blens, displs, types, &block);
    MPI_Type_free(&vec);
    MPI_Type_commit(&block);

    /* Set up the transfer */
    cnt = 0;
    for (i = 0; i < dims[1]; i++) {
        for (j = 0; j < dims[0]; j++) {
            sendcounts[cnt] = 1;
            /* Using Cart_coords makes sure that ranks (used by
             * sendrecv) matches the cartesian coordinates (used to
             * set data in the matrix) */
            MPI_Cart_coords(comm2d, cnt, 2, lcoords);
            scdispls[cnt++] = lcoords[0] + lcoords[1] * (dims[0] * ny);
        }
    }

    SetData(sendbuf, recvbuf, nx, ny, myrow, mycol, dims[0], dims[1]);
    MPI_Scatterv(sendbuf, sendcounts, scdispls, block, recvbuf, nx * ny, MPI_DOUBLE, 0, comm2d);
    if ((errs = CheckData(recvbuf, nx, ny, myrow, mycol, dims[0], 0))) {
        fprintf(stdout, "Failed to transfer data\n");
    }

    /* once more, but this time passing MPI_IN_PLACE for the root */
    SetData(sendbuf, recvbuf, nx, ny, myrow, mycol, dims[0], dims[1]);
    MPI_Scatterv(sendbuf, sendcounts, scdispls, block,
                 (rank == 0 ? MPI_IN_PLACE : recvbuf), nx * ny, MPI_DOUBLE, 0, comm2d);
    errs_in_place = CheckData(recvbuf, nx, ny, myrow, mycol, dims[0], (rank == 0));
    if (errs_in_place) {
        fprintf(stdout, "Failed to transfer data (MPI_IN_PLACE)\n");
    }

    errs += errs_in_place;
    MPI_Allreduce(&errs, &tot_errs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    if (rank == 0) {
        if (tot_errs == 0)
            printf(" No Errors\n");
        else
            printf("%d errors in use of MPI_SCATTERV\n", tot_errs);
    }

    if (sendbuf)
        free(sendbuf);
    free(recvbuf);
    free(sendcounts);
    free(scdispls);
    MPI_Type_free(&block);
    MPI_Comm_free(&comm2d);
    MPI_Finalize();
    return errs;
}
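
One portability note on the datatype construction above: MPI_UB and MPI_Type_struct were deprecated in MPI-2 and removed in MPI-3. On a current MPI the same row-extent block type can be built with MPI_Type_create_resized; a sketch reusing the ny/nx/stride/vec/block variables from the example:

    MPI_Type_vector(ny, nx, stride, MPI_DOUBLE, &vec);
    /* shrink the extent to one row of nx doubles so consecutive blocks interleave */
    MPI_Type_create_resized(vec, 0, (MPI_Aint)(nx * sizeof(double)), &block);
    MPI_Type_free(&vec);
    MPI_Type_commit(&block);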
Example #10
File: main.c, Project: koichi626/GraphGPU
int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);

  setup_globals();

  /* Parse arguments. */
  int SCALE = 16;
  int edgefactor = 16; /* nedges / nvertices, i.e., 2*avg. degree */
  // if (argc >= 2) SCALE = atoi(argv[1]);
  // if (argc >= 3) edgefactor = atoi(argv[2]);
  char* name = argv[1];
  if (argc >= 3) SCALE = atoi(argv[2]);
  if (argc >= 4) edgefactor = atoi(argv[3]);
  // if (argc <= 1 || argc >= 4 || SCALE == 0 || edgefactor == 0) {
  //   if (rank == 0) {
  //     fprintf(stderr, "Usage: %s SCALE edgefactor\n  SCALE = log_2(# vertices) [integer, required]\n  edgefactor = (# edges) / (# vertices) = .5 * (average vertex degree) [integer, defaults to 16]\n(Random number seed and Kronecker initiator are in main.c)\n", argv[0]);
  //   }
  if (argc <= 2 || argc >= 5 || SCALE == 0 || edgefactor == 0) {
    if (rank == 0) {
      fprintf(stderr, "Usage: %s filename SCALE edgefactor\n  SCALE = log_2(# vertices) [integer, required]\n  edgefactor = (# edges) / (# vertices) = .5 * (average vertex degree) [integer, defaults to 16]\n(Random number seed and Kronecker initiator are in main.c)\n", argv[0]);
    }
    MPI_Abort(MPI_COMM_WORLD, 1);
  }
  uint64_t seed1 = 2, seed2 = 3;

  // const char* filename = getenv("TMPFILE");
  const char* filename = name;

  /* If filename is NULL, store data in memory */

  tuple_graph tg;
  tg.nglobaledges = (int64_t)(edgefactor) << SCALE;
  int64_t nglobalverts = (int64_t)(1) << SCALE;

  tg.data_in_file = (filename != NULL);

  if (tg.data_in_file) {
      printf("data in file \n");

    MPI_File_set_errhandler(MPI_FILE_NULL, MPI_ERRORS_ARE_FATAL);
    // MPI_File_open(MPI_COMM_WORLD, (char*)filename, MPI_MODE_RDWR | MPI_MODE_CREATE | MPI_MODE_EXCL | MPI_MODE_DELETE_ON_CLOSE | MPI_MODE_UNIQUE_OPEN, MPI_INFO_NULL, &tg.edgefile);
    MPI_File_open(MPI_COMM_WORLD, (char*)filename, MPI_MODE_RDWR | MPI_MODE_CREATE | MPI_MODE_EXCL | MPI_MODE_UNIQUE_OPEN, MPI_INFO_NULL, &tg.edgefile);
    MPI_File_set_size(tg.edgefile, tg.nglobaledges * sizeof(packed_edge));
    MPI_File_set_view(tg.edgefile, 0, packed_edge_mpi_type, packed_edge_mpi_type, "native", MPI_INFO_NULL);
    MPI_File_set_atomicity(tg.edgefile, 0);
  }

  /* Make the raw graph edges. */
  /* Get roots for BFS runs, plus maximum vertex with non-zero degree (used by
   * validator). */
  int num_bfs_roots = 64;
  int64_t* bfs_roots = (int64_t*)xmalloc(num_bfs_roots * sizeof(int64_t));
  int64_t max_used_vertex = 0;

  double make_graph_start = MPI_Wtime();
  {
    /* Spread the two 64-bit numbers into five nonzero values in the correct
     * range. */
    uint_fast32_t seed[5];
    make_mrg_seed(seed1, seed2, seed);

    /* As the graph is being generated, also keep a bitmap of vertices with
     * incident edges.  We keep a grid of processes, each row of which has a
     * separate copy of the bitmap (distributed among the processes in the
     * row), and then do an allreduce at the end.  This scheme is used to avoid
     * non-local communication and reading the file separately just to find BFS
     * roots. */
    MPI_Offset nchunks_in_file = (tg.nglobaledges + FILE_CHUNKSIZE - 1) / FILE_CHUNKSIZE;
    int64_t bitmap_size_in_bytes = int64_min(BITMAPSIZE, (nglobalverts + CHAR_BIT - 1) / CHAR_BIT);
    if (bitmap_size_in_bytes * size * CHAR_BIT < nglobalverts) {
      bitmap_size_in_bytes = (nglobalverts + size * CHAR_BIT - 1) / (size * CHAR_BIT);
    }
    int ranks_per_row = ((nglobalverts + CHAR_BIT - 1) / CHAR_BIT + bitmap_size_in_bytes - 1) / bitmap_size_in_bytes;
    int nrows = size / ranks_per_row;
    int my_row = -1, my_col = -1;
    unsigned char* restrict has_edge = NULL;
    MPI_Comm cart_comm;
    {
      int dims[2] = {size / ranks_per_row, ranks_per_row};
      int periods[2] = {0, 0};
      MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 1, &cart_comm);
    }
    int in_generating_rectangle = 0;
    if (cart_comm != MPI_COMM_NULL) {
      in_generating_rectangle = 1;
      {
        int dims[2], periods[2], coords[2];
        MPI_Cart_get(cart_comm, 2, dims, periods, coords);
        my_row = coords[0];
        my_col = coords[1];
      }
      MPI_Comm this_col;
      MPI_Comm_split(cart_comm, my_col, my_row, &this_col);
      MPI_Comm_free(&cart_comm);
      has_edge = (unsigned char*)xMPI_Alloc_mem(bitmap_size_in_bytes);
      memset(has_edge, 0, bitmap_size_in_bytes);
      /* Every rank in a given row creates the same vertices (for updating the
       * bitmap); only one writes them to the file (or final memory buffer). */
      packed_edge* buf = (packed_edge*)xmalloc(FILE_CHUNKSIZE * sizeof(packed_edge));
      MPI_Offset block_limit = (nchunks_in_file + nrows - 1) / nrows;
      // fprintf(stderr, "%d: nchunks_in_file = %" PRId64 ", block_limit = %" PRId64 " in grid of %d rows, %d cols\n", rank, (int64_t)nchunks_in_file, (int64_t)block_limit, nrows, ranks_per_row);
      if (tg.data_in_file) {
        tg.edgememory_size = 0;
        tg.edgememory = NULL;
      } else {
        int my_pos = my_row + my_col * nrows;
        int last_pos = (tg.nglobaledges % ((int64_t)FILE_CHUNKSIZE * nrows * ranks_per_row) != 0) ?
                       (tg.nglobaledges / FILE_CHUNKSIZE) % (nrows * ranks_per_row) :
                       -1;
        int64_t edges_left = tg.nglobaledges % FILE_CHUNKSIZE;
        int64_t nedges = FILE_CHUNKSIZE * (tg.nglobaledges / ((int64_t)FILE_CHUNKSIZE * nrows * ranks_per_row)) +
                         FILE_CHUNKSIZE * (my_pos < (tg.nglobaledges / FILE_CHUNKSIZE) % (nrows * ranks_per_row)) +
                         (my_pos == last_pos ? edges_left : 0);
        /* fprintf(stderr, "%d: nedges = %" PRId64 " of %" PRId64 "\n", rank, (int64_t)nedges, (int64_t)tg.nglobaledges); */
        tg.edgememory_size = nedges;
        tg.edgememory = (packed_edge*)xmalloc(nedges * sizeof(packed_edge));
      }
      MPI_Offset block_idx;
      for (block_idx = 0; block_idx < block_limit; ++block_idx) {
        /* fprintf(stderr, "%d: On block %d of %d\n", rank, (int)block_idx, (int)block_limit); */
        MPI_Offset start_edge_index = int64_min(FILE_CHUNKSIZE * (block_idx * nrows + my_row), tg.nglobaledges);
        MPI_Offset edge_count = int64_min(tg.nglobaledges - start_edge_index, FILE_CHUNKSIZE);
        packed_edge* actual_buf = (!tg.data_in_file && block_idx % ranks_per_row == my_col) ?
                                  tg.edgememory + FILE_CHUNKSIZE * (block_idx / ranks_per_row) :
                                  buf;
        /* fprintf(stderr, "%d: My range is [%" PRId64 ", %" PRId64 ") %swriting into index %" PRId64 "\n", rank, (int64_t)start_edge_index, (int64_t)(start_edge_index + edge_count), (my_col == (block_idx % ranks_per_row)) ? "" : "not ", (int64_t)(FILE_CHUNKSIZE * (block_idx / ranks_per_row))); */
        if (!tg.data_in_file && block_idx % ranks_per_row == my_col) {
          assert (FILE_CHUNKSIZE * (block_idx / ranks_per_row) + edge_count <= tg.edgememory_size);
        }

	// debug
	char* wtxbuf = (char*)xmalloc(FILE_CHUNKSIZE * sizeof(packed_edge));

        // generate_kronecker_range(seed, SCALE, start_edge_index, start_edge_index + edge_count, actual_buf);
        generate_kronecker_range(seed, SCALE, start_edge_index, start_edge_index + edge_count, actual_buf);
        if (tg.data_in_file && my_col == (block_idx % ranks_per_row)) { /* Try to spread writes among ranks */
          // MPI_File_write_at(tg.edgefile, start_edge_index, actual_buf, edge_count, packed_edge_mpi_type, MPI_STATUS_IGNORE);


	    // debug
	    printf("%d: %d, %d\n", rank, start_edge_index, edge_count);
	    int i;
	    // for (i = start_edge_index; i < start_edge_index + 3; i++) {
	    // if(block_idx == 0) {
	    // 	for (i = 0; i < 3; i++) {
	    // 	    if (edge_count > 3)
	    // 		printf("%d: %d\t%d\n", rank, actual_buf[i].v0, actual_buf[i].v1);
	    // 	}

	    // }

	    
	    

          MPI_File_write_at(tg.edgefile, start_edge_index, actual_buf, edge_count, packed_edge_mpi_type, MPI_STATUS_IGNORE);
        }
        ptrdiff_t i;
#ifdef _OPENMP
#pragma omp parallel for
#endif
        for (i = 0; i < edge_count; ++i) {
          int64_t src = get_v0_from_edge(&actual_buf[i]);
          int64_t tgt = get_v1_from_edge(&actual_buf[i]);
          if (src == tgt) continue;
          if (src / bitmap_size_in_bytes / CHAR_BIT == my_col) {
#ifdef _OPENMP
#pragma omp atomic
#endif
            has_edge[(src / CHAR_BIT) % bitmap_size_in_bytes] |= (1 << (src % CHAR_BIT));
          }
          if (tgt / bitmap_size_in_bytes / CHAR_BIT == my_col) {
#ifdef _OPENMP
#pragma omp atomic
#endif
            has_edge[(tgt / CHAR_BIT) % bitmap_size_in_bytes] |= (1 << (tgt % CHAR_BIT));
          }
        }
      }
      free(buf);
#if 0
      /* The allreduce for each root acts like we did this: */
      MPI_Allreduce(MPI_IN_PLACE, has_edge, bitmap_size_in_bytes, MPI_UNSIGNED_CHAR, MPI_BOR, this_col);
#endif
      MPI_Comm_free(&this_col);
    } else {
      tg.edgememory = NULL;
      tg.edgememory_size = 0;
    }
    MPI_Allreduce(&tg.edgememory_size, &tg.max_edgememory_size, 1, MPI_INT64_T, MPI_MAX, MPI_COMM_WORLD);

#ifndef GEN_ONLY
    /* Find roots and max used vertex */
    {
      uint64_t counter = 0;
      int bfs_root_idx;
      for (bfs_root_idx = 0; bfs_root_idx < num_bfs_roots; ++bfs_root_idx) {
        int64_t root;
        while (1) {
          double d[2];
          make_random_numbers(2, seed1, seed2, counter, d);
          root = (int64_t)((d[0] + d[1]) * nglobalverts) % nglobalverts;
          counter += 2;
          if (counter > 2 * nglobalverts) break;
          int is_duplicate = 0;
          int i;
          for (i = 0; i < bfs_root_idx; ++i) {
            if (root == bfs_roots[i]) {
              is_duplicate = 1;
              break;
            }
          }
          if (is_duplicate) continue; /* Everyone takes the same path here */
          int root_ok = 0;
          if (in_generating_rectangle && (root / CHAR_BIT / bitmap_size_in_bytes) == my_col) {
            root_ok = (has_edge[(root / CHAR_BIT) % bitmap_size_in_bytes] & (1 << (root % CHAR_BIT))) != 0;
          }
          MPI_Allreduce(MPI_IN_PLACE, &root_ok, 1, MPI_INT, MPI_LOR, MPI_COMM_WORLD);
          if (root_ok) break;
        }
        bfs_roots[bfs_root_idx] = root;
      }
      num_bfs_roots = bfs_root_idx;

      /* Find maximum non-zero-degree vertex. */
      {
        int64_t i;
        max_used_vertex = 0;
        if (in_generating_rectangle) {
          for (i = bitmap_size_in_bytes * CHAR_BIT; i > 0; --i) {
            if (i > nglobalverts) continue;
            if (has_edge[(i - 1) / CHAR_BIT] & (1 << ((i - 1) % CHAR_BIT))) {
              max_used_vertex = (i - 1) + my_col * CHAR_BIT * bitmap_size_in_bytes;
              break;
            }
          }
        }
        MPI_Allreduce(MPI_IN_PLACE, &max_used_vertex, 1, MPI_INT64_T, MPI_MAX, MPI_COMM_WORLD);
      }
    }
#endif

    if (in_generating_rectangle) {
      MPI_Free_mem(has_edge);
    }
    if (tg.data_in_file) {
      MPI_File_sync(tg.edgefile);
    }
  }

  double make_graph_stop = MPI_Wtime();
  double make_graph_time = make_graph_stop - make_graph_start;
  if (rank == 0) { /* Not an official part of the results */
    fprintf(stderr, "graph_generation:               %f s\n", make_graph_time);
  }


  //debug
#ifndef GEN_ONLY //!GEN_ONLY

  /* Make user's graph data structure. */
  double data_struct_start = MPI_Wtime();
  make_graph_data_structure(&tg);
  double data_struct_stop = MPI_Wtime();
  double data_struct_time = data_struct_stop - data_struct_start;
  if (rank == 0) { /* Not an official part of the results */
    fprintf(stderr, "construction_time:              %f s\n", data_struct_time);
  }

  /* Number of edges visited in each BFS; a double so get_statistics can be
   * used directly. */
  double* edge_counts = (double*)xmalloc(num_bfs_roots * sizeof(double));

  /* Run BFS. */
  int validation_passed = 1;
  double* bfs_times = (double*)xmalloc(num_bfs_roots * sizeof(double));
  double* validate_times = (double*)xmalloc(num_bfs_roots * sizeof(double));
  uint64_t nlocalverts = get_nlocalverts_for_pred();
  int64_t* pred = (int64_t*)xMPI_Alloc_mem(nlocalverts * sizeof(int64_t));

  int bfs_root_idx;
  for (bfs_root_idx = 0; bfs_root_idx < num_bfs_roots; ++bfs_root_idx) {
    int64_t root = bfs_roots[bfs_root_idx];

    if (rank == 0) fprintf(stderr, "Running BFS %d\n", bfs_root_idx);

    /* Clear the pred array. */
    memset(pred, 0, nlocalverts * sizeof(int64_t));

    /* Do the actual BFS. */
    double bfs_start = MPI_Wtime();
    run_bfs(root, &pred[0]);
    double bfs_stop = MPI_Wtime();
    bfs_times[bfs_root_idx] = bfs_stop - bfs_start;
    if (rank == 0) fprintf(stderr, "Time for BFS %d is %f\n", bfs_root_idx, bfs_times[bfs_root_idx]);

    /* Validate result. */
    if (rank == 0) fprintf(stderr, "Validating BFS %d\n", bfs_root_idx);

    double validate_start = MPI_Wtime();
    int64_t edge_visit_count;
    int validation_passed_one = validate_bfs_result(&tg, max_used_vertex + 1, nlocalverts, root, pred, &edge_visit_count);
    double validate_stop = MPI_Wtime();
    validate_times[bfs_root_idx] = validate_stop - validate_start;
    if (rank == 0) fprintf(stderr, "Validate time for BFS %d is %f\n", bfs_root_idx, validate_times[bfs_root_idx]);
    edge_counts[bfs_root_idx] = (double)edge_visit_count;
    if (rank == 0) fprintf(stderr, "TEPS for BFS %d is %g\n", bfs_root_idx, edge_visit_count / bfs_times[bfs_root_idx]);

    if (!validation_passed_one) {
      validation_passed = 0;
      if (rank == 0) fprintf(stderr, "Validation failed for this BFS root; skipping rest.\n");
      break;
    }
  }

  MPI_Free_mem(pred);
  free(bfs_roots);
  free_graph_data_structure();

#endif //!GEN_ONLY

  if (tg.data_in_file) {
    MPI_File_close(&tg.edgefile);
  } else {
    free(tg.edgememory); tg.edgememory = NULL;
  }

#ifndef GEN_ONLY
  /* Print results. */
  if (rank == 0) {
    if (!validation_passed) {
      fprintf(stdout, "No results printed for invalid run.\n");
    } else {
      int i;
      fprintf(stdout, "SCALE:                          %d\n", SCALE);
      fprintf(stdout, "edgefactor:                     %d\n", edgefactor);
      fprintf(stdout, "NBFS:                           %d\n", num_bfs_roots);
      fprintf(stdout, "graph_generation:               %g\n", make_graph_time);
      fprintf(stdout, "num_mpi_processes:              %d\n", size);
      fprintf(stdout, "construction_time:              %g\n", data_struct_time);
      double stats[s_LAST];
      get_statistics(bfs_times, num_bfs_roots, stats);
      fprintf(stdout, "min_time:                       %g\n", stats[s_minimum]);
      fprintf(stdout, "firstquartile_time:             %g\n", stats[s_firstquartile]);
      fprintf(stdout, "median_time:                    %g\n", stats[s_median]);
      fprintf(stdout, "thirdquartile_time:             %g\n", stats[s_thirdquartile]);
      fprintf(stdout, "max_time:                       %g\n", stats[s_maximum]);
      fprintf(stdout, "mean_time:                      %g\n", stats[s_mean]);
      fprintf(stdout, "stddev_time:                    %g\n", stats[s_std]);
      get_statistics(edge_counts, num_bfs_roots, stats);
      fprintf(stdout, "min_nedge:                      %.11g\n", stats[s_minimum]);
      fprintf(stdout, "firstquartile_nedge:            %.11g\n", stats[s_firstquartile]);
      fprintf(stdout, "median_nedge:                   %.11g\n", stats[s_median]);
      fprintf(stdout, "thirdquartile_nedge:            %.11g\n", stats[s_thirdquartile]);
      fprintf(stdout, "max_nedge:                      %.11g\n", stats[s_maximum]);
      fprintf(stdout, "mean_nedge:                     %.11g\n", stats[s_mean]);
      fprintf(stdout, "stddev_nedge:                   %.11g\n", stats[s_std]);
      double* secs_per_edge = (double*)xmalloc(num_bfs_roots * sizeof(double));
      for (i = 0; i < num_bfs_roots; ++i) secs_per_edge[i] = bfs_times[i] / edge_counts[i];
      get_statistics(secs_per_edge, num_bfs_roots, stats);
      fprintf(stdout, "min_TEPS:                       %g\n", 1. / stats[s_maximum]);
      fprintf(stdout, "firstquartile_TEPS:             %g\n", 1. / stats[s_thirdquartile]);
      fprintf(stdout, "median_TEPS:                    %g\n", 1. / stats[s_median]);
      fprintf(stdout, "thirdquartile_TEPS:             %g\n", 1. / stats[s_firstquartile]);
      fprintf(stdout, "max_TEPS:                       %g\n", 1. / stats[s_minimum]);
      fprintf(stdout, "harmonic_mean_TEPS:             %g\n", 1. / stats[s_mean]);
      /* Formula from:
       * Title: The Standard Errors of the Geometric and Harmonic Means and
       *        Their Application to Index Numbers
       * Author(s): Nilan Norris
       * Source: The Annals of Mathematical Statistics, Vol. 11, No. 4 (Dec., 1940), pp. 445-448
       * Publisher(s): Institute of Mathematical Statistics
       * Stable URL: http://www.jstor.org/stable/2235723
       * (same source as in specification). */
      fprintf(stdout, "harmonic_stddev_TEPS:           %g\n", stats[s_std] / (stats[s_mean] * stats[s_mean] * sqrt(num_bfs_roots - 1)));
      free(secs_per_edge); secs_per_edge = NULL;
      free(edge_counts); edge_counts = NULL;
      get_statistics(validate_times, num_bfs_roots, stats);
      fprintf(stdout, "min_validate:                   %g\n", stats[s_minimum]);
      fprintf(stdout, "firstquartile_validate:         %g\n", stats[s_firstquartile]);
      fprintf(stdout, "median_validate:                %g\n", stats[s_median]);
      fprintf(stdout, "thirdquartile_validate:         %g\n", stats[s_thirdquartile]);
      fprintf(stdout, "max_validate:                   %g\n", stats[s_maximum]);
      fprintf(stdout, "mean_validate:                  %g\n", stats[s_mean]);
      fprintf(stdout, "stddev_validate:                %g\n", stats[s_std]);
#if 0
      for (i = 0; i < num_bfs_roots; ++i) {
        fprintf(stdout, "Run %3d:                        %g s, validation %g s\n", i + 1, bfs_times[i], validate_times[i]);
      }
#endif
    }
  }
  free(bfs_times);
  free(validate_times);

#endif
  cleanup_globals();
  MPI_Finalize();
  return 0;
}
Example #11
File: init.c, Project: fweik/scafacos
FCSResult ifcs_p2nfft_init(
    void **rd, MPI_Comm comm
    )
{
  const char *fnc_name = "ifcs_p2nfft_init";
  ifcs_p2nfft_data_struct *d;

  /* return error if method context is already allocated */
  if (*rd != NULL) 
    return fcs_result_create(FCS_ERROR_LOGICAL_ERROR, fnc_name, "Multiple init of method context without finalize.");
  
  /* Initialize the PNFFT library */
  FCS_PNFFT(init)();
 
  /* Create data structure */ 
  d = mkplan_p2nfft();

  /* return error if allocation failed */
  if (d == NULL)
    return fcs_result_create(FCS_ERROR_ALLOC_FAILED, fnc_name, "Allocation of the p2nfft data structure failed.");

#if FCS_P2NFFT_USE_3D_PROCMESH 
  /* Create a three-dimensional cartesian comm
     from the given (possibly non-cartesian) one. */
  if( !comm_is_cart_3d(comm) )
    comm_create_cart_3d(comm, &d->cart_comm_3d, d->np);
  else {
    int periods[3], coords[3];
    MPI_Cart_get(comm, 3,
        d->np, periods, coords);

    if( periods[0] && periods[1] && periods[2] )
      MPI_Comm_dup(comm, &d->cart_comm_3d);
    else {
      for(int t=0; t<3; t++)
        periods[t] = 1;
      MPI_Cart_create(comm, 3, d->np, periods, 0, &d->cart_comm_3d);
    }
  }
  MPI_Comm_dup(d->cart_comm_3d, &d->cart_comm_pnfft);
#else  
  /* create 2d cart procmesh for PNFFT and its 3d counterpart */
  comm_create_cart_2d(comm, &d->cart_comm_pnfft, &d->cart_comm_3d, d->np);
#endif

  /* Set the default values */
  d->needs_retune = 1;
  d->tune_alpha = 1;
  d->tune_r_cut = 1;
  d->tune_epsI = 1;
  d->tune_epsB = 1;
  d->tune_k_cut = 1;
  d->tune_N = 1;
  d->tune_n = 1;
  d->tune_m = 1;
  d->tune_p = 1;
  d->tune_b = 1;
  d->tune_c = 1;
#if FCS_ENABLE_INFO 
  d->flags = FCS_P2NFFT_VERBOSE_TUNING; 
#else
  d->flags = 0; 
#endif

  d->pnfft_flags = PNFFT_MALLOC_F_HAT| PNFFT_PRE_PHI_HAT | PNFFT_FFT_OUT_OF_PLACE | PNFFT_TRANSPOSED_F_HAT;
  d->pnfft_interpolation_order = 3;
  d->pnfft_window = FCS_P2NFFT_DEFAULT_PNFFT_WINDOW;
  d->pnfft_direct = 0;
  d->pfft_flags = PFFT_NO_TUNE | PFFT_DESTROY_INPUT;
  d->pfft_patience = FCS_P2NFFT_DEFAULT_PFFT_PATIENCE;

  /* We do not know the default tolerance type at this point, since periodicity
   * may be changed via fcs_set_periodicity after fcs_init. */
  d->tolerance_type = FCS_TOLERANCE_TYPE_UNDEFINED;
  d->tolerance = -1.0;
  
  d->N[0] = d->N[1] = d->N[2] = 16;
  d->m = 4;
  d->p = 8;
  d->c = 0.0;
  d->b[0] = d->b[1] = d->b[2] = 0.0;

  /* init to same nonsense on all processes */
  d->alpha = -1.0;
  d->r_cut = -1.0;
  d->one_over_r_cut = -1.0;
  d->epsI = -1.0;
  d->epsB = -1.0;
  d->k_cut = -1.0;
  d->num_nodes = -1;
  d->sum_qpart = -1;
  d->sum_q2 = -1.0;
  d->sum_q = 0.0;
  d->bg_charge = 0.0;
  d->box_V = 0.0;
  for(int t=0; t<3; t++){
    d->box_l[t] = -1.0; 
    d->box_expand[t] = 1.0;
    d->box_scales[t] = 1.0;
    d->box_a[t] = 0.0;
    d->box_b[t] = 0.0;
    d->box_c[t] = 0.0;
  }
  for(int t=0; t<9; t++)
    d->box_inv[t] = 0.0;
  
  comm_get_periodicity(comm, d->periodicity);

  d->short_range_flag = -1;
  d->reg_near = FCS_P2NFFT_REG_NEAR_DEFAULT;
  d->reg_far  = FCS_P2NFFT_REG_FAR_DEFAULT;
  d->reg_kernel   = FCS_P2NFFT_REG_KERNEL_DEFAULT;

  /* init local data distribution of PNFFT:
   * local_N, local_N_start, lower_border, upper_border */
  for(int t=0; t<3; t++){
    d->local_N[t] = -1;
    d->local_N_start[t] = -1;
    d->lower_border[t] = -1;
    d->upper_border[t] = -1;
  }

  d->regkern_hat = NULL;  

  /* init gridsort data */
  d->max_particle_move = -1;
  d->resort = d->local_num_particles = 0;
  d->gridsort_resort = FCS_GRIDSORT_RESORT_NULL;
  d->gridsort_cache = FCS_GRIDSORT_CACHE_NULL;

  *rd = d;

  return NULL;
}
Example #12
void
initcomm(int ndx,int ndy,int ndz)
{
  int  i,j,k,tmp;
  int  ipd[3],idm[3],ir;
  MPI_Comm  icomm;

  if(ndx*ndy*ndz != npe){
    if(id==0){
      printf("Invalid number of PE\n");
      printf("Please check partitioning pattern or number of PE\n");
    }
    MPI_Finalize();
    exit(0);
  }

  icomm= MPI_COMM_WORLD;

  idm[0]= ndx;
  idm[1]= ndy;
  idm[2]= ndz;

  ipd[0]= 0;
  ipd[1]= 0;
  ipd[2]= 0;
  ir= 0;


  MPI_Cart_create(icomm,
                  ndims,
                  idm,
                  ipd,
                  ir,
                  &mpi_comm_cart);
  MPI_Cart_get(mpi_comm_cart,
               ndims,
               idm,
               ipd,
               iop);

  if(ndz > 1){
    MPI_Cart_shift(mpi_comm_cart,
                   2,
                   1,
                   &npz[0],
                   &npz[1]);
  }
  if(ndy > 1){
    MPI_Cart_shift(mpi_comm_cart,
                   1,
                   1,
                   &npy[0],
                   &npy[1]);
  }
  if(ndx > 1){
    MPI_Cart_shift(mpi_comm_cart,
                   0,
                   1,
                   &npx[0],
                   &npx[1]);
  }

}
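
initcomm() uses several file-scope variables that are defined elsewhere in the benchmark source. A plausible reconstruction of those declarations, offered only as an assumption so the snippet can be read on its own:

static int      npe, id;                  /* number of ranks and my rank        */
static int      ndims = 3;                /* dimensionality of the process grid */
static MPI_Comm mpi_comm_cart;            /* Cartesian communicator built here  */
static int      iop[3];                   /* my coordinates in the grid         */
static int      npx[2], npy[2], npz[2];   /* neighbor ranks in x, y, z          */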
Example #13
File: cartzero.c, Project: NexMirror/MPICH
/*
    Check that the MPI implementation properly handles zero-dimensional
    Cartesian communicators - the original standard implies that these
    should be consistent with higher dimensional topologies and thus
    these should work with any MPI implementation.  MPI 2.1 made this
    requirement explicit.
*/
int main(int argc, char *argv[])
{
    int errs = 0;
    int size, rank, ndims;
    MPI_Comm comm, newcomm;

    MTest_Init(&argc, &argv);

    /* Create a new cartesian communicator in a subset of the processes */
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    if (size < 2) {
        fprintf(stderr, "This test needs at least 2 processes\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    MPI_Cart_create(MPI_COMM_WORLD, 0, NULL, NULL, 0, &comm);

    if (comm != MPI_COMM_NULL) {
        int csize;
        MPI_Comm_size(comm, &csize);
        if (csize != 1) {
            errs++;
            fprintf(stderr, "Sizes is wrong in cart communicator.  Is %d, should be 1\n", csize);
        }

        /* This function is not meaningful, but should not fail */
        MPI_Dims_create(1, 0, NULL);

        ndims = -1;
        MPI_Cartdim_get(comm, &ndims);
        if (ndims != 0) {
            errs++;
            fprintf(stderr, "MPI_Cartdim_get: ndims is %d, should be 0\n", ndims);
        }

        /* this function should not fail */
        MPI_Cart_get(comm, 0, NULL, NULL, NULL);

        MPI_Cart_rank(comm, NULL, &rank);
        if (rank != 0) {
            errs++;
            fprintf(stderr, "MPI_Cart_rank: rank is %d, should be 0\n", rank);
        }

        /* this function should not fail */
        MPI_Cart_coords(comm, 0, 0, NULL);

        MPI_Cart_sub(comm, NULL, &newcomm);
        ndims = -1;
        MPI_Cartdim_get(newcomm, &ndims);
        if (ndims != 0) {
            errs++;
            fprintf(stderr, "MPI_Cart_sub did not return zero-dimensional communicator\n");
        }

        MPI_Barrier(comm);

        MPI_Comm_free(&comm);
        MPI_Comm_free(&newcomm);
    }
    else if (rank == 0) {
        errs++;
        fprintf(stderr, "Communicator returned is null!");
    }

    MTest_Finalize(errs);

    MPI_Finalize();

    return 0;
}
Example #14
  void Communication::prepare(p3m_float box_l[3]) {
    P3M_DEBUG(printf( "  P3M::Communication::prepare() started...\n"));

    /* Test whether the communicator is cartesian and correct dimensionality */
    bool comm_is_cart = false;
    int status;

    MPI_Topo_test(mpicomm_orig, &status);
    if (status == MPI_CART) {
      /* Communicator is cartesian, so test dimensionality */
      int ndims;
      MPI_Cartdim_get(mpicomm_orig, &ndims);
      if (ndims == 3) {
        /* Correct dimensionality, so get grid and test periodicity */
        int periodicity[3];
        MPI_Cart_get(mpicomm_orig, 3, node_grid,
                     periodicity, node_pos);
        if (periodicity[0] && periodicity[1] && periodicity[2]) {
          /* If periodicity is correct, we can just use this communicator */
          mpicomm = mpicomm_orig;
          /* get the rank */
          MPI_Comm_rank(mpicomm, &rank);
          comm_is_cart = true;
        }
      }
    }

    /* otherwise, we have to set up the cartesian communicator */
    if (!comm_is_cart) {
      P3M_DEBUG(printf( "    Setting up cartesian communicator...\n"));

      node_grid[0] = 0;
      node_grid[1] = 0;
      node_grid[2] = 0;

      /* compute node grid */
      MPI_Dims_create(size, 3, node_grid);

#ifdef P3M_ENABLE_INFO
      if (onMaster())
    	  printf("    node_grid=%dx%dx%d\n", node_grid[0], node_grid[1], node_grid[2]);
#endif

      /* create communicator */
      int periodicity[3] = {1, 1, 1};
      MPI_Cart_create(mpicomm_orig, 3, node_grid,
                      periodicity, 1, &mpicomm);

      /* get the rank */
      MPI_Comm_rank(mpicomm, &rank);
      /* get node pos */
      MPI_Cart_coords(mpicomm, rank, 3, node_pos);
    }
    
    /* fetch neighborhood info */
    for (int dir = 0; dir < 3; dir++) {
      MPI_Cart_shift(mpicomm, dir, 1,
                     &node_neighbors[2*dir],
                     &node_neighbors[2*dir+1]);
      P3M_DEBUG_LOCAL(printf( "    %d: dir=%d: n1=%d n2=%d\n", rank, dir, \
                              node_neighbors[2*dir],		\
                              node_neighbors[2*dir+1]));
    }

    /* init local points */
    for (int i=0; i< 3; i++) {
      local_box_l[i] = 0.0;
      my_left[i] = 0.0;
      my_right[i] = 0.0;
    }

    /* compute box limits */
    for(p3m_int i = 0; i < 3; i++) {
      local_box_l[i] = box_l[i]/(p3m_float)node_grid[i];
      my_left[i]   = node_pos[i]    *local_box_l[i];
      my_right[i]  = (node_pos[i]+1)*local_box_l[i];
    }
    P3M_DEBUG(printf("    local_box_l=" F3FLOAT "\n"                      \
                     "    my_left=" F3FLOAT "\n"                          \
                     "    my_right=" F3FLOAT "\n",                        \
                     local_box_l[0],                                \
                     local_box_l[1],                                \
                     local_box_l[2],                                \
                     my_left[0], my_left[1], my_left[2], \
                     my_right[0], my_right[1], my_right[2] \
                     ));

    P3M_DEBUG(printf("  P3M::Communication::prepare() finished.\n"));
  }
Example #15
/**
 * Creates a 3D single precision R2C parallel FFT plan. If data_out points to the same location as the input
 * data, an in-place plan will be created; otherwise the plan will be out-of-place.
 * @param n Integer array of size 3, corresponding to the global data size
 * @param data Input data in spatial domain
 * @param data_out Output data in frequency domain
 * @param c_comm Cartesian communicator returned by \ref accfft_create_comm
 * @param flags AccFFT flags, See \ref flags for more details.
 * @return
 */
accfft_plan_gpuf*  accfft_plan_dft_3d_r2c_gpuf(int * n, float * data_d,float * data_out_d, MPI_Comm c_comm,unsigned flags){
  accfft_plan_gpuf *plan=new accfft_plan_gpuf;
  int procid;
  MPI_Comm_rank(c_comm, &procid);
  plan->procid=procid;
  MPI_Cart_get(c_comm,2,plan->np,plan->periods,plan->coord);
  plan->c_comm=c_comm;
  int *coord=plan->coord;
  MPI_Comm_split(c_comm,coord[0],coord[1],&plan->row_comm);
  MPI_Comm_split(c_comm,coord[1],coord[0],&plan->col_comm);
  plan->N[0]=n[0];plan->N[1]=n[1];plan->N[2]=n[2];
  plan->data=data_d;
  plan->data_out=data_out_d;

  if(plan->np[1]==1)
    plan->oneD=true;
  else
    plan->oneD=false;


  if(data_out_d==data_d){
    plan->inplace=true;}
  else{plan->inplace=false;}

  int *osize_0 =plan->osize_0, *ostart_0 =plan->ostart_0;
  int *osize_1 =plan->osize_1, *ostart_1 =plan->ostart_1;
  int *osize_2 =plan->osize_2, *ostart_2 =plan->ostart_2;
  int *osize_1i=plan->osize_1i,*ostart_1i=plan->ostart_1i;
  int *osize_2i=plan->osize_2i,*ostart_2i=plan->ostart_2i;

  int alloc_max=0;
  int n_tuples_i, n_tuples_o;
  //plan->inplace==true ? n_tuples=(n[2]/2+1)*2: n_tuples=n[2]*2;
  plan->inplace==true ? n_tuples_i=(n[2]/2+1)*2:  n_tuples_i=n[2];
  n_tuples_o=(n[2]/2+1)*2;

  int isize[3],osize[3],istart[3],ostart[3];
  alloc_max=accfft_local_size_dft_r2c_gpuf(n,isize,istart,osize,ostart,c_comm,plan->inplace);
  plan->alloc_max=alloc_max;

  dfft_get_local_size_gpuf(n[0],n[1],n_tuples_o,osize_0,ostart_0,c_comm);
  dfft_get_local_size_gpuf(n[0],n_tuples_o/2,n[1],osize_1,ostart_1,c_comm);
  dfft_get_local_size_gpuf(n[1],n_tuples_o/2,n[0],osize_2,ostart_2,c_comm);

  std::swap(osize_1[1],osize_1[2]);
  std::swap(ostart_1[1],ostart_1[2]);

  std::swap(ostart_2[1],ostart_2[2]);
  std::swap(ostart_2[0],ostart_2[1]);
  std::swap(osize_2[1],osize_2[2]);
  std::swap(osize_2[0],osize_2[1]);

  for(int i=0;i<3;i++){
    osize_1i[i]=osize_1[i];
    osize_2i[i]=osize_2[i];
    ostart_1i[i]=ostart_1[i];
    ostart_2i[i]=ostart_2[i];
  }

  // fplan_0
  int NX=n[0], NY=n[1], NZ=n[2];
  cufftResult_t cufft_error;
  {
    int f_inembed[1]={n_tuples_i};
    int f_onembed[1]={n_tuples_o/2};
    int idist=(n_tuples_i);
    int odist=n_tuples_o/2;
    int istride=1;
    int ostride=1;
    int batch=osize_0[0]*osize_0[1];//NX;

    if(batch!=0)
    {
      cufft_error=cufftPlanMany(&plan->fplan_0, 1, &n[2],
          f_inembed, istride, idist, // *inembed, istride, idist
          f_onembed, ostride, odist, // *onembed, ostride, odist
          CUFFT_R2C, batch);
      if(cufft_error!= CUFFT_SUCCESS)
      {
        fprintf(stderr, "CUFFT error: fplan_0 creation failed %d \n",cufft_error); return NULL;
      }
      //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at fplan cuda compatibility\n"); return;}
    }
    if(batch!=0)
    {
      cufft_error=cufftPlanMany(&plan->iplan_0, 1, &n[2],
          f_onembed, ostride, odist, // *onembed, ostride, odist
          f_inembed, istride, idist, // *inembed, istride, idist
          CUFFT_C2R, batch);
      if(cufft_error!= CUFFT_SUCCESS)
      {
        fprintf(stderr, "CUFFT error: iplan_0 creation failed %d \n",cufft_error); return NULL;
      }
      //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at fplan cuda compatibility\n"); return;}
    }
  }
  // fplan_1
  {
    int f_inembed[1]={NY};
    int f_onembed[1]={NY};
    int idist=1;
    int odist=1;
    int istride=osize_1[2];
    int ostride=osize_1[2];
    int batch=osize_1[2];

    if(batch!=0)
    {
      cufft_error=cufftPlanMany(&plan->fplan_1, 1, &n[1],
          f_inembed, istride, idist, // *inembed, istride, idist
          f_onembed, ostride, odist, // *onembed, ostride, odist
          CUFFT_C2C, batch);
      if(cufft_error!= CUFFT_SUCCESS)
      {
        fprintf(stderr, "CUFFT error: fplan_1 creation failed %d \n",cufft_error); return NULL;
      }
      //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at fplan cuda compatibility\n"); return;}
    }
  }
  // fplan_2
  {
    int f_inembed[1]={NX};
    int f_onembed[1]={NX};
    int idist=1;
    int odist=1;
    int istride=osize_2[1]*osize_2[2];
    int ostride=osize_2[1]*osize_2[2];
    int batch=osize_2[1]*osize_2[2];

    if(batch!=0)
    {
      cufft_error=cufftPlanMany(&plan->fplan_2, 1, &n[0],
          f_inembed, istride, idist, // *inembed, istride, idist
          f_onembed, ostride, odist, // *onembed, ostride, odist
          CUFFT_C2C, batch);
      if(cufft_error!= CUFFT_SUCCESS)
      {
        fprintf(stderr, "CUFFT error: fplan_2 creation failed %d \n",cufft_error); return NULL;
      }
      //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at fplan cuda compatibility\n"); return;}
    }
  }


  // 1D Decomposition
  if(plan->oneD){
    int N0=n[0], N1=n[1], N2=n[2];

    plan->Mem_mgr  = new Mem_Mgr_gpu<float>(N0,N1,n_tuples_o,c_comm);
    plan->T_plan_2 = new T_Plan_gpu <float>(N0,N1,n_tuples_o, plan->Mem_mgr, c_comm);
    plan->T_plan_2i= new T_Plan_gpu <float>(N1,N0,n_tuples_o,plan->Mem_mgr, c_comm);
    plan->T_plan_1=NULL;
    plan->T_plan_1i=NULL;

    plan->alloc_max=alloc_max;
    plan->T_plan_2->alloc_local=alloc_max;
    plan->T_plan_2i->alloc_local=alloc_max;


    if(flags==ACCFFT_MEASURE){
      plan->T_plan_2->which_fast_method_gpu(plan->T_plan_2,data_out_d);
    }
    else{
      plan->T_plan_2->method=2;
      plan->T_plan_2->kway=2;
    }
    checkCuda_accfft (cudaDeviceSynchronize());
    MPI_Barrier(plan->c_comm);

    plan->T_plan_2->method =plan->T_plan_2->method;
    plan->T_plan_2i->method=plan->T_plan_2->method;

    plan->T_plan_2->kway =plan->T_plan_2->kway;
    plan->T_plan_2i->kway=plan->T_plan_2->kway;


  }

  // 2D Decomposition
  if (!plan->oneD){
    // the reason for n_tuples/2 is to avoid splitting the real and imaginary parts of complex numbers
    plan->Mem_mgr  =new Mem_Mgr_gpu<float>(n[1],n_tuples_o/2,2,plan->row_comm,osize_0[0],alloc_max);
    plan->T_plan_1 = new T_Plan_gpu<float>(n[1],n_tuples_o/2,2, plan->Mem_mgr, plan->row_comm,osize_0[0]);
    plan->T_plan_2 = new T_Plan_gpu<float>(n[0],n[1],osize_2[2]*2,plan->Mem_mgr, plan->col_comm);
    plan->T_plan_2i= new T_Plan_gpu<float>(n[1],n[0],osize_2i[2]*2, plan->Mem_mgr, plan->col_comm);
    plan->T_plan_1i= new T_Plan_gpu<float>(n_tuples_o/2,n[1],2, plan->Mem_mgr, plan->row_comm,osize_1i[0]);


    plan->T_plan_1->alloc_local=plan->alloc_max;
    plan->T_plan_2->alloc_local=plan->alloc_max;
    plan->T_plan_2i->alloc_local=plan->alloc_max;
    plan->T_plan_1i->alloc_local=plan->alloc_max;



    if(flags==ACCFFT_MEASURE){
      if(coord[0]==0){
        plan->T_plan_1->which_fast_method_gpu(plan->T_plan_1,data_out_d,osize_0[0]);
      }
    }
    else{
      plan->T_plan_1->method=2;
      plan->T_plan_1->kway=2;
    }

    MPI_Bcast(&plan->T_plan_1->method,1, MPI_INT,0, c_comm );
    MPI_Bcast(&plan->T_plan_1->kway,1, MPI_INT,0, c_comm );

    checkCuda_accfft (cudaDeviceSynchronize());
    MPI_Barrier(plan->c_comm);
    plan->T_plan_1->method =plan->T_plan_1->method;
    plan->T_plan_2->method =plan->T_plan_1->method;
    plan->T_plan_2i->method=plan->T_plan_1->method;
    plan->T_plan_1i->method=plan->T_plan_1->method;
    plan->T_plan_1->kway =plan->T_plan_1->kway;
    plan->T_plan_2->kway =plan->T_plan_1->kway;
    plan->T_plan_2i->kway=plan->T_plan_1->kway;
    plan->T_plan_1i->kway=plan->T_plan_1->kway;

    plan->iplan_1=-1;
    plan->iplan_2=-1;

  }

  plan->r2c_plan_baked=true;
  return plan;

}
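
A minimal usage sketch of the plan creation documented above, based only on the AccFFT calls already referenced in this example (accfft_create_comm from the doc comment, accfft_local_size_dft_r2c_gpuf and ACCFFT_MEASURE from the body). Headers, MPI setup and error handling are omitted, and the concrete sizes are illustrative:

  int n[3] = {128, 128, 128};
  int c_dims[2] = {0, 0};          /* let AccFFT pick the 2d process grid */
  MPI_Comm c_comm;
  accfft_create_comm(MPI_COMM_WORLD, c_dims, &c_comm);

  int isize[3], istart[3], osize[3], ostart[3];
  int alloc_max = accfft_local_size_dft_r2c_gpuf(n, isize, istart, osize, ostart,
                                                 c_comm, false);

  float *data_d, *data_hat_d;      /* out-of-place: two separate device buffers */
  cudaMalloc((void**)&data_d, alloc_max);
  cudaMalloc((void**)&data_hat_d, alloc_max);

  accfft_plan_gpuf *plan =
      accfft_plan_dft_3d_r2c_gpuf(n, data_d, data_hat_d, c_comm, ACCFFT_MEASURE);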
Example #16
void fcs_memd_setup_communicator(memd_struct* memd, MPI_Comm communicator)
{
    /* store given communicator */
    memd->mpiparams.original_comm = communicator;
    MPI_Comm_size(communicator, &memd->mpiparams.size);

    /* Test whether the communicator is cartesian and correct dimensionality */
    int comm_is_cart = 0;
    int status;

    MPI_Topo_test(communicator, &status);
    if (status == MPI_CART) {
        /* Communicator is cartesian, so test dimensionality */
        int ndims;
        MPI_Cartdim_get(communicator, &ndims);
        if (ndims == 3) {
            /* Correct dimensionality, so get grid and test periodicity */
            int periodicity[3];
            MPI_Cart_get(communicator, 3, memd->mpiparams.node_grid, periodicity, memd->mpiparams.node_pos);
            if (periodicity[0] && periodicity[1] && periodicity[2]) {
                /* If periodicity is correct, we can just use this communicator */
                memd->mpiparams.communicator = communicator;
                /* get the rank */
                MPI_Comm_rank(communicator, &memd->mpiparams.this_node);
                comm_is_cart = 1;
            }
        }
    }
    
    /* otherwise, we have to set up the cartesian communicator */
    if (!comm_is_cart) {        
        /* zero entries so that MPI_Dims_create chooses all three dimensions */
        memd->mpiparams.node_grid[0] = 0;
        memd->mpiparams.node_grid[1] = 0;
        memd->mpiparams.node_grid[2] = 0;
        
        /* compute node grid */
        MPI_Dims_create(memd->mpiparams.size, 3, memd->mpiparams.node_grid);
        /* swap first and last dimension, as MEMD currently wants to have them increasing */
        fcs_int tmp = memd->mpiparams.node_grid[2];
        memd->mpiparams.node_grid[2] = memd->mpiparams.node_grid[0];
        memd->mpiparams.node_grid[0] = tmp;
        
        /* create communicator */
        int periodicity[3] = {1, 1, 1};
        MPI_Cart_create(memd->mpiparams.original_comm, 3, memd->mpiparams.node_grid, periodicity, 1, &memd->mpiparams.communicator);
        
        /* get the rank in the new communicator (reorder may renumber ranks) */
        MPI_Comm_rank(memd->mpiparams.communicator, &memd->mpiparams.this_node);
        /* get node pos */
        MPI_Cart_coords(memd->mpiparams.communicator, memd->mpiparams.this_node, 3, memd->mpiparams.node_pos);
    }
    
    /* fetch neighborhood info */
    for (int dir = 0; dir<3; dir++) {
        MPI_Cart_shift(memd->mpiparams.communicator, dir, 1, 
                       &memd->mpiparams.node_neighbors[2*dir], 
                       &memd->mpiparams.node_neighbors[2*dir+1]);
    }
    
    /* init local points */
    for (int i=0; i< 3; i++) {
//        memd->mpiparams.local_box_l[i] = 0.0;
        memd->mpiparams.my_left[i] = 0.0;
        memd->mpiparams.my_right[i] = 0.0;
    }
    
    /* compute box limits */
    fcs_float local_box_length = 0.0;
    for(fcs_int i = 0; i < 3; i++) {
        local_box_length = memd->parameters.box_length[i] / (fcs_float)memd->mpiparams.node_grid[i];
        memd->mpiparams.my_left[i]   = memd->mpiparams.node_pos[i] * local_box_length;
        memd->mpiparams.my_right[i]  = (memd->mpiparams.node_pos[i]+1) * local_box_length;
    }

}
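The neighbor-fetch loop above stores, for each dimension dir, the rank of the lower neighbor at node_neighbors[2*dir] and the upper neighbor at node_neighbors[2*dir+1]. A minimal standalone sketch of that pattern (not part of the FCS sources; the grid is chosen by MPI_Dims_create and fully periodic, as in the fallback branch above):

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int size, rank;
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* let MPI pick a 3D grid and build a fully periodic Cartesian communicator */
    int node_grid[3] = {0, 0, 0};
    MPI_Dims_create(size, 3, node_grid);
    int periodicity[3] = {1, 1, 1};
    MPI_Comm cart;
    MPI_Cart_create(MPI_COMM_WORLD, 3, node_grid, periodicity, 1, &cart);
    MPI_Comm_rank(cart, &rank);

    /* node_neighbors[2*dir] = neighbor in -dir, [2*dir+1] = neighbor in +dir */
    int node_neighbors[6];
    for (int dir = 0; dir < 3; dir++)
        MPI_Cart_shift(cart, dir, 1,
                       &node_neighbors[2*dir], &node_neighbors[2*dir+1]);

    printf("rank %d neighbors: x %d/%d  y %d/%d  z %d/%d\n", rank,
           node_neighbors[0], node_neighbors[1],
           node_neighbors[2], node_neighbors[3],
           node_neighbors[4], node_neighbors[5]);

    MPI_Comm_free(&cart);
    MPI_Finalize();
    return 0;
}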
Example #17
0
File: comm.cpp Project: nlslatt/sst-macro
int Comm::setup(MMD_float cutneigh, Atom &atom)
{
  int i;
  int nprocs;
  int periods[3];
  MMD_float prd[3];
  int myloc[3];
  MPI_Comm cartesian;
  MMD_float lo, hi;
  int ineed, idim, nbox;

  prd[0] = atom.box.xprd;
  prd[1] = atom.box.yprd;
  prd[2] = atom.box.zprd;

  /* setup 3-d grid of procs */

  MPI_Comm_rank(MPI_COMM_WORLD, &me);
  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

  MMD_float area[3];

  area[0] = prd[0] * prd[1];
  area[1] = prd[0] * prd[2];
  area[2] = prd[1] * prd[2];

  MMD_float bestsurf = 2.0 * (area[0] + area[1] + area[2]);

  // loop thru all possible factorizations of nprocs
  // surf = surface area of a proc sub-domain
  // for 2d, insure ipz = 1

  int ipx, ipy, ipz, nremain;
  MMD_float surf;

  ipx = 1;

  while(ipx <= nprocs) {
    if(nprocs % ipx == 0) {
      nremain = nprocs / ipx;
      ipy = 1;

      while(ipy <= nremain) {
        if(nremain % ipy == 0) {
          ipz = nremain / ipy;
          surf = area[0] / ipx / ipy + area[1] / ipx / ipz + area[2] / ipy / ipz;

          if(surf < bestsurf) {
            bestsurf = surf;
            procgrid[0] = ipx;
            procgrid[1] = ipy;
            procgrid[2] = ipz;
          }
        }

        ipy++;
      }
    }

    ipx++;
  }

  if(procgrid[0]*procgrid[1]*procgrid[2] != nprocs) {
    if(me == 0) printf("ERROR: Bad grid of processors\n");

    return 1;
  }

  /* determine where I am and my neighboring procs in 3d grid of procs */

  int reorder = 0;
  periods[0] = periods[1] = periods[2] = 1;

  MPI_Cart_create(MPI_COMM_WORLD, 3, procgrid, periods, reorder, &cartesian);
  MPI_Cart_get(cartesian, 3, procgrid, periods, myloc);
  MPI_Cart_shift(cartesian, 0, 1, &procneigh[0][0], &procneigh[0][1]);
  MPI_Cart_shift(cartesian, 1, 1, &procneigh[1][0], &procneigh[1][1]);
  MPI_Cart_shift(cartesian, 2, 1, &procneigh[2][0], &procneigh[2][1]);

  /* lo/hi = my local box bounds */

  atom.box.xlo = myloc[0] * prd[0] / procgrid[0];
  atom.box.xhi = (myloc[0] + 1) * prd[0] / procgrid[0];
  atom.box.ylo = myloc[1] * prd[1] / procgrid[1];
  atom.box.yhi = (myloc[1] + 1) * prd[1] / procgrid[1];
  atom.box.zlo = myloc[2] * prd[2] / procgrid[2];
  atom.box.zhi = (myloc[2] + 1) * prd[2] / procgrid[2];

  /* need = # of boxes I need atoms from in each dimension */

  need[0] = static_cast<int>(cutneigh * procgrid[0] / prd[0] + 1);
  need[1] = static_cast<int>(cutneigh * procgrid[1] / prd[1] + 1);
  need[2] = static_cast<int>(cutneigh * procgrid[2] / prd[2] + 1);

  /* alloc comm memory */

  int maxswap = 2 * (need[0] + need[1] + need[2]);

  slablo = (MMD_float*) malloc(maxswap * sizeof(MMD_float));
  slabhi = (MMD_float*) malloc(maxswap * sizeof(MMD_float));
  pbc_any = (int*) malloc(maxswap * sizeof(int));
  pbc_flagx = (int*) malloc(maxswap * sizeof(int));
  pbc_flagy = (int*) malloc(maxswap * sizeof(int));
  pbc_flagz = (int*) malloc(maxswap * sizeof(int));
  sendproc = (int*) malloc(maxswap * sizeof(int));
  recvproc = (int*) malloc(maxswap * sizeof(int));
  sendproc_exc = (int*) malloc(maxswap * sizeof(int));
  recvproc_exc = (int*) malloc(maxswap * sizeof(int));
  sendnum = (int*) malloc(maxswap * sizeof(int));
  recvnum = (int*) malloc(maxswap * sizeof(int));
  comm_send_size = (int*) malloc(maxswap * sizeof(int));
  comm_recv_size = (int*) malloc(maxswap * sizeof(int));
  reverse_send_size = (int*) malloc(maxswap * sizeof(int));
  reverse_recv_size = (int*) malloc(maxswap * sizeof(int));
  int iswap = 0;

  for(int idim = 0; idim < 3; idim++)
    for(int i = 1; i <= need[idim]; i++, iswap += 2) {
      MPI_Cart_shift(cartesian, idim, i, &sendproc_exc[iswap], &sendproc_exc[iswap + 1]);
      MPI_Cart_shift(cartesian, idim, i, &recvproc_exc[iswap + 1], &recvproc_exc[iswap]);
    }

  MPI_Comm_free(&cartesian);

  firstrecv = (int*) malloc(maxswap * sizeof(int));
  maxsendlist = (int*) malloc(maxswap * sizeof(int));

  for(i = 0; i < maxswap; i++) maxsendlist[i] = BUFMIN;

  sendlist = (int**) malloc(maxswap * sizeof(int*));

  for(i = 0; i < maxswap; i++)
    sendlist[i] = (int*) malloc(BUFMIN * sizeof(int));

  /* setup 4 parameters for each exchange: (spart,rpart,slablo,slabhi)
     sendproc(nswap) = proc to send to at each swap
     recvproc(nswap) = proc to recv from at each swap
     slablo/slabhi(nswap) = slab boundaries (in correct dimension) of atoms
                            to send at each swap
     1st part of if statement is sending to the west/south/down
     2nd part of if statement is sending to the east/north/up
     nbox = atoms I send originated in this box */

  /* set commflag if atoms are being exchanged across a box boundary
     commflag(idim,nswap) =  0 -> not across a boundary
                          =  1 -> add box-length to position when sending
                          = -1 -> subtract box-length from pos when sending */

  nswap = 0;

  for(idim = 0; idim < 3; idim++) {
    for(ineed = 0; ineed < 2 * need[idim]; ineed++) {
      pbc_any[nswap] = 0;
      pbc_flagx[nswap] = 0;
      pbc_flagy[nswap] = 0;
      pbc_flagz[nswap] = 0;

      if(ineed % 2 == 0) {
        sendproc[nswap] = procneigh[idim][0];
        recvproc[nswap] = procneigh[idim][1];
        nbox = myloc[idim] + ineed / 2;
        lo = nbox * prd[idim] / procgrid[idim];

        if(idim == 0) hi = atom.box.xlo + cutneigh;

        if(idim == 1) hi = atom.box.ylo + cutneigh;

        if(idim == 2) hi = atom.box.zlo + cutneigh;

        hi = MIN(hi, (nbox + 1) * prd[idim] / procgrid[idim]);

        if(myloc[idim] == 0) {
          pbc_any[nswap] = 1;

          if(idim == 0) pbc_flagx[nswap] = 1;

          if(idim == 1) pbc_flagy[nswap] = 1;

          if(idim == 2) pbc_flagz[nswap] = 1;
        }
      } else {
        sendproc[nswap] = procneigh[idim][1];
        recvproc[nswap] = procneigh[idim][0];
        nbox = myloc[idim] - ineed / 2;
        hi = (nbox + 1) * prd[idim] / procgrid[idim];

        if(idim == 0) lo = atom.box.xhi - cutneigh;

        if(idim == 1) lo = atom.box.yhi - cutneigh;

        if(idim == 2) lo = atom.box.zhi - cutneigh;

        lo = MAX(lo, nbox * prd[idim] / procgrid[idim]);

        if(myloc[idim] == procgrid[idim] - 1) {
          pbc_any[nswap] = 1;

          if(idim == 0) pbc_flagx[nswap] = -1;

          if(idim == 1) pbc_flagy[nswap] = -1;

          if(idim == 2) pbc_flagz[nswap] = -1;
        }
      }

      slablo[nswap] = lo;
      slabhi[nswap] = hi;
      nswap++;
    }
  }

  return 0;
}
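The "need" formula above counts how many neighboring sub-boxes each rank must receive atoms from per dimension, and maxswap = 2*(need[0]+need[1]+need[2]) bounds the number of swaps set up afterwards. A standalone sketch of that arithmetic with illustrative numbers (the cutoff, box lengths, and process grid below are made up, not taken from any miniMD input):

#include <cstdio>

int main()
{
    // illustrative values only (not from any miniMD input deck)
    const double cutneigh = 2.5;
    const double prd[3]   = {24.0, 24.0, 24.0}; // global box lengths
    const int procgrid[3] = {4, 2, 2};          // process grid

    int need[3], maxswap = 0;
    for (int idim = 0; idim < 3; idim++) {
        // number of neighboring sub-boxes (per side) that can hold
        // atoms within the neighbor cutoff of my sub-box
        need[idim] = static_cast<int>(cutneigh * procgrid[idim] / prd[idim] + 1);
        maxswap += 2 * need[idim];
    }

    // with these numbers: need = {1, 1, 1} and maxswap = 6
    printf("need = {%d, %d, %d}, maxswap = %d\n",
           need[0], need[1], need[2], maxswap);
    return 0;
}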
Example #18
0
File: life.c Project: mmallett/cpre426
int main(int argc, char ** argv){
		
	MPI_Init(&argc, &argv);

	//get command line arguments //CHANGE TO MATCH ./life in.file out.file eventually
	if(argc != 3){
			printf("USAGE: lifegrid m n\n");
			MPI_Finalize();
			exit(EXIT_FAILURE);
	}
	sscanf(argv[1], "%d", &m);
	sscanf(argv[2], "%d", &n);

	//in_file = (char*) malloc(strlen(argv[3]) * sizeof(char));
	//out_file = (char*) malloc(strlen(argv[4]) * sizeof(char));

	MPI_Comm_size(MPI_COMM_WORLD, &world_size);
	MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
	
	comm_grid_dims[0] = comm_grid_dims[1] = (int) sqrt(world_size); //assumes a perfect-square number of ranks

	int periods[] = {0,0};
	int reorder = 0;

	MPI_Cart_create(
			MPI_COMM_WORLD,
			DIMENSIONS,
			comm_grid_dims,
			periods,
			reorder,
			&grid_comm);
			
	int my_coords[2];
	MPI_Cart_get(
		grid_comm,
		DIMENSIONS,
		comm_grid_dims,
		periods,
		my_coords
	);

	//initialize a grid with some crap in it
	//over commit memory, provide a padding of one row/column extra
	//around the outside of the array
	current_generation_grid = (int*) calloc((m+2) * (n+2) , sizeof(int));
	int i;
	int write_loc = n + 1; //skip to end of first row
	for(i=0; i < m*n; i++){
		if(i % n == 0){
			write_loc += 2;
		}
		current_generation_grid[write_loc++] = i + world_rank * 100;
	}

	//REPLACE WITH FILE READ EVENTUALLY

	next_generation_grid = (int*) calloc((m+2) * (n+2), sizeof(int));
	
	create_up_communication();
	create_down_communication();
	init_column_t();
	create_left_communication();
	create_right_communication();
	
	int current_generation;
	for(current_generation = 0; current_generation<generations; current_generation++){
		simulate_generation();
	}

	//the game.. is over
	MPI_Datatype final_grid_t;
	MPI_Type_vector(
		m,
		n,
		n+2,
		MPI_INT,
		&final_grid_t
	);
	MPI_Type_commit(&final_grid_t);

	if(world_rank == 0){
		int ** result_grid = (int**) malloc(world_size * sizeof(int*));
		MPI_Request result_reqs[world_size-1];
		int z;
		for(z=1; z<world_size; z++){
			result_grid[z] = (int*) malloc(m * n * sizeof(int));
			MPI_Irecv(
				result_grid[z],
				m * n,
				MPI_INT,
				z,
				TAG,
				grid_comm,
				&result_reqs[z-1]
			);		
		}
		result_grid[0] = (int*) malloc(m * n * sizeof(int));
		for(z=0; z<m; z++){
			memcpy(&result_grid[0][n * z], &current_generation_grid[(n + 3) + z * (n + 2)], n * sizeof(int));
		}
		MPI_Waitall(world_size-1, result_reqs, MPI_STATUSES_IGNORE);

		printf("FINAL BOARD:\n\n");
		for(z=0; z<comm_grid_dims[0]; z++){
			int y;
			int row;
			for(row = 0; row < m; row++){
				for(y=0; y<comm_grid_dims[1]; y++){
					int x;
					for(x=0; x<n; x++){
						printf("%4d ",
							result_grid[z * comm_grid_dims[1] + y][row * n + x]);
					}
				}		
				printf("\n");
			}
		}		
	}
	else{
		MPI_Request result_req;
		MPI_Isend(
			&current_generation_grid[n+3],
			1,
			final_grid_t,
			0,
			TAG,
			grid_comm,
			&result_req
		);
		MPI_Wait(&result_req, MPI_STATUS_IGNORE);
	}

        MPI_Finalize();

        return 0;

}
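The interior-fill loop above skips a one-cell ghost border, so interior cell (r, c) of the (m+2) x (n+2) padded grid sits at offset (r+1)*(n+2) + (c+1); the first interior element is therefore at offset n+3, which is exactly the offset used when the final board is copied and sent. A standalone sanity check of that layout (illustrative sizes only, not tied to the program above):

#include <cstdio>
#include <cstdlib>

int main()
{
    const int m = 4, n = 5;  // illustrative interior size
    int *grid = (int*) calloc((m + 2) * (n + 2), sizeof(int));

    // fill the interior exactly like the loop in main(): skip the ghost
    // row at the top and the ghost column at each end of every row
    int write_loc = n + 1;
    for (int i = 0; i < m * n; i++) {
        if (i % n == 0)
            write_loc += 2;
        grid[write_loc++] = i;
    }

    // interior cell (r, c) should be at offset (r+1)*(n+2) + (c+1)
    int ok = 1;
    for (int r = 0; r < m; r++)
        for (int c = 0; c < n; c++)
            ok &= (grid[(r + 1) * (n + 2) + (c + 1)] == r * n + c);

    printf("layout check: %s (first interior offset = %d)\n",
           ok ? "passed" : "failed", n + 3);
    free(grid);
    return 0;
}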
Example #19
0
/**
 * Creates a 3D C2C parallel FFT plan. If data_out point to the same location as the input
 * data, then an inplace plan will be created. Otherwise the plan would be outplace.
 * @param n Integer array of size 3, corresponding to the global data size
 * @param data Input data in spatial domain
 * @param data_out Output data in frequency domain
 * @param c_comm Cartesian communicator returned by \ref accfft_create_comm
 * @param flags AccFFT flags, See \ref flags for more details.
 * @return
 */
accfft_plan_gpu*  accfft_plan_dft_3d_c2c_gpu(int * n, Complex * data_d, Complex * data_out_d, MPI_Comm c_comm,unsigned flags){
  accfft_plan_gpu *plan=new accfft_plan_gpu;
  int nprocs, procid;
  MPI_Comm_rank(c_comm, &procid);
  plan->procid=procid;
  MPI_Cart_get(c_comm,2,plan->np,plan->periods,plan->coord);
  plan->c_comm=c_comm;
  int *coord=plan->coord;
  MPI_Comm_split(c_comm,coord[0],coord[1],&plan->row_comm);
  MPI_Comm_split(c_comm,coord[1],coord[0],&plan->col_comm);
  plan->N[0]=n[0];plan->N[1]=n[1];plan->N[2]=n[2];
  int NX=n[0], NY=n[1], NZ=n[2];
  cufftResult_t cufft_error;

  plan->data_c=data_d;
  plan->data_out_c=data_out_d;
  if(data_out_d==data_d){
    plan->inplace=true;}
  else{plan->inplace=false;}

  if(plan->np[1]==1)
    plan->oneD=true;
  else
    plan->oneD=false;



  int *osize_0 =plan->osize_0, *ostart_0 =plan->ostart_0;
  int *osize_1 =plan->osize_1, *ostart_1 =plan->ostart_1;
  int *osize_2 =plan->osize_2, *ostart_2 =plan->ostart_2;
  int *osize_1i=plan->osize_1i,*ostart_1i=plan->ostart_1i;
  int *osize_2i=plan->osize_2i,*ostart_2i=plan->ostart_2i;

  int alloc_local;
  int alloc_max=0,n_tuples=n[2]*2;

  //int isize[3],osize[3],istart[3],ostart[3];
  alloc_max=accfft_local_size_dft_c2c_gpu(n,plan->isize,plan->istart,plan->osize,plan->ostart,c_comm);
  plan->alloc_max=alloc_max;

  dfft_get_local_size_gpu(n[0],n[1],n[2],osize_0,ostart_0,c_comm);
  dfft_get_local_size_gpu(n[0],n[2],n[1],osize_1,ostart_1,c_comm);
  dfft_get_local_size_gpu(n[1],n[2],n[0],osize_2,ostart_2,c_comm);


  std::swap(osize_1[1],osize_1[2]);
  std::swap(ostart_1[1],ostart_1[2]);

  std::swap(ostart_2[1],ostart_2[2]);
  std::swap(ostart_2[0],ostart_2[1]);
  std::swap(osize_2[1],osize_2[2]);
  std::swap(osize_2[0],osize_2[1]);

  for(int i=0;i<3;i++){
    osize_1i[i]=osize_1[i];
    osize_2i[i]=osize_2[i];
    ostart_1i[i]=ostart_1[i];
    ostart_2i[i]=ostart_2[i];
  }


  // fplan_0
  {
    int f_inembed[1]={NZ};
    int f_onembed[1]={NZ};
    int idist=(NZ);
    int odist=(NZ);
    int istride=1;
    int ostride=1;
    int batch=osize_0[0]*osize_0[1];//NX;

    if(batch!=0)
    {
      cufft_error=cufftPlanMany(&plan->fplan_0, 1, &n[2],
          f_inembed, istride, idist, // *inembed, istride, idist
          f_onembed, ostride, odist, // *onembed, ostride, odist
          CUFFT_Z2Z, batch);
      if(cufft_error!= CUFFT_SUCCESS)
      {
        fprintf(stderr, "CUFFT error: fplan_0 creation failed %d \n",cufft_error); return NULL;
      }
      //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at fplan cuda compatibility\n"); return;}
    }
  }
  // fplan_1
  {
    int f_inembed[1]={NY};
    int f_onembed[1]={NY};
    int idist=1;
    int odist=1;
    int istride=osize_1[2];
    int ostride=osize_1[2];
    int batch=osize_1[2];

    if(batch!=0)
    {
      cufft_error=cufftPlanMany(&plan->fplan_1, 1, &n[1],
          f_inembed, istride, idist, // *inembed, istride, idist
          f_onembed, ostride, odist, // *onembed, ostride, odist
          CUFFT_Z2Z, batch);
      if(cufft_error!= CUFFT_SUCCESS)
      {
        fprintf(stderr, "CUFFT error: fplan_1 creation failed %d \n",cufft_error); return NULL;
      }
      //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at fplan cuda compatibility\n"); return;}
    }
  }
  // fplan_2
  {
    int f_inembed[1]={NX};
    int f_onembed[1]={NX};
    int idist=1;
    int odist=1;
    int istride=osize_2[1]*osize_2[2];
    int ostride=osize_2[1]*osize_2[2];
    int batch=osize_2[1]*osize_2[2];

    if(batch!=0)
    {
      cufft_error=cufftPlanMany(&plan->fplan_2, 1, &n[0],
          f_inembed, istride, idist, // *inembed, istride, idist
          f_onembed, ostride, odist, // *onembed, ostride, odist
          CUFFT_Z2Z, batch);
      if(cufft_error!= CUFFT_SUCCESS)
      {
        fprintf(stderr, "CUFFT error: fplan_2 creation failed %d \n",cufft_error); return NULL;
      }
      //cufftSetCompatibilityMode(fplan,CUFFT_COMPATIBILITY_FFTW_PADDING); if (cudaGetLastError() != cudaSuccess){fprintf(stderr, "Cuda error:Failed at fplan cuda compatibility\n"); return;}
    }
  }

  // 1D Decomposition
  if (plan->oneD){
    int NX=n[0],NY=n[1],NZ=n[2];


    plan->alloc_max=alloc_max;

    plan->Mem_mgr= new Mem_Mgr_gpu <double>(NX,NY,(NZ)*2,c_comm);
    plan->T_plan_2= new T_Plan_gpu <double>(NX,NY,(NZ)*2, plan->Mem_mgr,c_comm);
    plan->T_plan_2i= new T_Plan_gpu<double>(NY,NX,NZ*2, plan->Mem_mgr,c_comm);

    plan->T_plan_2->alloc_local=alloc_max;
    plan->T_plan_2i->alloc_local=alloc_max;
    plan->T_plan_1=NULL;
    plan->T_plan_1i=NULL;




    if(flags==ACCFFT_MEASURE){
      plan->T_plan_2->which_fast_method_gpu(plan->T_plan_2,(double*)data_out_d);

    }
    else{
      plan->T_plan_2->method=2;
      plan->T_plan_2->kway=2;
    }
    checkCuda_accfft (cudaDeviceSynchronize());
    MPI_Barrier(plan->c_comm);

    plan->T_plan_2i->method=-plan->T_plan_2->method;
    plan->T_plan_2i->kway=plan->T_plan_2->kway;
    plan->T_plan_2i->kway_async=plan->T_plan_2->kway_async;


  }// end 1d c2c

  // 2D Decomposition
  if (!plan->oneD){
    // the reason for n_tuples/2 is to avoid splitting the real and imaginary parts of complex numbers
    plan->Mem_mgr=  new Mem_Mgr_gpu<double>(n[1],n[2],2,plan->row_comm,osize_0[0],alloc_max);
    plan->T_plan_1= new  T_Plan_gpu<double>(n[1],n[2],2, plan->Mem_mgr, plan->row_comm,osize_0[0]);
    plan->T_plan_2= new  T_Plan_gpu<double>(n[0],n[1],2*osize_2[2], plan->Mem_mgr, plan->col_comm);
    plan->T_plan_2i= new T_Plan_gpu<double>(n[1],n[0],2*osize_2i[2], plan->Mem_mgr, plan->col_comm);
    plan->T_plan_1i= new T_Plan_gpu<double>(n[2],n[1],2, plan->Mem_mgr, plan->row_comm,osize_1i[0]);

    plan->T_plan_1->alloc_local=plan->alloc_max;
    plan->T_plan_2->alloc_local=plan->alloc_max;
    plan->T_plan_2i->alloc_local=plan->alloc_max;
    plan->T_plan_1i->alloc_local=plan->alloc_max;


    plan->iplan_0=NULL;
    plan->iplan_1=NULL;
    plan->iplan_2=NULL;

    int coords[2],np[2],periods[2];
    MPI_Cart_get(c_comm,2,np,periods,coords);

    if(flags==ACCFFT_MEASURE){
      if(coords[0]==0){
        plan->T_plan_1->which_fast_method_gpu(plan->T_plan_1,(double*)data_out_d,osize_0[0]);
      }
    }
    else{
      plan->T_plan_1->method=2;
      plan->T_plan_1->kway=2;
    }

    MPI_Bcast(&plan->T_plan_1->method,1, MPI_INT,0, c_comm );
    MPI_Bcast(&plan->T_plan_1->kway,1, MPI_INT,0, c_comm );
    MPI_Bcast(&plan->T_plan_1->kway_async,1, MPI::BOOL,0, c_comm );
    checkCuda_accfft (cudaDeviceSynchronize());
    MPI_Barrier(plan->c_comm);


    /* propagate the broadcast transpose settings; the inverse plans use the negated method */
    plan->T_plan_2->method  = plan->T_plan_1->method;
    plan->T_plan_2i->method = -plan->T_plan_1->method;
    plan->T_plan_1i->method = -plan->T_plan_1->method;

    plan->T_plan_2->kway  = plan->T_plan_1->kway;
    plan->T_plan_2i->kway = plan->T_plan_1->kway;
    plan->T_plan_1i->kway = plan->T_plan_1->kway;

    plan->T_plan_2->kway_async  = plan->T_plan_1->kway_async;
    plan->T_plan_2i->kway_async = plan->T_plan_1->kway_async;
    plan->T_plan_1i->kway_async = plan->T_plan_1->kway_async;


  }// end 2d c2c

  plan->c2c_plan_baked=true;
  return plan;
}// end accfft_plan_dft_c2c_gpu
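For reference, a hedged usage sketch of the planner documented above. Only accfft_create_comm, accfft_local_size_dft_c2c_gpu, ACCFFT_MEASURE, and accfft_plan_dft_3d_c2c_gpu appear in this file; the header name accfft_gpu.h and the execute/destroy calls shown in comments are assumptions about the rest of the AccFFT GPU API, not confirmed here.

// Hypothetical driver: the execute/destroy calls are left as comments because
// their exact signatures are not shown in this file and are assumed.
#include <mpi.h>
#include <cuda_runtime.h>
#include <accfft_gpu.h>   // assumed AccFFT GPU header name

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int nprocs;
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    // 2D process grid and Cartesian communicator, as required by the planner
    int c_dims[2] = {0, 0};
    MPI_Comm c_comm;
    accfft_create_comm(MPI_COMM_WORLD, c_dims, &c_comm);

    int n[3] = {64, 64, 64};
    int isize[3], istart[3], osize[3], ostart[3];
    int alloc_max = accfft_local_size_dft_c2c_gpu(n, isize, istart,
                                                  osize, ostart, c_comm);

    // out-of-place device buffers; alloc_max is used as the allocation size,
    // mirroring how the planner itself sizes its transpose buffers
    Complex *data_d = NULL, *data_out_d = NULL;
    cudaMalloc((void**)&data_d, alloc_max);
    cudaMalloc((void**)&data_out_d, alloc_max);

    accfft_plan_gpu *plan =
        accfft_plan_dft_3d_c2c_gpu(n, data_d, data_out_d, c_comm, ACCFFT_MEASURE);

    // ... fill data_d on the device, then (assumed API):
    // accfft_execute_c2c_gpu(plan, ACCFFT_FORWARD, data_d, data_out_d);
    // accfft_destroy_plan_gpu(plan);

    cudaFree(data_d);
    cudaFree(data_out_d);
    MPI_Finalize();
    return 0;
}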