Example #1
File: grid.c  Project: Thundzz/TDP
void create_grid(int myrank, int gd,
	MPI_Comm* comm_grid, MPI_Comm* comm_row, MPI_Comm* comm_col)
{
	int dims[2] = {gd, gd};
	int coords[2]; // coords[0] = i, coords[1] = j
	int periods[2];
	int reorder;

	int grid_rank;
	int subdivision[2];

	periods[0] = 0 ; 
	periods[1] = 1 ;
	reorder = 1 ;
	MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, reorder, comm_grid);

	MPI_Cart_coords(*comm_grid, myrank, 2, coords); //Outputs the i,j coordinates of the process
	MPI_Cart_rank(*comm_grid, coords, &grid_rank);  //Outputs the rank of the process

	subdivision[0] = 1;
	subdivision[1] = 0;
 	MPI_Cart_sub (*comm_grid,subdivision,comm_col); // Column communicator (i varies, j fixed)
 	subdivision[0] = 0;
 	subdivision[1] = 1;
 	MPI_Cart_sub (*comm_grid,subdivision,comm_row); // Row communicator (j varies, i fixed)
}
Example #2
File: fox.c  Project: The-coders/fox
void grid_setup(struct grid_info *grid) {
    /* get global data */
    MPI_Comm_size(MPI_COMM_WORLD, &(grid->nr_world_processes));
    MPI_Comm_rank(MPI_COMM_WORLD, &(grid->my_world_rank));

    /* compute how many processes per side the grid will have */
    grid->ppside = intsqrt(grid->nr_world_processes);

    /* create the communicator for the grid topology */
    int dimensions[2]  = {grid->ppside, grid->ppside};
    int wrap_around[2] = {TRUE, TRUE};
    int reorder = TRUE;
    MPI_Cart_create(MPI_COMM_WORLD, 2, dimensions, wrap_around, reorder, &(grid->comm));
    MPI_Comm_rank(grid->comm, &(grid->my_rank));

    /* get the process's grid coordinates */
    int coordinates[2];
    MPI_Cart_coords(grid->comm, grid->my_rank, 2, coordinates);
    grid->my_row = coordinates[0];
    grid->my_col = coordinates[1];

    /* get communicators for the process's row and column */
    int free_coords_for_rows[] = {FALSE, TRUE};
    int free_coords_for_cols[] = {TRUE, FALSE};
    MPI_Cart_sub(grid->comm, free_coords_for_rows, &(grid->row_comm));
    MPI_Cart_sub(grid->comm, free_coords_for_cols, &(grid->col_comm));
}
Example #3
void initialiseMonde(int argc, char** argv)
{
	int remain[2], periods[2], reorder, i, nb_thread;

	MPI_Init (&argc, &argv);      				  /* starts MPI */
	MPI_Comm_rank (MPI_COMM_WORLD, &rank);        /* get current process id */
	MPI_Comm_size (MPI_COMM_WORLD, &size);        /* get number of processes */

	// Check that at least a second argument was provided
	if(argc < 3)
	{
		MPI_Finalize();
		if(rank == 0)
			printf("Le nombre d'arguments n'est pas suffisant\nAssurez vous de préciser en premier argument le nombre de thread puis la taille de matrice");
		exit(1);
	}
	nb_thread = atoi(argv[1]);
	omp_set_num_threads(nb_thread);

	taille_matrice = atoi(argv[2]);

	racine = sqrt(size);
	if (racine * racine != size || taille_matrice % racine != 0)
	{
		MPI_Finalize();
		if(rank == 0)
			printf("Le nombre de processus MPI n'est pas cohérent avec la taille de la matrice\n");
		exit(1);
	}

	taille_block = taille_matrice / racine;

	tab_dim[LIGNE]   = racine;
	tab_dim[COLONNE] = racine;
	periods[LIGNE]   = 0;
	periods[COLONNE] = 0;
	reorder = 0;

	MPI_Cart_create(MPI_COMM_WORLD, 2, tab_dim, periods, reorder, &COMM_CART);
	// Get the rank in the Cartesian communicator
	MPI_Comm_rank(COMM_CART, &rankCart);

	// Get the coordinates in the Cartesian communicator
	MPI_Cart_coords(COMM_CART, rankCart, 2, coords);

	// Create the row communicator
	remain[LIGNE]   = 1;
	remain[COLONNE] = 0;
	MPI_Cart_sub(COMM_CART, remain, &COMM_ROWS);
	// Get the rank in the row communicator
	MPI_Comm_rank(COMM_ROWS, &rankRow);

	// Create the column communicator
	remain[LIGNE]   = 0;
	remain[COLONNE] = 1;
	MPI_Cart_sub(COMM_CART, remain, &COMM_COLS);
	// Get the rank in the column communicator
	MPI_Comm_rank(COMM_COLS, &rankCol);
}
Example #4
File: grid.c  Project: Moeryn/PRCD-TP2
void create_grid(GRID_INFO * grid){
  int old_rank;
  int size;
  MPI_Comm_rank(MPI_COMM_WORLD,&old_rank);
  
#ifdef DEBUG_MODE
  if(old_rank == 0) {   DEBUG("Creating grid ...") }
#endif
  
  /* Setting up order of grid */
  MPI_Comm_size(MPI_COMM_WORLD,&size);
  grid->processes = size;
  grid->grid_order = sqrt(size);
  
#ifdef DEBUG_MODE
  if(old_rank == 0){ DEBUGA(" Order %d",grid->grid_order)};
#endif
  
  int dimensions[2] = {grid->grid_order,grid->grid_order};
  int periods[2] = {1,1};

  /* Creating the grid */
  MPI_Cart_create(MPI_COMM_WORLD,2,dimensions,periods,1, &(grid->grid_comm));


  /* Get rank in the grid */
  MPI_Comm_rank(grid->grid_comm,&(grid->rank));

  /* Get coordinates in the grid */
  int coord[2];
  MPI_Cart_coords(grid->grid_comm,grid->rank, 2, coord);
  grid->row = coord[0];
  grid->col = coord[1];

#ifdef DEBUG_MODE
  if(old_rank == 0) {  DEBUG(" Creating row communicator") }
#endif


  /* Creating row communicator */
  int variable_coord[2] = {0,1};
  MPI_Cart_sub(grid->grid_comm,variable_coord,&(grid->row_comm));


#ifdef DEBUG_MODE
  if(old_rank == 0) {  DEBUG(" Creating col communicator") }
#endif
    
  /* Creating column communicator */
  variable_coord[0] = 1;
  variable_coord[1] = 0;
  MPI_Cart_sub(grid->grid_comm,variable_coord,&(grid->col_comm));
    

#ifdef DEBUG_MODE
  if(old_rank == 0) {  DEBUG("Grid created.") }
#endif
}
Example #5
void split_communicator(MPI_Comm comm, MPI_Comm cart[], int P[]) {
    int wrap[]= {0,0};
    int coor[2];
    MPI_Comm gcart;
    MPI_Cart_create(comm,2,P,wrap,1,&gcart); // second-to-last argument: reorder = 1
    MPI_Cart_get(gcart,2,P,wrap,coor);
    int rdim1[] = {0,1}, rdim2[] = {1,0};
    MPI_Cart_sub(gcart, rdim1 , &cart[0]);
    MPI_Cart_sub(gcart, rdim2 , &cart[1]);
}
Example #6
File: summa.cpp  Project: danfergo/cpar
void summa(MPI_Comm comm_cart, const int m_block, const int n_block, const int k_block, double A_local[], double B_local[], double C_local[]) {
    // determine my cart coords
    int coords[2];
    MPI_Cart_coords(comm_cart, rank, 2, coords);

    const int my_row = coords[0];
    const int my_col = coords[1];

    int belongs[2];

    // create row comms for A
    MPI_Comm row_comm;
    belongs[0] = 0;
    belongs[1] = 1;
    MPI_Cart_sub(comm_cart, belongs, &row_comm);

    // create col comms for B
    MPI_Comm col_comm;
    belongs[0] = 1;
    belongs[1] = 0;
    MPI_Cart_sub(comm_cart, belongs, &col_comm);

    /*int row_rank, col_rank;
    MPI_Comm_rank(row_comm, &row_rank);
    MPI_Comm_rank(col_comm, &col_rank);
    if(rank == 1) std::cout << "Rank: " << rank << "-->(" << my_col << "," << my_row << ") (" << col_rank << "," << row_rank << ")" << std::endl;
    //printf("Rank: %i-->(%i,%i)|(%i,%i)\n", rank, my_col, my_row, col_rank, row_rank);*/

    double * A_saved = (double *) calloc(m_block * n_block, sizeof(double));
    double * B_saved = (double *) calloc(n_block * k_block, sizeof(double));
    double * C_tmp = (double *) calloc(m_block * k_block, sizeof(double));

    memcpy(A_saved, A_local, m_block * n_block * sizeof(double));
    memcpy(B_saved, B_local, n_block * k_block * sizeof(double));

    int number_blocks = n / n_block;
    for(int broadcaster = 0; broadcaster < number_blocks; ++broadcaster){
        if (my_col == broadcaster) {
            memcpy(A_local, A_saved, m_block * n_block * sizeof(double));
        }

        MPI_Bcast(A_local, m_block * n_block, MPI_DOUBLE, broadcaster, row_comm);

        if (my_row == broadcaster) {
            memcpy(B_local, B_saved, n_block * k_block * sizeof(double));
        }

        MPI_Bcast(B_local, n_block * k_block, MPI_DOUBLE, broadcaster, col_comm);

        multMatricesLineByLine(m_block, n_block, k_block, A_local, B_local, C_tmp);

        sumMatrix(m_block, n_block, C_local, C_tmp, C_local);
    }
}
Example #7
  //define the cartesian grid
  void create_MPI_cartesian_grid()
  {
#ifdef USE_MPI
    coords periods;
    for(int mu=0;mu<NDIM;mu++) periods[mu]=1;
    MPI_Cart_create(MPI_COMM_WORLD,NDIM,nrank_dir,periods,1,&cart_comm);
    //get the rank and coords of the local rank
    MPI_Comm_rank(cart_comm,&cart_rank);
    MPI_Cart_coords(cart_comm,cart_rank,NDIM,rank_coord);
    
    //create a communicator along each plane
    for(int mu=0;mu<NDIM;mu++)
      {
	coords split_plan;
	coords proj_rank_coord;
	for(int nu=0;nu<NDIM;nu++)
	  {
	    split_plan[nu]=(nu==mu) ? 0 : 1;
	    proj_rank_coord[nu]=(nu==mu) ? 0 : rank_coord[nu];
	  }
	MPI_Cart_sub(cart_comm,split_plan,&(plan_comm[mu]));
	MPI_Comm_rank(plan_comm[mu],&(plan_rank[mu]));
	if(plan_rank[mu]!=rank_of_coord(proj_rank_coord))
	  crash("Plan communicator has messed up coord: %d and rank %d (implement reorder!)",
		rank_of_coord(proj_rank_coord),plan_rank[mu]);
      }
    
    //create communicator along line
    for(int mu=0;mu<NDIM;mu++)
      {
	//split the communicator
	coords split_line;
	memset(split_line,0,sizeof(coords));
	split_line[mu]=1;
	MPI_Cart_sub(cart_comm,split_line,&(line_comm[mu]));
	
	//get rank id
	MPI_Comm_rank(line_comm[mu],&(line_rank[mu]));
	
	//get rank coord along line comm
	MPI_Cart_coords(line_comm[mu],line_rank[mu],1,&(line_coord_rank[mu]));
	
	//check communicator
	if(line_rank[mu]!=rank_coord[mu] || line_rank[mu]!=line_coord_rank[mu])
	  crash("Line communicator has messed up coord and rank (implement reorder!)");
      }
#else
    cart_rank=plan_rank=line_rank=0;
    for(int mu=0;mu<NDIM;mu++) rank_coord[mu]=planline_coord[mu]=0;
#endif
  }
Example #8
File: mpla.cpp  Project: zaspel/MPLA
void mpla_generic_dgemv(struct mpla_vector* b, struct mpla_generic_matrix* A, struct mpla_vector* x, void (*mpla_dgemv_core)(struct mpla_vector*, struct mpla_generic_matrix*, struct mpla_vector*, struct mpla_instance*), struct mpla_instance* instance)
{
	// allocate redistributed vector
	struct mpla_vector x_redist;
	mpla_init_vector_for_block_rows(&x_redist, instance, x->vec_row_count);

	// redistribute input vector with row-block parallel distribution to column-block parallel distribution
	mpla_redistribute_vector_for_generic_dgesv(&x_redist, x, A, instance);
	
	// generic computation core: matrix-vector product
	mpla_dgemv_core(b, A, &x_redist, instance);

	// create sub-communicator for each process row
	int remain_dims[2];
	remain_dims[0]=0;
	remain_dims[1]=1;
	MPI_Comm row_comm;
	MPI_Cart_sub(instance->comm, remain_dims, &row_comm);

	// summation of block row results
	double* sum;
	cudaMalloc((void**)&sum, sizeof(double)*b->cur_proc_row_count);
	cudaThreadSynchronize();
	checkCUDAError("cudaMalloc");
	MPI_Allreduce(b->data, sum, b->cur_proc_row_count, MPI_DOUBLE, MPI_SUM, row_comm);
	cudaMemcpy(b->data, sum, sizeof(double)*b->cur_proc_row_count, cudaMemcpyDeviceToDevice);

	// cleanup
	cudaFree(sum);
	mpla_free_vector(&x_redist, instance);

	MPI_Comm_free(&row_comm);
}
Example #9
void mpi_cart_sub_f(MPI_Fint *comm, ompi_fortran_logical_t *remain_dims,
                    MPI_Fint *new_comm, MPI_Fint *ierr)
{
    MPI_Comm c_comm, c_new_comm;
    /*
     * Just in case sizeof(logical) != sizeof(int) and the Fortran TRUE
     * value != 1, we have to convert -- then we need to know the number
     * of dimensions, i.e. the size of remain_dims.
     */
#if OMPI_FORTRAN_MUST_CONVERT_LOGICAL_2_INT == 1
    int ndims;
#endif
    OMPI_LOGICAL_ARRAY_NAME_DECL(remain_dims);

    c_comm = MPI_Comm_f2c(*comm);
    c_new_comm = MPI_Comm_f2c(*new_comm);

#if OMPI_FORTRAN_MUST_CONVERT_LOGICAL_2_INT == 1
    *ierr = OMPI_INT_2_FINT(MPI_Cartdim_get(c_comm, &ndims));
    if (MPI_SUCCESS != OMPI_FINT_2_INT(*ierr)) {
        return;
    }
#endif
    OMPI_ARRAY_LOGICAL_2_INT(remain_dims, ndims);

    *ierr = OMPI_INT_2_FINT(MPI_Cart_sub(c_comm,
                              OMPI_LOGICAL_ARRAY_NAME_CONVERT(remain_dims),
                              &c_new_comm));
    if (MPI_SUCCESS == OMPI_FINT_2_INT(*ierr)) {
        *new_comm = MPI_Comm_c2f(c_new_comm);
    }

    OMPI_ARRAY_INT_2_LOGICAL(remain_dims, ndims);
}
Example #10
File: mpla.cpp  Project: zaspel/MPLA
void mpla_copy_distributed_vector_to_cpu(double* x_cpu, struct mpla_vector* x, struct mpla_instance* instance)
{
        // create sub-communicator for each process column
        int remain_dims[2];
        remain_dims[0]=1;
        remain_dims[1]=0;
        MPI_Comm column_comm;
        MPI_Cart_sub(instance->comm, remain_dims, &column_comm);
        int column_rank;
        MPI_Comm_rank(column_comm, &column_rank);

        // columnwise creation of the full vector
        double* full_vector = x_cpu;
        int* recvcounts = new int[instance->proc_rows];
        int* displs = new int[instance->proc_rows];
        for (int i=0; i<instance->proc_rows; i++)
        {
                recvcounts[i] = x->proc_row_count[i][instance->cur_proc_col];
                displs[i] = x->proc_row_offset[i][instance->cur_proc_col];
        }
//        cudaMalloc((void**)&full_vector, sizeof(double)*x->vec_row_count);
//        cudaThreadSynchronize();
//        checkCUDAError("cudaMalloc");
        MPI_Allgatherv(x->data, x->cur_proc_row_count, MPI_DOUBLE, full_vector, recvcounts, displs, MPI_DOUBLE, column_comm);

        // memory cleanup
        MPI_Comm_free(&column_comm);

        MPI_Barrier(instance->comm);
}
Example #11
void mpi_manager_2D::get_SubComms() {
	//! Obtain ranks and communicators for 1D

	int remain_dims[2];
	// x-direction:
	remain_dims[0] = 1;
	remain_dims[1] = 0;
	MPI_Cart_sub(comm2d, remain_dims, &comm_line_x);
	MPI_Comm_rank(comm_line_x, &rank_line_x);

	// y-direction
	remain_dims[0] = 0;
	remain_dims[1] = 1;
	MPI_Cart_sub(comm2d, remain_dims, &comm_line_y);
	MPI_Comm_rank(comm_line_y, &rank_line_y);
}
Example #12
void mpif_cart_sub_(MPI_Fint *old_comm, int *belongs, MPI_Fint *new_comm, int *error)
{
  MPI_Comm old_comm_c = MPI_Comm_f2c(*old_comm);
  MPI_Comm new_comm_c;

  *error = MPI_Cart_sub(old_comm_c, belongs, &new_comm_c);
  *new_comm = MPI_Comm_c2f(new_comm_c);
}
Example #13
 int main (int argc, char** argv)
 {
    int num_tasks;

    char hostname[80];

    int dims[DIM];
    dims[0] = DIM_0;
    dims[1] = DIM_1;
    dims[2] = DIM_2;

    int periods[DIM] = {false, false, false};
    int reorder = true;
    int my_rank;

    int coords[DIM];

    MPI_Comm cartcomm, y_comm;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &num_tasks);

    if (num_tasks != SIZE) {
        if (my_rank == 0) {
            printf("We need %d proccesses, %d given. Exiting.\n", SIZE, num_tasks);
        }
        
        MPI_Finalize();

		return 0;
        
    }         
    
    gethostname(hostname, 79);
	MPI_Cart_create(MPI_COMM_WORLD, DIM, dims, periods, reorder, &cartcomm);
	MPI_Cart_coords(cartcomm, my_rank, DIM, coords);
	printf("%-15.12s: MPI_COMM_WORLD rank %2d: (%d, %d, %d)\n", hostname, my_rank, coords[0], coords[1], coords[2]);
	
	//neighbors
	int src, dest;
	for (int i = 0; i < 3; i++) {
		MPI_Cart_shift(cartcomm, i, +1, &src, &dest);
		printf("I am %d and my right neighbor in dim %d is %d\n", my_rank, i, dest);
		MPI_Cart_shift(cartcomm, i, -1, &src, &dest);
		printf("I am %d and my left neighbor in dim %d is %d\n", my_rank, i, dest);
	}
	
	
	int keep_dims[DIM] = {0, 1, 0}; // keep only the y dimension
	MPI_Cart_sub(cartcomm, keep_dims, &y_comm);
	printf("%d: my y rank is %d\n", my_rank, coords[1]);

    MPI_Finalize();

    return 0;
 }
Example #14
/**
 *  \brief Given a communicator for a 2D process grid, this routine
 *  returns a new communicator consisting only of the process-grid row
 *  in which the calling process belongs.
 *
 *  Example: If the process grid is 2 x 3, e.g.,
 *
 *     (0, 0)  |  (0, 1)  |  (0, 2)
 *     (1, 0)  |  (1, 1)  |  (1, 2)
 *
 *  and process (1, 1) calls this routine, then the routine will
 *  return a communicator containing the processes, {(1, 0), (1, 1),
 *  (1, 2)}.
 */
static
MPI_Comm
getCommRow__ (MPI_Comm comm2d)
{
  int select[2] = {0, 1};
  MPI_Comm comm_row;
  MPI_Cart_sub (comm2d, select, &comm_row);
  return comm_row;
}
Example #15
/**
 *  \brief Given a communicator for a 2D process grid, this routine
 *  returns a new communicator consisting only of the process-grid
 *  column in which the calling process belongs.
 *
 *  Example: If the process grid is 2 x 3, e.g.,
 *
 *     (0, 0)  |  (0, 1)  |  (0, 2)
 *     (1, 0)  |  (1, 1)  |  (1, 2)
 *
 *  and process (1, 1) calls this routine, then the routine will
 *  return a communicator containing the processes {(0, 1), (1, 1)}.
 */
static
MPI_Comm
getCommCol__ (MPI_Comm comm2d)
{
  int select[2] = {1, 0};
  MPI_Comm comm_col;
  MPI_Cart_sub (comm2d, select, &comm_col);
  return comm_col;
}
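For reference, a minimal usage sketch (not part of the original listings) that builds the 2 x 3 grid from the doc comments above and derives the row and column communicators with getCommRow__ and getCommCol__. It assumes the code sits in the same source file as those static helpers and that exactly 6 MPI processes are launched; error handling is omitted.

/* Usage sketch for getCommRow__ / getCommCol__ (assumes exactly 6 processes). */
#include <mpi.h>
#include <stdio.h>

int main (int argc, char **argv)
{
  MPI_Init (&argc, &argv);

  int dims[2] = {2, 3};        /* the 2 x 3 grid used in the comments above */
  int periods[2] = {0, 0};
  MPI_Comm comm2d;
  MPI_Cart_create (MPI_COMM_WORLD, 2, dims, periods, 1, &comm2d);

  MPI_Comm comm_row = getCommRow__ (comm2d);   /* peers in my grid row */
  MPI_Comm comm_col = getCommCol__ (comm2d);   /* peers in my grid column */

  int row_rank, col_rank;
  MPI_Comm_rank (comm_row, &row_rank);
  MPI_Comm_rank (comm_col, &col_rank);
  printf ("row rank %d, col rank %d\n", row_rank, col_rank);

  MPI_Comm_free (&comm_row);
  MPI_Comm_free (&comm_col);
  MPI_Comm_free (&comm2d);
  MPI_Finalize ();
  return 0;
}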
Example #16
void Setup_grid(
                GRID_INFO_T*  grid  /* out */) {
    int old_rank;
    int dimensions[2];
    int wrap_around[2];
    int coordinates[2];
    int free_coords[2];
    
    /* Set up Global Grid Information */
    MPI_Comm_size(MPI_COMM_WORLD, &(grid->p));
    MPI_Comm_rank(MPI_COMM_WORLD, &old_rank);
    
    /* We assume p is a perfect square */
    grid->q = (int) sqrt((double) grid->p);
    dimensions[0] = dimensions[1] = grid->q;
    
    /* We want a circular shift in second dimension. */
    /* Don't care about first                        */
    wrap_around[0] = wrap_around[1] = 1;
    MPI_Cart_create(MPI_COMM_WORLD, 2, dimensions,
                    wrap_around, 1, &(grid->comm));
    MPI_Comm_rank(grid->comm, &(grid->my_rank));
    MPI_Cart_coords(grid->comm, grid->my_rank, 2,
                    coordinates);
    grid->my_row = coordinates[0];
    grid->my_col = coordinates[1];
    
    /* Set up row communicators */
    free_coords[0] = 0;
    free_coords[1] = 1;
    MPI_Cart_sub(grid->comm, free_coords,
                 &(grid->row_comm));
    
    /* Set up column communicators */
    free_coords[0] = 1;
    free_coords[1] = 0;
    MPI_Cart_sub(grid->comm, free_coords,
                 &(grid->col_comm));
} /* Setup_grid */
Example #17
void Setup_grid(GRID_INFO_TYPE* grid){
	int dimensions[2];
	int periods[2];
	int coordinates[2];
	int varying_coords[2];
	MPI_Comm localname;

	MPI_Comm_size(MPI_COMM_WORLD, &(grid->p));  /* get the total number of processes */
	grid->q = (int)sqrt((double) grid->p);      /* square grid */
	dimensions[0] = dimensions[1] = grid->q;    /* dimensions of the grid */
	periods[0] = periods[1] = 1;                /* periodic if 1, non-periodic if 0 */

	MPI_Cart_create(MPI_COMM_WORLD, 2, dimensions, periods, 1, &(grid->comm)); /* create a 2D grid communicator from MPI_COMM_WORLD, allowing the process-to-processor mapping to be reordered */

	MPI_Comm_rank(grid->comm, &(grid->my_rank));
	MPI_Cart_coords(grid->comm, grid->my_rank, 2, coordinates);
	grid->my_row = coordinates[0];
	grid->my_col = coordinates[1];
	varying_coords[0] = 0; varying_coords[1] = 1;
	MPI_Cart_sub(grid->comm, varying_coords, &(grid->row_comm));
	varying_coords[0] = 1; varying_coords[1] = 0;
	MPI_Cart_sub(grid->comm, varying_coords, &(grid->col_comm));
}
Example #18
int main(int argc, char **argv)
{
  MPI_Init(&argc, &argv);

  int i, myrank, numranks, groupsize;
  int dims[3] = {0, 0, 0};          
  int temp[3] = {0, 0, 0};          
  int coord[3] = {0, 0, 0};          
  int periods[3] = {1, 1, 1};
  double startTime, stopTime;

  MPI_Comm cartcomm, subcomm;

  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
  MPI_Comm_size(MPI_COMM_WORLD, &numranks);

  dims[MP_X] = atoi(argv[1]);
  dims[MP_Y] = atoi(argv[2]);
  dims[MP_Z] = atoi(argv[3]);
  MPI_Dims_create(numranks, 3, dims);
  MPI_Cart_create(MPI_COMM_WORLD, 3, dims, periods, 1, &cartcomm);
  MPI_Cart_get(cartcomm, 3, dims, periods, coord);
  temp[MP_X] = 0; temp[MP_Y] = 1; temp[MP_Z] = 0;
  MPI_Cart_sub(cartcomm, temp, &subcomm);

  MPI_Comm_size(subcomm,&groupsize);
  int perrank = atoi(argv[4]);
  char *sendbuf = (char*)malloc(perrank*groupsize);
  char *recvbuf = (char*)malloc(perrank*groupsize);

  MPI_Barrier(cartcomm);
  MPI_Pcontrol(1);
  startTime = MPI_Wtime();

  for (i=0; i<MAX_ITER; i++) {
    MPI_Alltoall(sendbuf, perrank, MPI_CHAR, recvbuf, perrank, MPI_CHAR, subcomm);
  }

  MPI_Barrier(cartcomm);
  stopTime = MPI_Wtime();
  MPI_Pcontrol(0);

  if(myrank == 0) {
    printf("Completed %d iterations for subcom size %d, perrank %d\n", i, groupsize, perrank);
    printf("Time elapsed: %f\n", stopTime - startTime);
  }

  MPI_Finalize();
  return 0;
}
Example #19
    void TreeCommunicator::comm_create(const MPI_Comm &comm)
    {
        int num_dim = m_fan_out.size();
        int color, key;
        MPI_Comm comm_cart;
        std::vector<int> flags(num_dim);
        std::vector<int> coords(num_dim);
        int rank_cart;

        memset(flags.data(), 0, sizeof(int)*num_dim);
        flags[0] = 1;
        check_mpi(MPI_Cart_create(comm, num_dim, m_fan_out.data(), flags.data(), 1, &comm_cart));
        check_mpi(MPI_Comm_rank(comm_cart, &rank_cart));
        check_mpi(MPI_Cart_coords(comm_cart, rank_cart, num_dim, coords.data()));
        check_mpi(MPI_Cart_sub(comm_cart, flags.data(), &(m_comm[0])));
        for (int i = 1; i < num_dim; ++i) {
            if (coords[i-1] == 0) {
                color = 1;
                key = coords[i];
            }
            else {
                color = MPI_UNDEFINED;
                key = 0;
            }
            check_mpi(MPI_Comm_split(comm_cart, color, key, &(m_comm[i])));
        }
        check_mpi(MPI_Comm_free(&comm_cart));

        m_num_level = 0;
        for (auto comm_it = m_comm.begin();
             comm_it != m_comm.end() && *comm_it != MPI_COMM_NULL;
             ++comm_it) {
            m_num_level++;
        }

        m_comm.resize(m_num_level);

        if (m_global_policy) {
            m_num_level++;
        }

        if (rank_cart == 0 && m_global_policy == NULL) {
            throw Exception("process at root of tree communicator has not mapped the control file", GEOPM_ERROR_CTL_COMM, __FILE__, __LINE__);
        }
        if (rank_cart != 0 && m_global_policy != NULL) {
            throw Exception("process not at root of tree communicator has mapped the control file", GEOPM_ERROR_CTL_COMM, __FILE__, __LINE__);
        }
    }
Example #20
File: mpla.cpp  Project: zaspel/MPLA
void mpla_save_vector(struct mpla_vector* x, char* filename, struct mpla_instance* instance)
{
        // create sub-communicator for each process column
        int remain_dims[2];
        remain_dims[0]=1;
        remain_dims[1]=0;
        MPI_Comm column_comm;
        MPI_Cart_sub(instance->comm, remain_dims, &column_comm);
        int column_rank;
        MPI_Comm_rank(column_comm, &column_rank);

        // columnwise creation of the full vector
        double* full_vector;
        int* recvcounts = new int[instance->proc_rows];
        int* displs = new int[instance->proc_rows];
        for (int i=0; i<instance->proc_rows; i++)
        {
                recvcounts[i] = x->proc_row_count[i][instance->cur_proc_col];
                displs[i] = x->proc_row_offset[i][instance->cur_proc_col];
        }
        cudaMalloc((void**)&full_vector, sizeof(double)*x->vec_row_count);
        cudaThreadSynchronize();
        checkCUDAError("cudaMalloc");
        MPI_Allgatherv(x->data, x->cur_proc_row_count, MPI_DOUBLE, full_vector, recvcounts, displs, MPI_DOUBLE, column_comm);

	// writing full vector to file on parent process	
	if (instance->is_parent)
	{
		FILE* f = fopen(filename, "wb");

		double* full_vector_host = new double[x->vec_row_count];
		cudaMemcpy(full_vector_host, full_vector, x->vec_row_count*sizeof(double), cudaMemcpyDeviceToHost);
	
		fwrite(&(x->vec_row_count), sizeof(int), 1, f);
			
		fwrite(full_vector_host, sizeof(double), x->vec_row_count, f);

		fclose(f);	

		delete [] full_vector_host;
	}

        // memory cleanup
        cudaFree(full_vector);
	MPI_Comm_free(&column_comm);

	MPI_Barrier(instance->comm);
}
Example #21
static void extract_comm_1d(
    int dim, MPI_Comm comm_cart,
    MPI_Comm *comm_1d
    )
{
  int ndims, *remain_dims;
  MPI_Cartdim_get(comm_cart, &ndims);
  
  remain_dims = (int *) malloc(sizeof(int) * (size_t) ndims);
  for(int t=0; t<ndims; t++)
    remain_dims[t] = (t==dim) ? 1 : 0;

  MPI_Cart_sub(comm_cart, remain_dims, comm_1d);
  
  free(remain_dims);
}
Example #22
File: mpla.cpp  Project: zaspel/MPLA
void mpla_ddot(double* xy, struct mpla_vector* x, struct mpla_vector* y, struct mpla_instance* instance)
{
	// compute process-wise dot product
	double xy_tmp;
	cublasDdot(instance->cublas_handle, x->cur_proc_row_count, x->data, 1, y->data, 1, &xy_tmp);

	// create sub-communicator for each process column
	int remain_dims[2];
	remain_dims[0]=1;
	remain_dims[1]=0;
	MPI_Comm column_comm;
	MPI_Cart_sub(instance->comm, remain_dims, &column_comm);

	// parallel summation and communication
	MPI_Allreduce(&xy_tmp, xy, 1, MPI_DOUBLE, MPI_SUM, column_comm);

	MPI_Comm_free(&column_comm);
}
Example #23
FORT_DLL_SPEC void FORT_CALL mpi_cart_sub_ ( MPI_Fint *v1, MPI_Fint v2[], MPI_Fint *v3, MPI_Fint *ierr ){
    int _ctsize;
    int *l2=0;
    {int _topotype;
    PMPI_Topo_test( (MPI_Comm)*v1, &_topotype );
    if (_topotype != MPI_CART) {
        _ctsize = 0;
    }
    else 
        PMPI_Cartdim_get( (MPI_Comm)*v1, &_ctsize );
    }

    if (_ctsize) {int li;
     l2 = (int *)MPL_malloc(_ctsize * sizeof(int), MPL_MEM_OTHER);
     for (li=0; li<_ctsize; li++) {
        l2[li] = MPII_FROM_FLOG(v2[li]);
     }
    }
    *ierr = MPI_Cart_sub( (MPI_Comm)(*v1), l2, (MPI_Comm *)(v3) );
    if (l2) { MPL_free( l2 ); }
}
Example #24
File: mpla.cpp  Project: zaspel/MPLA
void mpla_redistribute_vector_for_generic_dgesv(struct mpla_vector* b_redist, struct mpla_vector* b, struct mpla_generic_matrix* A, struct mpla_instance* instance)
{
	// attention: this code does no correctness check for the input data


	// WARNING: The following code is not efficient for a strong parallelization !!!!!


	// create sub-communicator for each process column
	int remain_dims[2];
	remain_dims[0]=1;
	remain_dims[1]=0;
	MPI_Comm column_comm;
	MPI_Cart_sub(instance->comm, remain_dims, &column_comm);
	int column_rank;
	MPI_Comm_rank(column_comm, &column_rank);
	
	// columnwise creation of the full vector
	double* full_vector;
	int* recvcounts = new int[instance->proc_rows];
	int* displs = new int[instance->proc_rows];
	for (int i=0; i<instance->proc_rows; i++)
	{
		recvcounts[i] = b->proc_row_count[i][instance->cur_proc_col];
		displs[i] = b->proc_row_offset[i][instance->cur_proc_col];
	}
	cudaMalloc((void**)&full_vector, sizeof(double)*b->vec_row_count);
	cudaThreadSynchronize();
	checkCUDAError("cudaMalloc");
	MPI_Allgatherv(b->data, b->cur_proc_row_count, MPI_DOUBLE, full_vector, recvcounts, displs, MPI_DOUBLE, column_comm);

	// extract column-wise local part of full vector
	cudaMemcpy(b_redist->data, &(full_vector[b_redist->cur_proc_row_offset]), sizeof(double)*b_redist->cur_proc_row_count, cudaMemcpyDeviceToDevice);

	// memory cleanup
	cudaFree(full_vector);


	MPI_Comm_free(&column_comm);
}
Example #25
int main (int argc, char **argv) {
	FILE *fp;
	double **A = NULL, **B = NULL, **C = NULL, *A_array = NULL, *B_array = NULL, *C_array = NULL;
	double *A_local_block = NULL, *B_local_block = NULL, *C_local_block = NULL;
	int A_rows, A_columns, A_local_block_rows, A_local_block_columns, A_local_block_size;
	int B_rows, B_columns, B_local_block_rows, B_local_block_columns, B_local_block_size;
	int rank, size, sqrt_size, matrices_a_b_dimensions[4];
	MPI_Comm cartesian_grid_communicator, row_communicator, column_communicator;
	MPI_Status status; 
	MPI_Request request1,request2;

	// used to manage the cartesian grid
	int dimensions[2], periods[2], coordinates[2], remain_dims[2];

	MPI_Init(&argc, &argv);
	MPI_Comm_size(MPI_COMM_WORLD, &size);
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);

	/* For square mesh */
	sqrt_size = (int)sqrt((double) size);             
	if(sqrt_size * sqrt_size != size){
		if( rank == 0 ) perror("need to run mpiexec with a perfect square number of processes\n");
		MPI_Abort(MPI_COMM_WORLD, -1);
	}

	// create a 2D cartesian grid 
	dimensions[0] = dimensions[1] = sqrt_size;
	periods[0] = periods[1] = 1;    
	MPI_Cart_create(MPI_COMM_WORLD, 2, dimensions, periods, 1, &cartesian_grid_communicator);
	MPI_Cart_coords(cartesian_grid_communicator, rank, 2, coordinates);

	// create a row communicator
	remain_dims[0] = 0;            
	remain_dims[1] = 1; 
	MPI_Cart_sub(cartesian_grid_communicator, remain_dims, &row_communicator);

	// create a column communicator
	remain_dims[0] = 1;
	remain_dims[1] = 0;
	MPI_Cart_sub(cartesian_grid_communicator, remain_dims, &column_communicator);

	// getting matrices from files at rank 0 only
	// example: mpiexec -n 64 ./cannon matrix1 matrix2 [test]
	if (rank == 0){
		int row, column;
		if ((fp = fopen (argv[1], "r")) != NULL){
			fscanf(fp, "%d %d\n", &matrices_a_b_dimensions[0], &matrices_a_b_dimensions[1]);
			A = (double **) malloc (matrices_a_b_dimensions[0] * sizeof(double *));
			for (row = 0; row < matrices_a_b_dimensions[0]; row++){
				A[row] = (double *) malloc(matrices_a_b_dimensions[1] * sizeof(double));
				for (column = 0; column < matrices_a_b_dimensions[1]; column++)
					fscanf(fp, "%lf", &A[row][column]);
			}
			fclose(fp);
		} else {
			if(rank == 0) fprintf(stderr, "error opening file for matrix A (%s)\n", argv[1]);
			MPI_Abort(MPI_COMM_WORLD, -1);
		}
		if((fp = fopen (argv[2], "r")) != NULL){
			fscanf(fp, "%d %d\n", &matrices_a_b_dimensions[2], &matrices_a_b_dimensions[3]);
			B = (double **) malloc (matrices_a_b_dimensions[2] * sizeof(double *));
			for(row = 0; row < matrices_a_b_dimensions[2]; row++){
				B[row] = (double *) malloc(matrices_a_b_dimensions[3] * sizeof(double *));
				for(column = 0; column < matrices_a_b_dimensions[3]; column++)
					fscanf(fp, "%lf", &B[row][column]);
			}
			fclose(fp);
		} else {
			if(rank == 0) fprintf(stderr, "error opening file for matrix B (%s)\n", argv[2]);
			MPI_Abort(MPI_COMM_WORLD, -1);
		}

		// need to check that the multiplication is possible given dimensions 
		// matrices_a_b_dimensions[0] = row size of A
		// matrices_a_b_dimensions[1] = column size of A
		// matrices_a_b_dimensions[2] = row size of B
		// matrices_a_b_dimensions[3] = column size of B
		if(matrices_a_b_dimensions[1] != matrices_a_b_dimensions[2]){
			if(rank == 0) fprintf(stderr, "A's column size (%d) must match B's row size (%d)\n", 
					matrices_a_b_dimensions[1], matrices_a_b_dimensions[2]);
			MPI_Abort(MPI_COMM_WORLD, -1);
		}

		// this implementation is limited to cases where the matrices can be partitioned perfectly
		if( matrices_a_b_dimensions[0] % sqrt_size != 0 
				|| matrices_a_b_dimensions[1] % sqrt_size != 0 
				|| matrices_a_b_dimensions[2] % sqrt_size != 0 
				|| matrices_a_b_dimensions[3] % sqrt_size != 0 ){
			if(rank == 0) fprintf(stderr, "cannot distribute work evenly among processes\n"
					"all dimensions (A: r:%d c:%d; B: r:%d c:%d) need to be divisible by %d\n",
					matrices_a_b_dimensions[0],matrices_a_b_dimensions[1],
					matrices_a_b_dimensions[2],matrices_a_b_dimensions[3], sqrt_size );
			MPI_Abort(MPI_COMM_WORLD, -1);
		}
	}

	// send dimensions to all peers
	if(rank == 0) {
		int i;
		for(i = 1; i < size; i++){
			MPI_Send(matrices_a_b_dimensions, 4, MPI_INT, i, 0, cartesian_grid_communicator);
		}
	} else {
		MPI_Recv(matrices_a_b_dimensions, 4, MPI_INT, 0, 0, cartesian_grid_communicator, &status);
	}

	A_rows = matrices_a_b_dimensions[0];
	A_columns = matrices_a_b_dimensions[1];
	B_rows = matrices_a_b_dimensions[2];
	B_columns = matrices_a_b_dimensions[3];

	// local metadata for A
	A_local_block_rows = A_rows / sqrt_size;
	A_local_block_columns = A_columns / sqrt_size;
	A_local_block_size = A_local_block_rows * A_local_block_columns;
	A_local_block = (double *) malloc (A_local_block_size * sizeof(double));

	// local metadata for B
	B_local_block_rows = B_rows / sqrt_size;
	B_local_block_columns = B_columns / sqrt_size;
	B_local_block_size = B_local_block_rows * B_local_block_columns;
	B_local_block = (double *) malloc (B_local_block_size * sizeof(double));

	// local metadata for C
	C_local_block = (double *) malloc (A_local_block_rows * B_local_block_columns * sizeof(double));
	// C needs to be initialized at 0 (accumulates partial dot-products)
	int i;
	for(i=0; i < A_local_block_rows * B_local_block_columns; i++){
		C_local_block[i] = 0;
	}

	// full arrays only needed at root
	if(rank == 0){
		A_array = (double *) malloc(sizeof(double) * A_rows * A_columns);
		B_array = (double *) malloc(sizeof(double) * B_rows * B_columns);
		C_array = (double *) malloc(sizeof(double) * A_rows * B_columns);
		// generate the 1D arrays of the matrices at root
		int row, column, i, j;
		for (i = 0; i < sqrt_size; i++){
			for (j = 0; j < sqrt_size; j++){
				for (row = 0; row < A_local_block_rows; row++){
					for (column = 0; column < A_local_block_columns; column++){
						A_array[((i * sqrt_size + j) * A_local_block_size) + (row * A_local_block_columns) + column] 
							= A[i * A_local_block_rows + row][j * A_local_block_columns + column];
					}
				}
				for (row = 0; row < B_local_block_rows; row++){
					for (column = 0; column < B_local_block_columns; column++){
						B_array[((i * sqrt_size + j) * B_local_block_size) + (row * B_local_block_columns) + column] 
							= B[i * B_local_block_rows + row][j * B_local_block_columns + column];
					}
				}
			}
		}
		// allocate output matrix C
		C = (double **) malloc(A_rows * sizeof(double *));
		for(i=0; i<A_rows ;i++){
			C[i] = (double *) malloc(B_columns * sizeof(double));
		}
	} 
	
	// send a block to each process
	if(rank == 0) {
		int i;
		for(i = 1; i < size; i++){
			MPI_Send((A_array + (i * A_local_block_size)), A_local_block_size, MPI_DOUBLE, i, 0, cartesian_grid_communicator);
			MPI_Send((B_array + (i * B_local_block_size)), B_local_block_size, MPI_DOUBLE, i, 0, cartesian_grid_communicator);
		}
		for(i = 0; i < A_local_block_size; i++){
			A_local_block[i] = A_array[i];
		}
		for(i = 0; i < B_local_block_size; i++){
			B_local_block[i] = B_array[i];
		}
	} else {
		MPI_Recv(A_local_block, A_local_block_size, MPI_DOUBLE, 0, 0, cartesian_grid_communicator, &status);
		MPI_Recv(B_local_block, B_local_block_size, MPI_DOUBLE, 0, 0, cartesian_grid_communicator, &status);
	}


	//for(int r=0;r<size;r++){
	//	if(rank==15){
	//	int i;
	//	for(i = 0; i < A_local_block_rows*B_local_block_columns;i++) {
	//
//			        printf ("%7.3f ", A_local_block[i]);

//			}
//			}
	//}
//	MPI_Barrier(cartesian_grid_communicator);
	// cannon's algorithm
	int cannon_block_cycle;
	double compute_time = 0, mpi_time = 0, start;
	int C_index, A_row, A_column, B_column;
	for(cannon_block_cycle = 0; cannon_block_cycle < sqrt_size; cannon_block_cycle++){
		// Asynchronous sends
		start = MPI_Wtime();
		MPI_Isend(A_local_block, A_local_block_size, MPI_DOUBLE, 
				(coordinates[1] + sqrt_size - 1) % sqrt_size, 0, 
				 row_communicator, &request1);
		MPI_Isend(B_local_block, B_local_block_size, MPI_DOUBLE, 
				(coordinates[0] + sqrt_size - 1) % sqrt_size, 0, 
				 column_communicator, &request2);
		mpi_time += MPI_Wtime() - start;


		// compute partial result for this block cycle
		start = MPI_Wtime();
		for(C_index = 0, A_row = 0; A_row < A_local_block_rows; A_row++){
		//MPI_Probe((coordinates[1] + 1) % sqrt_size,0,row_communicator,&status );
		//MPI_Probe((coordinates[0] + 1) % sqrt_size,0,column_communicator,&status );
			for(B_column = 0; B_column < B_local_block_columns; B_column++, C_index++){
				for(A_column = 0; A_column < A_local_block_columns; A_column++){
					C_local_block[C_index] += A_local_block[A_row * A_local_block_columns + A_column] *
						B_local_block[A_column * B_local_block_columns + B_column];
				}
			}
		}
		compute_time += MPI_Wtime() - start;


		start = MPI_Wtime();
		
		MPI_Wait(&request1,&status);
		MPI_Wait(&request2,&status);
		
		MPI_Recv(A_local_block, A_local_block_size, MPI_DOUBLE, 
				(coordinates[1] + 1) % sqrt_size, 0, row_communicator, &status);
		MPI_Recv(B_local_block, B_local_block_size, MPI_DOUBLE, 
				(coordinates[0] + 1) % sqrt_size, 0, column_communicator, &status);


		mpi_time += MPI_Wtime() - start;
	}

	// get C parts from other processes at rank 0
	if(rank == 0) {
		for(i = 0; i < A_local_block_rows * B_local_block_columns; i++){
			C_array[i] = C_local_block[i];
		}
		int i;
		for(i = 1; i < size; i++){
			MPI_Recv(C_array + (i * A_local_block_rows * B_local_block_columns), A_local_block_rows * B_local_block_columns, 
				MPI_DOUBLE, i, 0, cartesian_grid_communicator, &status);
		}
	} else {
		MPI_Send(C_local_block, A_local_block_rows * B_local_block_columns, MPI_DOUBLE, 0, 0, cartesian_grid_communicator);
	}

	// generating output at rank 0
	if (rank == 0) {
		// convert the 1D array into the actual C matrix 
		int i, j, k, row, column;
		for (i = 0; i < sqrt_size; i++){  // block row index
			for (j = 0; j < sqrt_size; j++){ // block column index
				for (row = 0; row < A_local_block_rows; row++){
					for (column = 0; column < B_local_block_columns; column++){
						C[i * A_local_block_rows + row] [j * B_local_block_columns + column] = 
							C_array[((i * sqrt_size + j) * A_local_block_rows * B_local_block_columns) 
							+ (row * B_local_block_columns) + column];
					}
				}
			}
		}

		printf("(%d,%d)x(%d,%d)=(%d,%d)\n", A_rows, A_columns, B_rows, B_columns, A_rows, B_columns);
		printf("Computation time: %lf\n", compute_time);
		printf("MPI time:         %lf\n", mpi_time);

		if (argc == 4){
			// present results on the screen
			printf("\nA( %d x %d ):\n", A_rows, A_columns);
			for(row = 0; row < A_rows; row++) {
				for(column = 0; column < A_columns; column++)
					printf ("%7.3f ", A[row][column]);
				printf ("\n");
			}
			printf("\nB( %d x %d ):\n", B_rows, B_columns);
			for(row = 0; row < B_rows; row++){
				for(column = 0; column < B_columns; column++)
					printf("%7.3f ", B[row][column]);
				printf("\n");
			}
			printf("\nC( %d x %d ) = AxB:\n", A_rows, B_columns);
			for(row = 0; row < A_rows; row++){
				for(column = 0; column < B_columns; column++)
					printf("%7.3f ",C[row][column]);
				printf("\n");
			}


			printf("\nPerforming serial consistency check. Be patient...\n");
			fflush(stdout);
			int pass = 1;
			double temp;
			for(i=0; i<A_rows; i++){
				for(j=0; j<B_columns; j++){
					temp = 0;
					for(k=0; k<B_rows; k++){
						temp += A[i][k] * B[k][j];
					}
					printf("%7.3f ", temp);
					if(temp != C[i][j]){
						pass = 0;
					}
				}
				printf("\n");
			}
			if (pass) printf("Consistency check: PASS\n");
			else printf("Consistency check: FAIL\n");
		}	
	}

	// free all memory
	if(rank == 0){
		int i;
		for(i = 0; i < A_rows; i++){
			free(A[i]);
		}
		for(i = 0; i < B_rows; i++){
			free(B[i]);
		}
		for(i = 0; i < A_rows; i++){
			free(C[i]);
		}
		free(A);
		free(B);
		free(C);
		free(A_array);
		free(B_array);
		free(C_array);
	}
	free(A_local_block);
	free(B_local_block);
	free(C_local_block);

	// finalize MPI
	MPI_Finalize();
}
Example #26
File: summa.c  Project: duygukan/mpi101
void SUMMA(MPI_Comm comm_cart, const int mb, const int nb, const int kb, double *A_loc, double *B_loc, double *C_loc) {

    // determine my cart coords
    int coords[2];
    MPI_Cart_coords(comm_cart, myrank, 2, coords);

    int my_col = coords[0];
    int my_row = coords[1];

    MPI_Comm row_comm;
    MPI_Comm col_comm;
    int remain_dims[2];
    
    // create row comms for A
    remain_dims[0] = 1; 
    remain_dims[1] = 0;
    MPI_Cart_sub(comm_cart, remain_dims, &row_comm);

    // create col comms for B
    remain_dims[0] = 0; 
    remain_dims[1] = 1;
    MPI_Cart_sub(comm_cart, remain_dims, &col_comm);

    double *A_loc_save = (double *) calloc(mb*nb, sizeof(double));
    double *B_loc_save = (double *) calloc(nb*kb, sizeof(double));
    double *C_loc_tmp = (double *) calloc(mb*kb, sizeof(double));

    // each proc should save its own A_loc, B_loc
    memcpy(A_loc_save, A_loc, mb*nb*sizeof(double));
    memcpy(B_loc_save, B_loc, nb*kb*sizeof(double));

    // C_loc = 0.0
    memset(C_loc, 0, mb*kb*sizeof(double));


    int nblks = n / nb;
    // ======== YOUR CODE HERE ============================
    // Implement main SUMMA loop here: 
    // root column (or row) should loop though nblks columns (rows).
    //
    // If a processor's column coordinate equals the root, it should broadcast
    // its local portion of A within its `row_comm` communicator.
    //
    // If a processor's row coordinate equals the root, it should broadcast
    // its local portion of B within its `col_comm` communicator.
    //
    // After broadcasting, call multiply_naive to multiply the local portions
    // which each processor has received from the others
    // and store the result in the partial sum `C_loc_tmp`.
    //
    // Finally, accumulate the partial sums of `C_loc_tmp` into `C_loc` on each iteration
    // using `plus_matrix` function.
    //
    // Tip: MPI_Bcast function uses same pointer to buffer on all processors,
    // but initially on root processor it contains necessary data, and receivers will
    // get data during MPI_Bcast. Be sure not to overwrite each proc's local matrix
    // during these operations. This is why we saved local parts in 
    // `A_loc_save` and `B_loc_save` in advance.
    //
    //
    // Sample solution:
    // for (int bcast_root = 0; bcast_root < nblks; ++bcast_root) {
    //
    //    int root_col = bcast_root;
    //    int root_row = bcast_root;
    //
    //    // owner of A_loc[root_col,:] will broadcast its block within row comm
    //    if (my_col == root_col) {
    //        // copy A_loc_save to A_loc
    //    }
    //    // broadcast A_loc from root_col within row_comm
    //
    //    // owner of B_loc[:,root_row] will broadcast its block within col comm
    //    if (my_row == root_row) {
    //        // copy B_loc_save to B_loc
    //    }
    //    // broadcast B_loc from root_row within col_comm
    //
    //    // multiply local blocks A_loc, B_loc using matmul_naive
    //    // and store in C_loc_tmp
    //
    //    // C_loc = C_loc + C_loc_tmp using plus_matrix
    //}
    // ====================================================
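    // --- A possible filling-in of the loop described above (a sketch, not part
    // --- of the original file). It follows the sample solution in the comments
    // --- and assumes helper routines named there, e.g.
    // ---   void matmul_naive(int m, int n, int k, double *A, double *B, double *C);
    // ---   void plus_matrix(int m, int k, double *C, double *C_tmp);
    // --- whose exact signatures are an assumption.
    for (int bcast_root = 0; bcast_root < nblks; ++bcast_root) {
        // the owner of the current block restores its saved copy before broadcasting
        if (my_col == bcast_root)
            memcpy(A_loc, A_loc_save, mb*nb*sizeof(double));
        MPI_Bcast(A_loc, mb*nb, MPI_DOUBLE, bcast_root, row_comm);

        if (my_row == bcast_root)
            memcpy(B_loc, B_loc_save, nb*kb*sizeof(double));
        MPI_Bcast(B_loc, nb*kb, MPI_DOUBLE, bcast_root, col_comm);

        // multiply the received blocks and accumulate the partial product into C_loc
        matmul_naive(mb, nb, kb, A_loc, B_loc, C_loc_tmp);
        plus_matrix(mb, kb, C_loc, C_loc_tmp);
    }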


    free(A_loc_save);
    free(B_loc_save);
    free(C_loc_tmp);
}
Example #27
File: cart.c  Project: Shurakai/SimGrid
int main( int argc, char **argv )
{
    int              rank, size, i;
    int              errors=0;
    int              dims[NUM_DIMS];
    int              periods[NUM_DIMS];
    int              coords[NUM_DIMS];
    int              new_coords[NUM_DIMS];
    int              reorder = 1;
    MPI_Comm         comm_temp, comm_cart, new_comm;
    int              topo_status;
    int              ndims;
    int              new_rank;
    int              remain_dims[NUM_DIMS];
    int              newnewrank;

    MPI_Init( &argc, &argv );

    MPI_Comm_rank( MPI_COMM_WORLD, &rank );
    MPI_Comm_size( MPI_COMM_WORLD, &size );

    /* Clear dims array and get dims for topology */
    for(i=0;i<NUM_DIMS;i++) { dims[i] = 0; periods[i] = 0; }
    MPI_Dims_create ( size, NUM_DIMS, dims );

    /* Make a new communicator with a topology */
    MPI_Cart_create ( MPI_COMM_WORLD, 2, dims, periods, reorder, &comm_temp );
    MPI_Comm_dup ( comm_temp, &comm_cart );

    /* Determine the status of the new communicator */
    MPI_Topo_test ( comm_cart, &topo_status );
    if (topo_status != MPI_CART) {
	printf( "topo_status of duped comm is not MPI_CART\n" );
	errors++;
    }

    /* How many dims do we have? */
    MPI_Cartdim_get( comm_cart, &ndims );
    if ( ndims != NUM_DIMS ) {
	printf( "Number of dims of duped comm (%d) should be %d\n", 
		ndims, NUM_DIMS );
	errors++;
    }

    /* Get the topology, does it agree with what we put in? */
    for(i=0;i<NUM_DIMS;i++) { dims[i] = 0; periods[i] = 0; }
    MPI_Cart_get ( comm_cart, NUM_DIMS, dims, periods, coords );

    /* Does the mapping from coords to rank work? */
    MPI_Cart_rank ( comm_cart, coords, &new_rank );
    if ( new_rank != rank ) {
	printf( "New rank of duped comm (%d) != old rank (%d)\n", 
		new_rank, rank );
	errors++;
    }

    /* Does the mapping from rank to coords work */
    MPI_Cart_coords ( comm_cart, rank, NUM_DIMS, new_coords );
    for (i=0;i<NUM_DIMS;i++) 
	if ( coords[i] != new_coords[i] ) {
	    printf( "Old coords[%d] of duped comm (%d) != new_coords (%d)\n", 
		    i, coords[i], new_coords[i] );
	    errors++;
	}

    /* Let's shift in each dimension and see how it works!   */
    /* Because it's late and I'm tired, I'm not making this  */
    /* automatically test itself.                            */
    for (i=0;i<NUM_DIMS;i++) {
      int source, dest;
      MPI_Cart_shift(comm_cart, i, 1, &source, &dest);
#ifdef VERBOSE      
      printf ("[%d] Shifting %d in the %d dimension\n",rank,1,i);
      printf ("[%d]    source = %d  dest = %d\n",rank,source,dest); 
#endif
    }

    /* Subdivide */
    remain_dims[0] = 0; 
    for (i=1; i<NUM_DIMS; i++) remain_dims[i] = 1;
    MPI_Cart_sub ( comm_cart, remain_dims, &new_comm );

    /* Determine the status of the new communicator */
    MPI_Topo_test ( new_comm, &topo_status );
    if (topo_status != MPI_CART) {
	printf( "topo_status of cartsub comm is not MPI_CART\n" );
	errors++;
    }

    /* How many dims do we have? */
    MPI_Cartdim_get( new_comm, &ndims );
    if ( ndims != NUM_DIMS-1 ) {
	printf( "Number of dims of cartsub comm (%d) should be %d\n", 
		ndims, NUM_DIMS-1 );
	errors++;
    }

    /* Get the topology, does it agree with what we put in? */
    for(i=0;i<NUM_DIMS-1;i++) { dims[i] = 0; periods[i] = 0; }
    MPI_Cart_get ( new_comm, ndims, dims, periods, coords );
    
    /* Does the mapping from coords to rank work? */
    MPI_Comm_rank ( new_comm, &newnewrank );
    MPI_Cart_rank ( new_comm, coords, &new_rank );
    if ( new_rank != newnewrank ) {
	printf( "New rank of cartsub comm (%d) != old rank (%d)\n", 
		new_rank, newnewrank );
	errors++;
    }

    /* Does the mapping from rank to coords work */
    MPI_Cart_coords ( new_comm, new_rank, NUM_DIMS -1, new_coords );
    for (i=0;i<NUM_DIMS-1;i++) 
	if ( coords[i] != new_coords[i] ) {
	    printf( "Old coords[%d] of cartsub comm (%d) != new_coords (%d)\n", 
		    i, coords[i], new_coords[i] );
	    errors++;
	}

    /* We're at the end */
    MPI_Comm_free( &new_comm );
    MPI_Comm_free( &comm_temp );
    MPI_Comm_free( &comm_cart );
    Test_Waitforall( );
    if (errors) printf( "[%d] done with %d ERRORS!\n", rank,errors );
    MPI_Finalize();
    return 0;
}
Example #28
File: runcartrec.cpp  Project: C-CINA/2dx
int main(int argc, char *argv[])
{
  int ncpus, mypid, nrem, ierr;
  MPI_Status mpistatus;
  MPI_Comm comm = MPI_COMM_WORLD;
  
  // Variables needed for the Cartesian topology.
  int ROW = 0, COL = 1;
  int dims[2], periods[2], keep_dims[2];
  int my2dpid, mycoords[2], srcoords[2], otherpid;
  MPI_Comm comm_2d, comm_row, comm_col; 
		  
// Initialize MPI.
  MPI_Init(&argc, &argv);
  MPI_Comm_size(comm, &ncpus);
  MPI_Comm_rank(comm, &mypid);
  
  if ( argc < 3  ) {
	  printf ("ERROR: %s requires Cartesian dimensions input\n", argv[0]);
	  return -1;
  }
// Set up a Cartesian virtual topology and get the rank and coordinates of the processes in the topology. 
  dims[ROW] = atoi(argv[1]); // Row dimension of the topology
  dims[COL] = atoi(argv[2]); // Column dimension of the topology
  
  if (dims[ROW]*dims[COL] != ncpus){
	printf("ERROR: Row dim and col dim not equal to ncpus\n");
	return -1;
  }
  
  periods[ROW] = periods[COL] = 1; // Set the periods for wrap-around
  
  MPI_Cart_create(comm, 2, dims, periods, 1, &comm_2d);
  MPI_Comm_rank(comm_2d, &my2dpid); //Get my pid in the new 2D topology
  MPI_Cart_coords(comm_2d, my2dpid, 2, mycoords); // Get my coordinates
  
  /* Create the row-based sub-topology */ 
  keep_dims[ROW] = 0; 
  keep_dims[COL] = 1; 
  MPI_Cart_sub(comm_2d, keep_dims, &comm_row); 
 
  /* Create the column-based sub-topology */ 
  keep_dims[ROW] = 1; 
  keep_dims[COL] = 0; 
  MPI_Cart_sub(comm_2d, keep_dims, &comm_col); 

// STEP 1: Have processor (0,0) read in the entire set of 2D images, divide up the images, and send the corresponding images to the processors in processor group g_c_0. Do the same for the angles.
  
  if (mycoords[ROW] == 0 && mycoords[COL] == 0){ //I'm processor (0,0)
    FILE *fp, *fpa;
    char imagefname[80]="tf2d84.raw", anglesfname[80]="angles.dat";
	  
    fp = fopen(imagefname,"r");
    fread(&nangs, sizeof(int), 1, fp);
    fread(&nx, sizeof(int), 1, fp);
    fread(&ny, sizeof(int), 1, fp);
    
    images = new float[nx*ny*nangs];
    fread(images, sizeof(float), nx*ny*nangs, fp);
    fclose(fp);
    
    fpa = fopen(anglesfname,"r");
    angles = new float[3*nangs];
    for (int i = 0; i< 3*nangs; i++)
      fscanf(fpa, "%f",&angles[i]);
       
    fclose(fpa);
    printf("There are %d 2D images of size %d x %d\n", nangs, nx, ny);
  }
  
  // Broadcast variables nangs, nx, ny to all processors
  srcoords[ROW] = srcoords[COL] = 0;
  MPI_Cart_rank(comm_2d, srcoords, &otherpid); 
  
  MPI_Bcast (&nangs, 1, MPI_INT, otherpid, comm_2d);
  MPI_Bcast (&nx, 1, MPI_INT, otherpid, comm_2d);
  MPI_Bcast (&ny, 1, MPI_INT, otherpid, comm_2d);
  
  // Send images and angles from Processor (0,0) to processors in group g_c_0
  int *psize = new int[dims[ROW]];
  int *nbase = new int[dims[ROW]];
  
  nangsloc = setpart_gc1(comm_2d, nangs, psize, nbase);
  imagesloc = new float[psize[mycoords[ROW]]*nx*ny];
  reprojloc = new float[psize[mycoords[ROW]]*nx*ny];
  anglesloc = new float[psize[mycoords[ROW]]*3];
  
// printf("My coords are (%d,%d) and nangsloc = %d\n", mycoords[ROW], mycoords[COL], nangsloc);
  
  if (mycoords[COL] == 0 && mycoords[ROW] == 0) { //I'm Proc. (0,0)
    for(int ip = 0; ip < dims[ROW]; ++ip){
      int begidx = nbase[ip]*nx*ny;
      if (ip !=0){ // Proc (0,0) sends images and angle data to other processors
	 srcoords[COL] = 0;
	 srcoords[ROW] = ip;
	 MPI_Cart_rank(comm_2d, srcoords, &otherpid);
	 MPI_Send(&images[begidx],psize[ip]*nx*ny, MPI_FLOAT, otherpid, otherpid, comm_2d);
	 MPI_Send(&angles[nbase[ip]*3],psize[ip]*3, MPI_FLOAT, otherpid, otherpid, comm_2d);
      }
      else{ // ip = 0: Proc (0,0) needs to copy images and angles into its imagesloc and anglesloc
	for (int i = 0; i < psize[ip]*nx*ny; i++){
	  imagesloc[i] = images[begidx+i];
	}
	for (int i = 0; i < psize[ip]*3; i++){
		anglesloc[i] = angles[nbase[ip]*3 + i];
	}
	//printf("Finished copying to Proc (0,0) local");
      }
    } //End for loop
  } //End if
  
  if (mycoords[COL] == 0 && mycoords[ROW] != 0) { //I'm in g_c_0 and I'm not Processor (0,0) so I should receive data.
    MPI_Recv(imagesloc, psize[mycoords[ROW]]*nx*ny, MPI_FLOAT, 0, mypid, comm_2d, &mpistatus);
    MPI_Recv(anglesloc, psize[mycoords[ROW]]*3, MPI_FLOAT, 0, mypid, comm_2d, &mpistatus);
  }
  // Now have all the processors in group g_c_0 broadcast the images and angles along the row communicator
  srcoords[ROW] = 0;
  MPI_Cart_rank(comm_row, srcoords, &otherpid);
  MPI_Bcast(imagesloc, nangsloc*nx*ny, MPI_FLOAT, otherpid , comm_row);
  MPI_Bcast(anglesloc, nangsloc*3, MPI_FLOAT, otherpid , comm_row);
 
// Now distribute the volume (in spherical format) among columns of processors and use nnz to determine the splitting.  Note: ptrs and coord are on all processors
  int radius;
  int volsize[3], origin[3];
  volsize[0] = nx;
  volsize[1] = nx;
  volsize[2] = nx;
  origin[0] = nx/2+1;
  origin[1] = nx/2+1;
  origin[2] = nx/2+1;
  radius = nx/2-1;
   
  ierr = getnnz( volsize, radius, origin, &nrays, &nnz);
   
  int * ptrs = new int[nrays+1];
  int * cord = new int[3*nrays];
  ierr = getcb2sph(volsize, radius, origin, nnz, ptrs, cord);
		  
  int *nnzpart = new int[dims[COL]];
  int *nnzbase = new int[dims[COL]+1]; 
  nnzloc = setpart_gr1(comm_2d, nnz, nnzpart, nnzbase);
  
  int *ptrstart = new int[dims[COL]+1];
  nraysloc = sphpart(comm_2d, nrays, ptrs, nnzbase, ptrstart);
  
  myptrstart = ptrstart[mycoords[COL]];
  int nnzall[dims[COL]];
  for (int i = 0; i<dims[COL]; i++)
    nnzall[i] = ptrs[ptrstart[i+1]] - ptrs[ptrstart[i]];
  
  nnzloc = nnzall[mycoords[COL]];
  
  // Print some stuff.
 printf("My coords are (%d,%d) and nangsloc = %d, nraysloc = %d, myptrstart = %d, nnzloc = %d\n", mycoords[ROW], mycoords[COL], nangsloc, nraysloc, myptrstart, nnzloc);
  
  float *bvol_loc = new float[nnzloc];
  float *vol_sphloc = new float[nnzloc];
  for (int i=0; i< nnzloc; i++)
    bvol_loc[i] = 0.0;
  
  // STEP 2: Have everyone perform the backprojection operation for their assigned images and portions of the volume.  Then perform an Allreduce along the columns.
  
  float phi, theta, psi;
  float dm[8];
  
  for (int i=0; i<nangsloc; i++){
    phi = anglesloc[3*i+0];
    theta = anglesloc[3*i+1];
    psi = anglesloc[3*i+2];
    dm[6] = 0;
    dm[7] = 0;
 
    make_proj_mat(phi, theta, psi, dm);

    ierr = bckpj3_Cart(volsize, nraysloc, nnzloc, dm, origin, radius, ptrs, cord, myptrstart, &imagesloc[nx*ny*i], bvol_loc);
  }
  
  // Now an all reduce along the columns
  MPI_Allreduce (bvol_loc, vol_sphloc, nnzloc, MPI_FLOAT, MPI_SUM, comm_col);
  
  // For testing purposes, we bring all the portions of the volume back together onto Proc (0,0). Note: we only need to deal with the first row of processors.
  if (mycoords[COL] != 0 && mycoords[ROW] == 0) {
	//Send data to Processor (0,0)
	srcoords[COL] = srcoords[ROW] = 0;
	MPI_Cart_rank(comm_2d, srcoords, &otherpid);
	MPI_Send(vol_sphloc, nnzloc, MPI_FLOAT, otherpid, otherpid, comm_2d);
  }
  float *onevol_sph = new float[nnz];
  if (mycoords[COL] == 0 && mycoords[ROW] ==0){
	  //Copy data and receive data
	float *vol_sph = new float[nnz];
	for (int i=0; i<nnzloc; i++)
	  vol_sph[i] = vol_sphloc[i];
	
	for (int i=1; i<dims[COL]; i++){
	  srcoords[ROW] = 0;
	  srcoords[COL] = i;
	  MPI_Cart_rank(comm_2d, srcoords, &otherpid);
	 	
	  MPI_Recv(&vol_sph[ptrs[ptrstart[i]]-1], nnzall[i], MPI_FLOAT, otherpid, mypid, comm_2d, &mpistatus);
	}
	//printf("Finished combining all volume parts\n");
  
  //Now compute the back projection serially on one processor (0,0)
    for (int i=0; i< nnz; i++)
      onevol_sph[i] = 0.0;
    
    for (int i=0; i<nangs; i++){
    	    phi = angles[3*i+0];
	    theta = angles[3*i+1];
	    psi = angles[3*i+2];
	    dm[6] = 0;
	    dm[7] = 0;
 
	    make_proj_mat(phi, theta, psi, dm);
  
	    ierr = bckpj3(volsize, nrays, nnz, dm, origin, radius, ptrs, cord, &images[nx*ny*i], onevol_sph);
    }
    
    float err=0;
    for (int i=0; i< nnz; i++){
	    err = err+(onevol_sph[i]-vol_sph[i])*(onevol_sph[i]-vol_sph[i]);
    }
    err = sqrt(err);
    printf("Cumulative error for backprojection is %f\n", err);
    delete [] vol_sph;
  }
  
    // STEP 3: Now perform a forward projection operation for the assigned images and portions of the volume.  Then perform an all_reduce along the rows.
  float * newimagesloc = new float[nangsloc*nx*ny];
  for (int i=0; i<nangsloc*nx*ny; i++)
	  newimagesloc[i] = 0.0;
  	
  for (int i=0; i<nangsloc; i++){
	  phi = anglesloc[3*i+0];
	  theta = anglesloc[3*i+1];
	  psi = anglesloc[3*i+2];
	  dm[6] = 0;
	  dm[7] = 0;
    
	  make_proj_mat(phi, theta, psi, dm);
  
	 ierr = fwdpj3_Cart(volsize, nraysloc, nnzloc, dm, origin, radius, ptrs, cord, myptrstart, vol_sphloc, &newimagesloc[nx*ny*i]);
  }

  // Now an all reduce along the rows
  MPI_Allreduce (newimagesloc, reprojloc, nangsloc*nx*ny, MPI_FLOAT, MPI_SUM, comm_row);
 
  delete [] newimagesloc;
  // For testing purposes, we bring all the 2D images together onto Proc (0,0). Note: we only need to deal with the first column of processors.
  if (mycoords[ROW] != 0 && mycoords[COL] == 0) {
	//Send data to Processor (0,0)
	  srcoords[COL] = srcoords[ROW] = 0;
	  MPI_Cart_rank(comm_2d, srcoords, &otherpid);
	  MPI_Send(reprojloc, nangsloc*nx*ny, MPI_FLOAT, otherpid, otherpid, comm_2d);
  }
  if (mycoords[COL] == 0 && mycoords[ROW] ==0){
	  //Copy data and receive data
	  float *reproj = new float[nangs*nx*ny];
	  for (int i=0; i<nangsloc*nx*ny; i++)
		  reproj[i] = reprojloc[i];
	
	  for (int i=1; i<dims[ROW]; i++){
		  srcoords[COL] = 0;
		  srcoords[ROW] = i;
		  MPI_Cart_rank(comm_2d, srcoords, &otherpid);
	 	
		  MPI_Recv(&reproj[nbase[i]*nx*ny], psize[i]*nx*ny, MPI_FLOAT, otherpid, mypid, comm_2d, &mpistatus);
	  }
  delete [] reprojloc;
  // Now compute the forward projection serially on one processor (0,0)

	  float *allimages = new float[nangs*nx*ny];
	  for (int i=0; i< nangs*nx*ny; i++)
		  allimages[i] = 0.0;
    
	  for (int i=0; i<nangs; i++){
	    phi = angles[3*i+0];
	    theta = angles[3*i+1];
	    psi = angles[3*i+2];
	    dm[6] = 0;
	    dm[7] = 0;
    
	    make_proj_mat(phi, theta, psi, dm);
		  
	    ierr = fwdpj3(volsize, nrays, nnz, dm, origin, radius, ptrs, cord, onevol_sph, &allimages[nx*ny*i]);

	  }

	  //  Now compute the overall error.
	  int idx;
	  float err=0, max=0;
	  for (int i=0; i< nangs*nx*ny; i++){
		//if (allimages[i]!=reproj[i] && i < 256)
		//	printf("i= %d\n",i);

		err = err+(allimages[i]-reproj[i])*(allimages[i]-reproj[i]);

		if (fabs(allimages[i]-reproj[i]) > max){
			max = fabs(allimages[i]-reproj[i]);
			idx = i;
		}
	  }
	  err = sqrt(err);
	  printf("Cumulative error for forward projection is %f with max error of %f occurring at %d\n", err, max, idx);
	  printf("Max error: compare %f and %f\n", allimages[idx], reproj[idx]);

	  delete [] reproj;
	  delete [] allimages;
	  delete [] angles;
	  delete [] images;
  }
  delete [] onevol_sph;
  delete [] vol_sphloc;
  delete [] bvol_loc;  
  delete [] ptrs;
  delete [] cord;
  delete [] nnzpart;
  delete [] nnzbase;
  delete [] ptrstart;
  delete [] anglesloc;
  delete [] imagesloc;
  delete [] nbase;
  delete [] psize;	  
  
  MPI_Comm_free(&comm_2d);
  MPI_Comm_free(&comm_row);
  MPI_Comm_free(&comm_col);
  
  MPI_Finalize();
}
示例#29
0
void MatrixVectorMultiply2D(int n, double *a, double *x, double *y, MPI_Comm comm_2d) 
{ 
  int ROW=0, COL=1; /* Improve readability for indices */ 
  int i, j, nlocal; 
  double *py; /* Will store partial dot products for y */
  /* Variables are as follows:
        - npes = # of processing elements
        - dims = # of processes along each dimension of the process grid
        - keep_dims = used to filter out dimensions when creating sub-topologies
  */
  int npes, dims[2], periods[2], keep_dims[2], keep_dims2[2]; 
  /* Other variables are used to create sub-topologies and refer to individual 
	 processing elements */
  int myrank, mycoords[2], mycolrank, myrowrank; 
  int source_rank, dest_rank, root_rank, col_rank, coords[2], coord[1];
  MPI_Status status; 
  MPI_Comm comm_row, comm_col; 

  /* Get information about the communicator */ 
  MPI_Comm_size(comm_2d, &npes); 
  MPI_Comm_rank(comm_2d, &myrank); 

  /* Compute the size of the square grid. If a square grid is not used,
	 change the values here */ 
  dims[ROW] = dims[COL] = sqrt(npes); 

  nlocal = n/dims[ROW]; 

  /* Allocate memory for the array that will hold the partial dot-products */ 
  py = malloc(nlocal*sizeof(double)); 

  MPI_Cart_coords(comm_2d, myrank, 2, mycoords); /* Get my coordinates */ 
 
 /*****************************************************/
 /* Create the row-based sub-topology */ 
  keep_dims[ROW] = 0; 
  keep_dims[COL] = 1; 
  MPI_Cart_sub(comm_2d, keep_dims, &comm_row); 
  
  MPI_Comm_rank(comm_row, &myrowrank);
  
  /* Create the column-based sub-topology */ 
  keep_dims2[ROW] = 1;
  keep_dims2[COL] = 0;
  MPI_Cart_sub(comm_2d, keep_dims2, &comm_col);

  MPI_Comm_rank(comm_col, &mycolrank);
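  /* Note on MPI_Cart_sub semantics: a 1 in keep_dims keeps that dimension in the
     sub-topology.  Keeping COL groups the processes of one grid row into comm_row,
     and keeping ROW groups the processes of one grid column into comm_col.  Ranks in
     these sub-communicators follow the remaining Cartesian coordinate, so myrowrank
     equals mycoords[COL] and mycolrank equals mycoords[ROW]. */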
  
  /****************************************/
  /* Redistribute the x vector. */ 
  /* Step 1. The processors along the rightmost column send their data to the diagonal processors */ 
  /* If I'm in the rightmost column but not the last row, send my block
     of the vector to the diagonal processor in my row */ 
  /*****************************************************/
  
  /* printf("STEP1: I am processor %d at position (%d, %d)", */
  /* 	 myrank, mycoords[ROW], mycoords[COL]); */
  // determine if in right column
  if (mycoords[COL] == dims[COL]-1){
      // if in right column, then check for !in_final_row
      if (mycoords[ROW] != dims[ROW] - 1){
	  // if not, then send the info to the element located in the
	  //     2d cartesian topology at location (i,j) where i == j.
	  //     Also, i will be equal to mycoords[ROW]
	  // get rank of dest
	  coords[ROW] = coords[COL] = mycoords[ROW];
	  MPI_Cart_rank(comm_2d, coords, &dest_rank);
	  // send to rank
	  MPI_Send(x, nlocal, MPI_DOUBLE, dest_rank, 0, comm_2d);
      }
  }

  /*****************************************************/
  /* If I'm on the diagonal but not in the last row, receive the block
     of the vector from the processor in the rightmost column of my row */

  /* printf("STEP1b: I am processor %d in  at position (%d, %d)", */
  /* 	 myrank, mycoords[ROW], mycoords[COL]); */
  
  if (mycoords[ROW] == mycoords[COL] && mycoords[ROW] != dims[ROW]-1){
      // determine source_rank
      coords[ROW] = mycoords[ROW];
      coords[COL] = dims[COL]-1;
      MPI_Cart_rank(comm_2d, coords, &source_rank);
      // receive data from source
      MPI_Recv(x, nlocal, MPI_DOUBLE, source_rank, 0, comm_2d, &status);
  }

  /*****************************************************/ 
  /* Step 2. Perform a column-wise broadcast with the diagonal process 
             as the root  */ 
  /*******************************************************/
  /* printf("STEP2: I am processor %d at position (%d, %d)", */
  /* 	 myrank, mycoords[ROW], mycoords[COL]); */

  // if diagonal element, just broadcast
  if (mycoords[ROW] == mycoords[COL]){
      MPI_Bcast(x, nlocal, MPI_DOUBLE, mycolrank, comm_col);
  }
  else { 
      // get rank (within comm_col) of this column's diagonal element
      coord[0] = mycoords[COL];
      MPI_Cart_rank(comm_col, coord, &col_rank);
      MPI_Bcast(x, nlocal, MPI_DOUBLE, col_rank, comm_col);
  }
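  /* Every process in comm_col must call MPI_Bcast with the same root: the diagonal
     process of the column.  For the diagonal process itself that root is simply its
     own rank (mycolrank); the others derive it from mycoords[COL], since the diagonal
     element of column j sits in row j. */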
   
  /* Perform local matrix-vector multiply */ 
  for (i=0; i<nlocal; i++) { 
    py[i] = 0.0; 
    for (j=0; j<nlocal; j++) 
      py[i] += a[i*nlocal+j]*x[j]; 
  } 
  /*****************************************************/ 
  /* Step 3. Perform the sum-reduction along the rows to add up the partial 
     dot-products and leave the result in the rightmost column */ 
  /*****************************************************/ 
  /* printf("STEP3: I am processor %d in the right column at position (%d, %d)", */
  /* 	 myrank, mycoords[ROW], mycoords[COL]); */

  // check if this is the results column
  if (mycoords[COL] == dims[COL]-1){
      // receive results from reduce
      MPI_Reduce(py, y, nlocal, MPI_DOUBLE, MPI_SUM, myrowrank, comm_row);
  }
  else{ // pass results to the right
      // determine rank of right-most column processor
      coord[ROW] = dims[ROW]-1;
      MPI_Cart_rank(comm_row, coord, &root_rank);
      MPI_Reduce(py, y, nlocal, MPI_DOUBLE, MPI_SUM, root_rank, comm_row);
  }
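  /* Both branches name the same root for MPI_Reduce: the process of this row sitting
     in the rightmost grid column.  In comm_row a process's rank is its column
     coordinate, so that root is dims[COL]-1 (written as dims[ROW]-1 above, which is
     identical on a square grid). */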
  
  /* free local communicators */
  MPI_Comm_free(&comm_row); /* Free up communicator */ 
  MPI_Comm_free(&comm_col); /* Free up communicator */ 
 
  free(py); 
} 
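A minimal driver sketch for the routine above (not part of the original listing; the matrix size n, the square process grid, and the constant fill values are assumptions used only for illustration):

/* Driver sketch: build the 2-D Cartesian grid, hand each process its local block,
   and call MatrixVectorMultiply2D (defined in the listing above).
   Assumes the number of ranks is a perfect square and n is divisible by the grid side. */
#include <mpi.h>
#include <stdlib.h>
#include <math.h>

void MatrixVectorMultiply2D(int n, double *a, double *x, double *y, MPI_Comm comm_2d);

int main(int argc, char **argv)
{
  int npes, n = 512;                       /* n assumed divisible by sqrt(npes) */
  int dims[2], periods[2] = {0, 0};
  MPI_Comm comm_2d;

  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &npes);

  dims[0] = dims[1] = (int) sqrt((double) npes);         /* square grid assumed */
  MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 1, &comm_2d);

  int nlocal = n / dims[0];
  double *a = malloc(nlocal * nlocal * sizeof(double));  /* my block of the matrix */
  double *x = malloc(nlocal * sizeof(double));           /* local slice of x; only the
                                                            rightmost column's slice is
                                                            read before redistribution */
  double *y = malloc(nlocal * sizeof(double));           /* local slice of the result */
  for (int i = 0; i < nlocal * nlocal; i++) a[i] = 1.0;  /* placeholder fill values */
  for (int i = 0; i < nlocal; i++) x[i] = 1.0;

  MatrixVectorMultiply2D(n, a, x, y, comm_2d);           /* y ends up on the rightmost column */

  free(a); free(x); free(y);
  MPI_Comm_free(&comm_2d);
  MPI_Finalize();
  return 0;
}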
示例#30
0
int
main (int argc, char **argv)
{
  int nprocs = -1;
  int rank = -1;
  int i, j;
  int *granks;
  char processor_name[128];
  int namelen = 128;
  int buf[buf_size];
  MPI_Status status;
  MPI_Comm temp;
  MPI_Comm intercomm = MPI_COMM_NULL;
  MPI_Comm dcomms[DCOMM_CALL_COUNT];
  MPI_Group world_group, dgroup;
  int intersize, dnprocs[DCOMM_CALL_COUNT], drank[DCOMM_CALL_COUNT];
  int dims[TWOD], periods[TWOD], remain_dims[TWOD];
  int graph_index[] = { 2, 3, 4, 6 };
  int graph_edges[] = { 1, 3, 0, 3, 0, 2 };
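  /* graph_index[] holds the cumulative node degrees and graph_edges[] the concatenated
     adjacency lists, so together they describe a 4-node graph with edges (0,1), (0,3)
     and (2,3); they are only used if RUN_GRAPH_CREATE is defined below. */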

  /* init */
  MPI_Init (&argc, &argv);
  MPI_Comm_size (MPI_COMM_WORLD, &nprocs);
  MPI_Comm_rank (MPI_COMM_WORLD, &rank);
  MPI_Get_processor_name (processor_name, &namelen);
  printf ("(%d) is alive on %s\n", rank, processor_name);
  fflush (stdout);

  MPI_Barrier (MPI_COMM_WORLD);

  /* probably want number to be higher... */
  if (nprocs < 4) {
      printf ("not enough tasks\n");
  }
  else {
    if (DCOMM_CALL_COUNT > 0) {
#ifdef RUN_COMM_DUP
      /* create all of the derived communicators... */
      /* simplest is created by MPI_Comm_dup... */
      MPI_Comm_dup (MPI_COMM_WORLD, &dcomms[0]);
#else
      dcomms[0] = MPI_COMM_NULL;
#endif
    }

    if (DCOMM_CALL_COUNT > 1) {
#ifdef RUN_COMM_CREATE
      /* use subset of MPI_COMM_WORLD group for MPI_Comm_create... */
      MPI_Comm_group (MPI_COMM_WORLD, &world_group);
      granks = (int *) malloc (sizeof(int) * (nprocs/2));
      for (i = 0; i < nprocs/2; i++)
	granks [i] = 2 * i;
      MPI_Group_incl (world_group, nprocs/2, granks, &dgroup);
      MPI_Comm_create (MPI_COMM_WORLD, dgroup, &dcomms[1]);
      MPI_Group_free (&world_group);
      MPI_Group_free (&dgroup);
      free (granks);
#else
      dcomms[1] = MPI_COMM_NULL;
#endif
    }

    if (DCOMM_CALL_COUNT > 2) {
#ifdef RUN_COMM_SPLIT
      /* split into thirds with inverted ranks... */
      MPI_Comm_split (MPI_COMM_WORLD, rank % 3, nprocs - rank, &dcomms[2]);
#else
      dcomms[2] = MPI_COMM_NULL;
#endif
    }

#ifdef RUN_INTERCOMM_CREATE
    if ((DCOMM_CALL_COUNT < 2) || (dcomms[2] == MPI_COMM_NULL)) {
      MPI_Comm_split (MPI_COMM_WORLD, rank % 3, nprocs - rank, &temp);
    }
    else {
      temp = dcomms[2];
    }
    if (rank % 3) {
      MPI_Intercomm_create (temp, 0, MPI_COMM_WORLD,
			    (((nprocs % 3) == 2) && ((rank % 3) == 2)) ?
			    nprocs - 1 : nprocs - (rank % 3) - (nprocs % 3),
			    INTERCOMM_CREATE_TAG, &intercomm);
    }
    if ((DCOMM_CALL_COUNT < 2) || (dcomms[2] == MPI_COMM_NULL)) {
      MPI_Comm_free (&temp);
    }
#endif

    if (DCOMM_CALL_COUNT > 3) {
#ifdef RUN_CART_CREATE
      /* create a 2 X nprocs/2 torus topology, allow reordering */
      dims[0] = 2;
      dims[1] = nprocs/2;
      periods[0] = periods[1] = 1;
      MPI_Cart_create (MPI_COMM_WORLD, TWOD, dims, periods, 1, &dcomms[3]);
#else
      dcomms[3] = MPI_COMM_NULL;
#endif
    }

    if (DCOMM_CALL_COUNT > 4) {
#ifdef RUN_GRAPH_CREATE
      /* create the graph on p.268 MPI: The Complete Reference... */
      MPI_Graph_create (MPI_COMM_WORLD, GRAPH_SZ,
			graph_index, graph_edges, 1, &dcomms[4]);
#else
      dcomms[4] = MPI_COMM_NULL;
#endif
    }

    if (DCOMM_CALL_COUNT > 5) {
#ifdef RUN_CART_SUB
#ifndef RUN_CART_CREATE
      /* need to make cartesian communicator temporarily... */
      /* create a 2 X nprocs/2 torus topology, allow reordering */
      dims[0] = 2;
      dims[1] = nprocs/2;
      periods[0] = periods[1] = 1;
      MPI_Cart_create (MPI_COMM_WORLD, TWOD, dims, periods, 1, &dcomms[3]);
#endif
      if (dcomms[3] != MPI_COMM_NULL) {
	/* create 2 1 X nprocs/2 topologies... */
	remain_dims[0] = 0;
	remain_dims[1] = 1;
	MPI_Cart_sub (dcomms[3], remain_dims, &dcomms[5]);
#ifndef RUN_CART_CREATE
	/* free up temporarily created cartesian communicator... */
	MPI_Comm_free (&dcomms[3]);
#endif
      }
      else {
	dcomms[5] = MPI_COMM_NULL;
      }
#else
      dcomms[5] = MPI_COMM_NULL;
#endif
    }

    if (DCOMM_CALL_COUNT > 6) {
#ifdef RUN_INTERCOMM_MERGE
#ifndef RUN_INTERCOMM_CREATE
#ifndef RUN_COMM_SPLIT
      /* need to make split communicator temporarily... */
      /* split into thirds with inverted ranks... */
      MPI_Comm_split (MPI_COMM_WORLD, rank % 3, nprocs - rank, &dcomms[2]);
#endif
#endif
      /* create an intercommunicator and merge it... */
      if (rank % 3) {
#ifndef RUN_INTERCOMM_CREATE
	MPI_Intercomm_create (dcomms[2], 0, MPI_COMM_WORLD,
			      (((nprocs % 3) == 2) && ((rank % 3) == 2)) ?
			      nprocs - 1 : nprocs - (rank % 3) - (nprocs % 3),
			      INTERCOMM_CREATE_TAG, &intercomm);
#endif

	MPI_Intercomm_merge (intercomm, ((rank % 3) == 1), &dcomms[6]);

#ifndef RUN_INTERCOMM_CREATE
	/* we are done with intercomm... */
	MPI_Comm_free (&intercomm);
#endif
      }
      else {
	dcomms[6] = MPI_COMM_NULL;
      }
#ifndef RUN_INTERCOMM_CREATE
#ifndef RUN_COMM_SPLIT
      if (dcomms[2] != MPI_COMM_NULL)
	/* free up temporarily created split communicator... */
	MPI_Comm_free (&dcomms[2]);
#endif
#endif
#else
      dcomms[6] = MPI_COMM_NULL;
#endif
    }

    /* get all of the sizes and ranks... */
    for (i = 0; i < DCOMM_CALL_COUNT; i++) {
      if (dcomms[i] != MPI_COMM_NULL) {
	MPI_Comm_size (dcomms[i], &dnprocs[i]);
	MPI_Comm_rank (dcomms[i], &drank[i]);
      }
      else {
	dnprocs[i] = 0;
	drank[i] = -1;
      }
    }

#ifdef RUN_INTERCOMM_CREATE
    /* get the intercomm remote size... */
    if (rank % 3) {
      MPI_Comm_remote_size (intercomm, &intersize);
    }
#endif

    /* do some point to point on all of the dcomms... */
    for (i = 0; i < DCOMM_CALL_COUNT; i++) {
      if (dnprocs[i] > 1) {
	if (drank[i] == 0) {
	  for (j = 1; j < dnprocs[i]; j++) {
	    MPI_Recv (buf, buf_size, MPI_INT, j, 0, dcomms[i], &status);
	  }
	}
	else {
	  memset (buf, 1, buf_size*sizeof(int));

	  MPI_Send (buf, buf_size, MPI_INT, 0, 0, dcomms[i]);
	}
      }
    }

#ifdef RUN_INTERCOMM_CREATE
    /* do some point to point on the intercomm... */
    if ((rank % 3) == 1) {
      for (j = 0; j < intersize; j++) {
	MPI_Recv (buf, buf_size, MPI_INT, j, 0, intercomm, &status);
      }
    }
    else if ((rank % 3) == 2) {
      for (j = 0; j < intersize; j++) {
	memset (buf, 1, buf_size*sizeof(int));

	MPI_Send (buf, buf_size, MPI_INT, j, 0, intercomm);
      }
    }
#endif

    /* do a bcast on all of the dcomms... */
    for (i = 0; i < DCOMM_CALL_COUNT; i++) {
      /* IBM's implementation gets an error with communication over MPI_COMM_NULL... */
      if (dnprocs[i] > 0)
	MPI_Bcast (buf, buf_size, MPI_INT, 0, dcomms[i]);
    }

    /* use any source receives... */
    for (i = 0; i < DCOMM_CALL_COUNT; i++) {
      if (dnprocs[i] > 1) {
	if (drank[i] == 0) {
	  for (j = 1; j < dnprocs[i]; j++) {
	    MPI_Recv (buf, buf_size, MPI_INT,
		      MPI_ANY_SOURCE, 0, dcomms[i], &status);
	  }
	}
	else {
	  memset (buf, 1, buf_size*sizeof(int));

	  MPI_Send (buf, buf_size, MPI_INT, 0, 0, dcomms[i]);
	}
      }
    }

#ifdef RUN_INTERCOMM_CREATE
    /* do any source receives on the intercomm... */
    if ((rank % 3) == 1) {
      for (j = 0; j < intersize; j++) {
	MPI_Recv (buf, buf_size, MPI_INT,
		  MPI_ANY_SOURCE, 0, intercomm, &status);
      }
    }
    else if ((rank % 3) == 2) {
      for (j = 0; j < intersize; j++) {
	memset (buf, 1, buf_size*sizeof(int));

	MPI_Send (buf, buf_size, MPI_INT, j, 0, intercomm);
      }
    }
#endif

    /* do a barrier on all of the dcomms... */
    for (i = 0; i < DCOMM_CALL_COUNT; i++) {
      /* IBM's implementation gets an error with communication over MPI_COMM_NULL... */
      if (dnprocs[i] > 0)
	MPI_Barrier (dcomms[i]);
    }

    /* free all of the derived communicators... */
    for (i = 0; i < DCOMM_CALL_COUNT; i++) {
      /* freeing MPI_COMM_NULL is explicitly defined as erroneous... */
      if (dnprocs[i] > 0)
	MPI_Comm_free (&dcomms[i]);
    }

#ifdef RUN_INTERCOMM_CREATE
    if (rank % 3)
      /* we are done with intercomm... */
      MPI_Comm_free (&intercomm);
#endif
  }

  MPI_Barrier (MPI_COMM_WORLD);

  MPI_Finalize ();
  printf ("(%d) Finished normally\n", rank);
  return 0;
}