void create_grid(int myrank, int gd, MPI_Comm* comm_grid, MPI_Comm* comm_row, MPI_Comm* comm_col) {
    int dims[2] = {gd, gd};
    int coords[2];            /* coords[0] = i (row), coords[1] = j (column) */
    int periods[2];
    int reorder;
    int grid_rank;
    int subdivision[2];

    periods[0] = 0;
    periods[1] = 1;
    reorder = 1;
    MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, reorder, comm_grid);
    MPI_Cart_coords(*comm_grid, myrank, 2, coords);   /* outputs the (i, j) coordinates of the process */
    MPI_Cart_rank(*comm_grid, coords, &grid_rank);    /* outputs the rank of the process in the grid */

    /* keep dimension 0: processes that share a column */
    subdivision[0] = 1;
    subdivision[1] = 0;
    MPI_Cart_sub(*comm_grid, subdivision, comm_col);  /* column communicator */

    /* keep dimension 1: processes that share a row */
    subdivision[0] = 0;
    subdivision[1] = 1;
    MPI_Cart_sub(*comm_grid, subdivision, comm_row);  /* row communicator */
}
void grid_setup(struct grid_info *grid) {
    /* gather global data */
    MPI_Comm_size(MPI_COMM_WORLD, &(grid->nr_world_processes));
    MPI_Comm_rank(MPI_COMM_WORLD, &(grid->my_world_rank));

    /* compute how many processes per side the grid will have */
    grid->ppside = intsqrt(grid->nr_world_processes);

    /* create a communicator with a grid topology */
    int dimensions[2] = {grid->ppside, grid->ppside};
    int wrap_around[2] = {TRUE, TRUE};
    int reorder = TRUE;
    MPI_Cart_create(MPI_COMM_WORLD, 2, dimensions, wrap_around, reorder, &(grid->comm));
    MPI_Comm_rank(grid->comm, &(grid->my_rank));

    /* get this process's grid coordinates */
    int coordinates[2];
    MPI_Cart_coords(grid->comm, grid->my_rank, 2, coordinates);
    grid->my_row = coordinates[0];
    grid->my_col = coordinates[1];

    /* get communicators for this process's row and column */
    int free_coords_for_rows[] = {FALSE, TRUE};
    int free_coords_for_cols[] = {TRUE, FALSE};
    MPI_Cart_sub(grid->comm, free_coords_for_rows, &(grid->row_comm));
    MPI_Cart_sub(grid->comm, free_coords_for_cols, &(grid->col_comm));
}
void initialiseMonde(int argc, char** argv) {
    int remain[2], periods[2], reorder, nb_thread;

    MPI_Init(&argc, &argv);                  /* starts MPI */
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);    /* get current process id */
    MPI_Comm_size(MPI_COMM_WORLD, &size);    /* get number of processes */

    /* check that at least two command-line arguments were given */
    if (argc < 3) {
        if (rank == 0)
            printf("Not enough arguments.\n"
                   "Pass the number of threads as the first argument and the matrix size as the second.\n");
        MPI_Finalize();
        exit(1);
    }

    nb_thread = atoi(argv[1]);
    omp_set_num_threads(nb_thread);
    taille_matrice = atoi(argv[2]);

    racine = sqrt(size);
    if (racine * racine != size || taille_matrice % racine != 0) {
        if (rank == 0)
            printf("The number of MPI processes is not consistent with the matrix size\n");
        MPI_Finalize();
        exit(1);
    }
    taille_block = taille_matrice / racine;

    tab_dim[LIGNE] = racine;
    tab_dim[COLONNE] = racine;
    periods[LIGNE] = 0;
    periods[COLONNE] = 0;
    reorder = 0;
    MPI_Cart_create(MPI_COMM_WORLD, 2, tab_dim, periods, reorder, &COMM_CART);

    /* get the rank in the Cartesian communicator */
    MPI_Comm_rank(COMM_CART, &rankCart);
    /* get the coordinates in the Cartesian communicator */
    MPI_Cart_coords(COMM_CART, rankCart, 2, coords);

    /* create the row communicator */
    remain[LIGNE] = 1;
    remain[COLONNE] = 0;
    MPI_Cart_sub(COMM_CART, remain, &COMM_ROWS);
    /* get the rank in the row communicator */
    MPI_Comm_rank(COMM_ROWS, &rankRow);

    /* create the column communicator */
    remain[LIGNE] = 0;
    remain[COLONNE] = 1;
    MPI_Cart_sub(COMM_CART, remain, &COMM_COLS);
    /* get the rank in the column communicator */
    MPI_Comm_rank(COMM_COLS, &rankCol);
}
void create_grid(GRID_INFO * grid){
    int old_rank;
    int size;

    MPI_Comm_rank(MPI_COMM_WORLD, &old_rank);
#ifdef DEBUG_MODE
    if(old_rank == 0) { DEBUG("Creating grid ...") }
#endif

    /* Setting up order of grid */
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    grid->processes = size;
    grid->grid_order = sqrt(size);
#ifdef DEBUG_MODE
    if(old_rank == 0){ DEBUGA(" Order %d", grid->grid_order) };
#endif

    int dimensions[2] = {grid->grid_order, grid->grid_order};
    int periods[2] = {1, 1};

    /* Creating the grid */
    MPI_Cart_create(MPI_COMM_WORLD, 2, dimensions, periods, 1, &(grid->grid_comm));

    /* Get rank in the grid */
    MPI_Comm_rank(grid->grid_comm, &(grid->rank));

    /* Get coordinates in the grid */
    int coord[2];
    MPI_Cart_coords(grid->grid_comm, grid->rank, 2, coord);
    grid->row = coord[0];
    grid->col = coord[1];

#ifdef DEBUG_MODE
    if(old_rank == 0) { DEBUG(" Creating row communicator") }
#endif
    /* Creating row communicator */
    int variable_coord[2] = {0, 1};
    MPI_Cart_sub(grid->grid_comm, variable_coord, &(grid->row_comm));

#ifdef DEBUG_MODE
    if(old_rank == 0) { DEBUG(" Creating col communicator") }
#endif
    /* Creating column communicator */
    variable_coord[0] = 1;
    variable_coord[1] = 0;
    MPI_Cart_sub(grid->grid_comm, variable_coord, &(grid->col_comm));

#ifdef DEBUG_MODE
    if(old_rank == 0) { DEBUG("Grid created.") }
#endif
}
void split_communicator(MPI_Comm comm, MPI_Comm cart[], int P[]) {
    int wrap[] = {0, 0};
    int coor[2];
    MPI_Comm gcart;

    MPI_Cart_create(comm, 2, P, wrap, 1, &gcart);   /* reorder argument = 1: allow reordering */
    MPI_Cart_get(gcart, 2, P, wrap, coor);

    int rdim1[] = {0, 1}, rdim2[] = {1, 0};
    MPI_Cart_sub(gcart, rdim1, &cart[0]);   /* keep dim 1: row communicator */
    MPI_Cart_sub(gcart, rdim2, &cart[1]);   /* keep dim 0: column communicator */
}
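split_communicator returns the row and column communicators through the cart[] array and refreshes P via MPI_Cart_get. A minimal usage sketch, assuming the grid dimensions are chosen with MPI_Dims_create (this driver is illustrative, not part of the original source):

#include <mpi.h>

int main(int argc, char **argv)
{
    int nprocs, P[2] = {0, 0};
    MPI_Comm cart[2];   /* cart[0]: row communicator, cart[1]: column communicator */

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Dims_create(nprocs, 2, P);                  /* choose grid dimensions P[0] x P[1] */
    split_communicator(MPI_COMM_WORLD, cart, P);    /* P is updated via MPI_Cart_get inside */

    MPI_Comm_free(&cart[0]);
    MPI_Comm_free(&cart[1]);
    MPI_Finalize();
    return 0;
}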
void summa(MPI_Comm comm_cart, const int m_block, const int n_block, const int k_block,
           double A_local[], double B_local[], double C_local[]) {
    // determine my cart coords
    int coords[2];
    MPI_Cart_coords(comm_cart, rank, 2, coords);
    const int my_row = coords[0];
    const int my_col = coords[1];

    int belongs[2];

    // create row comms for A
    MPI_Comm row_comm;
    belongs[0] = 0;
    belongs[1] = 1;
    MPI_Cart_sub(comm_cart, belongs, &row_comm);

    // create col comms for B
    MPI_Comm col_comm;
    belongs[0] = 1;
    belongs[1] = 0;
    MPI_Cart_sub(comm_cart, belongs, &col_comm);

    /*
    int row_rank, col_rank;
    MPI_Comm_rank(row_comm, &row_rank);
    MPI_Comm_rank(col_comm, &col_rank);
    if(rank == 1)
        std::cout << "Rank: " << rank << "-->(" << my_col << "," << my_row << ") ("
                  << col_rank << "," << row_rank << ")" << std::endl;
    */

    double * A_saved = (double *) calloc(m_block * n_block, sizeof(double));
    double * B_saved = (double *) calloc(n_block * k_block, sizeof(double));
    double * C_tmp   = (double *) calloc(m_block * k_block, sizeof(double));

    // each process keeps a copy of its original blocks
    memcpy(A_saved, A_local, m_block * n_block * sizeof(double));
    memcpy(B_saved, B_local, n_block * k_block * sizeof(double));

    int number_blocks = n / n_block;

    for (int broadcaster = 0; broadcaster < number_blocks; ++broadcaster) {
        if (my_col == broadcaster) {
            memcpy(A_local, A_saved, m_block * n_block * sizeof(double));
        }
        MPI_Bcast(A_local, m_block * n_block, MPI_DOUBLE, broadcaster, row_comm);

        if (my_row == broadcaster) {
            memcpy(B_local, B_saved, n_block * k_block * sizeof(double));
        }
        MPI_Bcast(B_local, n_block * k_block, MPI_DOUBLE, broadcaster, col_comm);

        multMatricesLineByLine(m_block, n_block, k_block, A_local, B_local, C_tmp);
        sumMatrix(m_block, n_block, C_local, C_tmp, C_local);
    }
}
//define the cartesian grid void create_MPI_cartesian_grid() { #ifdef USE_MPI coords periods; for(int mu=0;mu<NDIM;mu++) periods[mu]=1; MPI_Cart_create(MPI_COMM_WORLD,NDIM,nrank_dir,periods,1,&cart_comm); //takes rank and ccord of local rank MPI_Comm_rank(cart_comm,&cart_rank); MPI_Cart_coords(cart_comm,cart_rank,NDIM,rank_coord); //create communicator along plan for(int mu=0;mu<NDIM;mu++) { coords split_plan; coords proj_rank_coord; for(int nu=0;nu<NDIM;nu++) { split_plan[nu]=(nu==mu) ? 0 : 1; proj_rank_coord[nu]=(nu==mu) ? 0 : rank_coord[nu]; } MPI_Cart_sub(cart_comm,split_plan,&(plan_comm[mu])); MPI_Comm_rank(plan_comm[mu],&(plan_rank[mu])); if(plan_rank[mu]!=rank_of_coord(proj_rank_coord)) crash("Plan communicator has messed up coord: %d and rank %d (implement reorder!)", rank_of_coord(proj_rank_coord),plan_rank[mu]); } //create communicator along line for(int mu=0;mu<NDIM;mu++) { //split the communicator coords split_line; memset(split_line,0,sizeof(coords)); split_line[mu]=1; MPI_Cart_sub(cart_comm,split_line,&(line_comm[mu])); //get rank id MPI_Comm_rank(line_comm[mu],&(line_rank[mu])); //get rank coord along line comm MPI_Cart_coords(line_comm[mu],line_rank[mu],1,&(line_coord_rank[mu])); //check communicator if(line_rank[mu]!=rank_coord[mu] || line_rank[mu]!=line_coord_rank[mu]) crash("Line communicator has messed up coord and rank (implement reorder!)"); } #else cart_rank=plan_rank=line_rank=0; for(int mu=0;mu<NDIM;mu++) rank_coord[mu]=planline_coord[mu]=0; #endif }
void mpla_generic_dgemv(struct mpla_vector* b, struct mpla_generic_matrix* A, struct mpla_vector* x,
        void (*mpla_dgemv_core)(struct mpla_vector*, struct mpla_generic_matrix*, struct mpla_vector*, struct mpla_instance*),
        struct mpla_instance* instance)
{
    // allocate redistributed vector
    struct mpla_vector x_redist;
    mpla_init_vector_for_block_rows(&x_redist, instance, x->vec_row_count);

    // redistribute input vector from row-block parallel distribution to column-block parallel distribution
    mpla_redistribute_vector_for_generic_dgesv(&x_redist, x, A, instance);

    // generic computation core: matrix-vector product
    mpla_dgemv_core(b, A, &x_redist, instance);

    // create sub-communicator for each process row
    int remain_dims[2];
    remain_dims[0] = 0;
    remain_dims[1] = 1;
    MPI_Comm row_comm;
    MPI_Cart_sub(instance->comm, remain_dims, &row_comm);

    // summation of block row results
    double* sum;
    cudaMalloc((void**)&sum, sizeof(double)*b->cur_proc_row_count);
    cudaThreadSynchronize();
    checkCUDAError("cudaMalloc");
    MPI_Allreduce(b->data, sum, b->cur_proc_row_count, MPI_DOUBLE, MPI_SUM, row_comm);
    cudaMemcpy(b->data, sum, sizeof(double)*b->cur_proc_row_count, cudaMemcpyDeviceToDevice);

    // cleanup
    cudaFree(sum);
    mpla_free_vector(&x_redist, instance);
    MPI_Comm_free(&row_comm);
}
void mpi_cart_sub_f(MPI_Fint *comm, ompi_fortran_logical_t *remain_dims, MPI_Fint *new_comm, MPI_Fint *ierr) { MPI_Comm c_comm, c_new_comm; /* * Just in the case, when sizeof(logical)!=sizeof(int) and * Fortran TRUE-value != 1, we have to convert -- then we need * to know the number of dimensions, for the size of remain_dims */ #if OMPI_FORTRAN_MUST_CONVERT_LOGICAL_2_INT == 1 int ndims; #endif OMPI_LOGICAL_ARRAY_NAME_DECL(remain_dims); c_comm = MPI_Comm_f2c(*comm); c_new_comm = MPI_Comm_f2c(*new_comm); #if OMPI_FORTRAN_MUST_CONVERT_LOGICAL_2_INT == 1 *ierr = OMPI_INT_2_FINT(MPI_Cartdim_get(c_comm, &ndims)); if (MPI_SUCCESS != OMPI_FINT_2_INT(*ierr)) { return; } #endif OMPI_ARRAY_LOGICAL_2_INT(remain_dims, ndims); *ierr = OMPI_INT_2_FINT(MPI_Cart_sub(c_comm, OMPI_LOGICAL_ARRAY_NAME_CONVERT(remain_dims), &c_new_comm)); if (MPI_SUCCESS == OMPI_FINT_2_INT(*ierr)) { *new_comm = MPI_Comm_c2f(c_new_comm); } OMPI_ARRAY_INT_2_LOGICAL(remain_dims, ndims); }
void mpla_copy_distributed_vector_to_cpu(double* x_cpu, struct mpla_vector* x, struct mpla_instance* instance)
{
    // create sub-communicator for each process column
    int remain_dims[2];
    remain_dims[0] = 1;
    remain_dims[1] = 0;
    MPI_Comm column_comm;
    MPI_Cart_sub(instance->comm, remain_dims, &column_comm);
    int column_rank;
    MPI_Comm_rank(column_comm, &column_rank);

    // columnwise creation of the full vector
    double* full_vector = x_cpu;
    int* recvcounts = new int[instance->proc_rows];
    int* displs = new int[instance->proc_rows];
    for (int i = 0; i < instance->proc_rows; i++)
    {
        recvcounts[i] = x->proc_row_count[i][instance->cur_proc_col];
        displs[i] = x->proc_row_offset[i][instance->cur_proc_col];
    }
//  cudaMalloc((void**)&full_vector, sizeof(double)*x->vec_row_count);
//  cudaThreadSynchronize();
//  checkCUDAError("cudaMalloc");
    MPI_Allgatherv(x->data, x->cur_proc_row_count, MPI_DOUBLE, full_vector, recvcounts, displs, MPI_DOUBLE, column_comm);

    // memory cleanup
    MPI_Comm_free(&column_comm);
    MPI_Barrier(instance->comm);
}
void mpi_manager_2D::get_SubComms() {
    //! Obtain ranks and communicators for the 1D sub-grids
    int remain_dims[2];

    // x-direction:
    remain_dims[0] = 1;
    remain_dims[1] = 0;
    MPI_Cart_sub(comm2d, remain_dims, &comm_line_x);
    MPI_Comm_rank(comm_line_x, &rank_line_x);

    // y-direction:
    remain_dims[0] = 0;
    remain_dims[1] = 1;
    MPI_Cart_sub(comm2d, remain_dims, &comm_line_y);
    MPI_Comm_rank(comm_line_y, &rank_line_y);
}
void mpif_cart_sub_(MPI_Fint *old_comm, int *belongs, MPI_Fint *new_comm, int *error)
{
    MPI_Comm old_comm_c = MPI_Comm_f2c(*old_comm);
    MPI_Comm new_comm_c;
    *error = MPI_Cart_sub(old_comm_c, belongs, &new_comm_c);
    *new_comm = MPI_Comm_c2f(new_comm_c);
}
int main (int argc, char** argv)
{
    int num_tasks;
    char hostname[80];
    int dims[DIM];
    dims[0] = DIM_0;
    dims[1] = DIM_1;
    dims[2] = DIM_2;
    int periods[DIM] = {false, false, false};
    int reorder = true;
    int my_rank;
    int coords[DIM];
    MPI_Comm cartcomm, y_comm;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &num_tasks);

    if (num_tasks != SIZE) {
        if (my_rank == 0) {
            printf("We need %d processes, %d given. Exiting.\n", SIZE, num_tasks);
        }
        MPI_Finalize();
        return 0;
    }

    gethostname(hostname, 79);

    MPI_Cart_create(MPI_COMM_WORLD, DIM, dims, periods, reorder, &cartcomm);
    MPI_Cart_coords(cartcomm, my_rank, DIM, coords);
    printf("%-15.12s: MPI_COMM_WORLD rank %2d: (%d, %d, %d)\n",
           hostname, my_rank, coords[0], coords[1], coords[2]);

    // neighbors: MPI_Cart_shift expects pointers for the source and destination ranks
    int src, dest;
    for (int i = 0; i < 3; i++) {
        MPI_Cart_shift(cartcomm, i, +1, &src, &dest);
        printf("i am %d and my right neighbor in dim %d is %d\n", my_rank, i, dest);
        MPI_Cart_shift(cartcomm, i, -1, &src, &dest);
        printf("i am %d and my left neighbor in dim %d is %d\n", my_rank, i, dest);
    }

    // keep only the y dimension: keep_dims needs one entry per grid dimension
    int keep_dims[DIM] = {0, 1, 0};
    MPI_Cart_sub(cartcomm, keep_dims, &y_comm);
    printf("%d: my y rank is %d\n", my_rank, coords[1]);

    MPI_Finalize();
    return 0;
}
/**
 * \brief Given a communicator for a 2D process grid, this routine
 * returns a new communicator consisting only of the process-grid row
 * in which the calling process belongs.
 *
 * Example: If the process grid is 2 x 3, e.g.,
 *
 *   (0, 0) | (0, 1) | (0, 2)
 *   (1, 0) | (1, 1) | (1, 2)
 *
 * and process (1, 1) calls this routine, then the routine will
 * return a communicator containing the processes, {(1, 0), (1, 1),
 * (1, 2)}.
 */
static MPI_Comm getCommRow__ (MPI_Comm comm2d)
{
    int select[2] = {0, 1};
    MPI_Comm comm_row;
    MPI_Cart_sub (comm2d, select, &comm_row);
    return comm_row;
}
/**
 * \brief Given a communicator for a 2D process grid, this routine
 * returns a new communicator consisting only of the process-grid
 * column in which the calling process belongs.
 *
 * Example: If the process grid is 2 x 3, e.g.,
 *
 *   (0, 0) | (0, 1) | (0, 2)
 *   (1, 0) | (1, 1) | (1, 2)
 *
 * and process (1, 1) calls this routine, then the routine will
 * return a communicator containing the processes, {(0, 1), (1, 1)}.
 */
static MPI_Comm getCommCol__ (MPI_Comm comm2d)
{
    int select[2] = {1, 0};
    MPI_Comm comm_col;
    MPI_Cart_sub (comm2d, select, &comm_col);
    return comm_col;
}
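The two helpers above compose directly with MPI_Cart_create. A minimal driver sketch, assuming the same 2 x 3 grid as the doc comments (the grid dimensions and the printout are illustrative, not part of the original source):

#include <mpi.h>
#include <stdio.h>

/* illustrative driver for getCommRow__/getCommCol__; run with exactly 6 processes
   so the 2 x 3 grid from the doc comments can be formed */
int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int dims[2] = {2, 3}, periods[2] = {0, 0};
    MPI_Comm comm2d;
    MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 1, &comm2d);

    MPI_Comm comm_row = getCommRow__(comm2d);   /* the 3 processes in my grid row */
    MPI_Comm comm_col = getCommCol__(comm2d);   /* the 2 processes in my grid column */

    int row_rank, col_rank;
    MPI_Comm_rank(comm_row, &row_rank);
    MPI_Comm_rank(comm_col, &col_rank);
    printf("row rank %d, col rank %d\n", row_rank, col_rank);

    MPI_Comm_free(&comm_row);
    MPI_Comm_free(&comm_col);
    MPI_Comm_free(&comm2d);
    MPI_Finalize();
    return 0;
}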
void Setup_grid(
         GRID_INFO_T*  grid  /* out */) {
    int old_rank;
    int dimensions[2];
    int wrap_around[2];
    int coordinates[2];
    int free_coords[2];

    /* Set up Global Grid Information */
    MPI_Comm_size(MPI_COMM_WORLD, &(grid->p));
    MPI_Comm_rank(MPI_COMM_WORLD, &old_rank);

    /* We assume p is a perfect square */
    grid->q = (int) sqrt((double) grid->p);
    dimensions[0] = dimensions[1] = grid->q;

    /* We want a circular shift in second dimension. */
    /* Don't care about first                        */
    wrap_around[0] = wrap_around[1] = 1;
    MPI_Cart_create(MPI_COMM_WORLD, 2, dimensions, wrap_around, 1, &(grid->comm));
    MPI_Comm_rank(grid->comm, &(grid->my_rank));
    MPI_Cart_coords(grid->comm, grid->my_rank, 2, coordinates);
    grid->my_row = coordinates[0];
    grid->my_col = coordinates[1];

    /* Set up row communicators */
    free_coords[0] = 0;
    free_coords[1] = 1;
    MPI_Cart_sub(grid->comm, free_coords, &(grid->row_comm));

    /* Set up column communicators */
    free_coords[0] = 1;
    free_coords[1] = 0;
    MPI_Cart_sub(grid->comm, free_coords, &(grid->col_comm));
} /* Setup_grid */
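Setup_grid fills a GRID_INFO_T structure that must be defined elsewhere; a plausible definition, inferred only from the member accesses above (any additional fields in the real header are unknown):

typedef struct {
    int       p;         /* total number of processes                */
    int       q;         /* order of the grid: q = sqrt(p)           */
    int       my_rank;   /* rank in the grid communicator            */
    int       my_row;    /* this process's row coordinate            */
    int       my_col;    /* this process's column coordinate         */
    MPI_Comm  comm;      /* communicator for the whole grid          */
    MPI_Comm  row_comm;  /* communicator for this process's row      */
    MPI_Comm  col_comm;  /* communicator for this process's column   */
} GRID_INFO_T;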
void Setup_grid(GRID_INFO_TYPE* grid){
    int dimensions[2];
    int periods[2];
    int coordinates[2];
    int varying_coords[2];

    MPI_Comm_size(MPI_COMM_WORLD, &(grid->p));   /* get the total number of processes */
    grid->q = (int)sqrt((double) grid->p);       /* square grid */
    dimensions[0] = dimensions[1] = grid->q;     /* dimensions of the grid */
    periods[0] = periods[1] = 1;                 /* periodic if 1, non-periodic if 0 */

    /* create a grid communicator from MPI_COMM_WORLD, with dimension 2,
       allowing the process-to-processor mapping to be reordered */
    MPI_Cart_create(MPI_COMM_WORLD, 2, dimensions, periods, 1, &(grid->comm));
    MPI_Comm_rank(grid->comm, &(grid->my_rank));
    MPI_Cart_coords(grid->comm, grid->my_rank, 2, coordinates);
    grid->my_row = coordinates[0];
    grid->my_col = coordinates[1];

    varying_coords[0] = 0; varying_coords[1] = 1;
    MPI_Cart_sub(grid->comm, varying_coords, &(grid->row_comm));
    varying_coords[0] = 1; varying_coords[1] = 0;
    MPI_Cart_sub(grid->comm, varying_coords, &(grid->col_comm));
}
int main(int argc, char **argv) { MPI_Init(&argc, &argv); int i, myrank, numranks, groupsize; int dims[3] = {0, 0, 0}; int temp[3] = {0, 0, 0}; int coord[3] = {0, 0, 0}; int periods[3] = {1, 1, 1}; double startTime, stopTime; MPI_Comm cartcomm, subcomm; MPI_Comm_rank(MPI_COMM_WORLD, &myrank); MPI_Comm_size(MPI_COMM_WORLD, &numranks); dims[MP_X] = atoi(argv[1]); dims[MP_Y] = atoi(argv[2]); dims[MP_Z] = atoi(argv[3]); MPI_Dims_create(numranks, 3, dims); MPI_Cart_create(MPI_COMM_WORLD, 3, dims, periods, 1, &cartcomm); MPI_Cart_get(cartcomm, 3, dims, periods, coord); temp[MP_X] = 0; temp[MP_Y] = 1; temp[MP_Z] = 0; MPI_Cart_sub(cartcomm, temp, &subcomm); MPI_Comm_size(subcomm,&groupsize); int perrank = atoi(argv[4]); char *sendbuf = (char*)malloc(perrank*groupsize); char *recvbuf = (char*)malloc(perrank*groupsize); MPI_Barrier(cartcomm); MPI_Pcontrol(1); startTime = MPI_Wtime(); for (i=0; i<MAX_ITER; i++) { MPI_Alltoall(sendbuf, perrank, MPI_CHAR, recvbuf, perrank, MPI_CHAR, subcomm); } MPI_Barrier(cartcomm); stopTime = MPI_Wtime(); MPI_Pcontrol(0); if(myrank == 0) { printf("Completed %d iterations for subcom size %d, perrank %d\n", i, groupsize, perrank); printf("Time elapsed: %f\n", stopTime - startTime); } MPI_Finalize(); return 0; }
void TreeCommunicator::comm_create(const MPI_Comm &comm) { int num_dim = m_fan_out.size(); int color, key; MPI_Comm comm_cart; std::vector<int> flags(num_dim); std::vector<int> coords(num_dim); int rank_cart; memset(flags.data(), 0, sizeof(int)*num_dim); flags[0] = 1; check_mpi(MPI_Cart_create(comm, num_dim, m_fan_out.data(), flags.data(), 1, &comm_cart)); check_mpi(MPI_Comm_rank(comm_cart, &rank_cart)); check_mpi(MPI_Cart_coords(comm_cart, rank_cart, num_dim, coords.data())); check_mpi(MPI_Cart_sub(comm_cart, flags.data(), &(m_comm[0]))); for (int i = 1; i < num_dim; ++i) { if (coords[i-1] == 0) { color = 1; key = coords[i]; } else { color = MPI_UNDEFINED; key = 0; } check_mpi(MPI_Comm_split(comm_cart, color, key, &(m_comm[i]))); } check_mpi(MPI_Comm_free(&comm_cart)); m_num_level = 0; for (auto comm_it = m_comm.begin(); comm_it != m_comm.end() && *comm_it != MPI_COMM_NULL; ++comm_it) { m_num_level++; } m_comm.resize(m_num_level); if (m_global_policy) { m_num_level++; } if (rank_cart == 0 && m_global_policy == NULL) { throw Exception("process at root of tree communicator has not mapped the control file", GEOPM_ERROR_CTL_COMM, __FILE__, __LINE__); } if (rank_cart != 0 && m_global_policy != NULL) { throw Exception("process not at root of tree communicator has mapped the control file", GEOPM_ERROR_CTL_COMM, __FILE__, __LINE__); } }
void mpla_save_vector(struct mpla_vector* x, char* filename, struct mpla_instance* instance) { // create sub-communicator for each process column int remain_dims[2]; remain_dims[0]=1; remain_dims[1]=0; MPI_Comm column_comm; MPI_Cart_sub(instance->comm, remain_dims, &column_comm); int column_rank; MPI_Comm_rank(column_comm, &column_rank); // columnwise creation of the full vector double* full_vector; int* recvcounts = new int[instance->proc_rows]; int* displs = new int[instance->proc_rows]; for (int i=0; i<instance->proc_rows; i++) { recvcounts[i] = x->proc_row_count[i][instance->cur_proc_col]; displs[i] = x->proc_row_offset[i][instance->cur_proc_col]; } cudaMalloc((void**)&full_vector, sizeof(double)*x->vec_row_count); cudaThreadSynchronize(); checkCUDAError("cudaMalloc"); MPI_Allgatherv(x->data, x->cur_proc_row_count, MPI_DOUBLE, full_vector, recvcounts, displs, MPI_DOUBLE, column_comm); // writing full vector to file on parent process if (instance->is_parent) { FILE* f = fopen(filename, "wb"); double* full_vector_host = new double[x->vec_row_count]; cudaMemcpy(full_vector_host, full_vector, x->vec_row_count*sizeof(double), cudaMemcpyDeviceToHost); fwrite(&(x->vec_row_count), sizeof(int), 1, f); fwrite(full_vector_host, sizeof(double), x->vec_row_count, f); fclose(f); delete [] full_vector_host; } // memory cleanup cudaFree(full_vector); MPI_Comm_free(&column_comm); MPI_Barrier(instance->comm); }
static void extract_comm_1d(
    int dim, MPI_Comm comm_cart, MPI_Comm *comm_1d
    )
{
  int ndims, *remain_dims;

  MPI_Cartdim_get(comm_cart, &ndims);

  remain_dims = (int *) malloc(sizeof(int) * (size_t) ndims);
  for(int t=0; t<ndims; t++)
    remain_dims[t] = (t==dim) ? 1 : 0;

  MPI_Cart_sub(comm_cart, remain_dims, comm_1d);

  free(remain_dims);
}
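Because extract_comm_1d queries the dimensionality with MPI_Cartdim_get, it works for a Cartesian communicator of any rank. A minimal usage sketch, assuming a 3D grid built with MPI_Dims_create (the driver itself is illustrative):

#include <mpi.h>

int main(int argc, char **argv)
{
    int ndims = 3, dims[3] = {0, 0, 0}, periods[3] = {0, 0, 0};
    int nprocs;
    MPI_Comm comm_cart, comm_1d[3];

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Dims_create(nprocs, ndims, dims);            /* factor nprocs into a 3D grid */
    MPI_Cart_create(MPI_COMM_WORLD, ndims, dims, periods, 1, &comm_cart);

    for (int d = 0; d < ndims; d++)
        extract_comm_1d(d, comm_cart, &comm_1d[d]);  /* comm_1d[d] varies only along dimension d */

    for (int d = 0; d < ndims; d++)
        MPI_Comm_free(&comm_1d[d]);
    MPI_Comm_free(&comm_cart);
    MPI_Finalize();
    return 0;
}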
void mpla_ddot(double* xy, struct mpla_vector* x, struct mpla_vector* y, struct mpla_instance* instance)
{
    // compute process-wise dot product
    double xy_tmp;
    cublasDdot(instance->cublas_handle, x->cur_proc_row_count, x->data, 1, y->data, 1, &xy_tmp);

    // create sub-communicator for each process column
    int remain_dims[2];
    remain_dims[0] = 1;
    remain_dims[1] = 0;
    MPI_Comm column_comm;
    MPI_Cart_sub(instance->comm, remain_dims, &column_comm);

    // parallel summation and communication
    MPI_Allreduce(&xy_tmp, xy, 1, MPI_DOUBLE, MPI_SUM, column_comm);

    MPI_Comm_free(&column_comm);
}
FORT_DLL_SPEC void FORT_CALL mpi_cart_sub_ ( MPI_Fint *v1, MPI_Fint v2[], MPI_Fint *v3, MPI_Fint *ierr ){ int _ctsize; int *l2=0; {int _topotype; PMPI_Topo_test( (MPI_Comm)*v1, &_topotype ); if (_topotype != MPI_CART) { _ctsize = 0; } else PMPI_Cartdim_get( (MPI_Comm)*v1, &_ctsize ); } if (_ctsize) {int li; l2 = (int *)MPL_malloc(_ctsize * sizeof(int), MPL_MEM_OTHER); for (li=0; li<_ctsize; li++) { l2[li] = MPII_FROM_FLOG(v2[li]); } } *ierr = MPI_Cart_sub( (MPI_Comm)(*v1), l2, (MPI_Comm *)(v3) ); if (l2) { MPL_free( l2 ); } }
void mpla_redistribute_vector_for_generic_dgesv(struct mpla_vector* b_redist, struct mpla_vector* b,
        struct mpla_generic_matrix* A, struct mpla_instance* instance)
{
    // attention: this code does no correctness check for the input data
    // WARNING: the following code is not efficient for a strong parallelization

    // create sub-communicator for each process column
    int remain_dims[2];
    remain_dims[0] = 1;
    remain_dims[1] = 0;
    MPI_Comm column_comm;
    MPI_Cart_sub(instance->comm, remain_dims, &column_comm);
    int column_rank;
    MPI_Comm_rank(column_comm, &column_rank);

    // columnwise creation of the full vector
    double* full_vector;
    int* recvcounts = new int[instance->proc_rows];
    int* displs = new int[instance->proc_rows];
    for (int i = 0; i < instance->proc_rows; i++)
    {
        recvcounts[i] = b->proc_row_count[i][instance->cur_proc_col];
        displs[i] = b->proc_row_offset[i][instance->cur_proc_col];
    }
    cudaMalloc((void**)&full_vector, sizeof(double)*b->vec_row_count);
    cudaThreadSynchronize();
    checkCUDAError("cudaMalloc");
    MPI_Allgatherv(b->data, b->cur_proc_row_count, MPI_DOUBLE, full_vector, recvcounts, displs, MPI_DOUBLE, column_comm);

    // extract column-wise local part of full vector
    cudaMemcpy(b_redist->data, &(full_vector[b_redist->cur_proc_row_offset]),
               sizeof(double)*b_redist->cur_proc_row_count, cudaMemcpyDeviceToDevice);

    // memory cleanup
    cudaFree(full_vector);
    MPI_Comm_free(&column_comm);
}
int main (int argc, char **argv) { FILE *fp; double **A = NULL, **B = NULL, **C = NULL, *A_array = NULL, *B_array = NULL, *C_array = NULL; double *A_local_block = NULL, *B_local_block = NULL, *C_local_block = NULL; int A_rows, A_columns, A_local_block_rows, A_local_block_columns, A_local_block_size; int B_rows, B_columns, B_local_block_rows, B_local_block_columns, B_local_block_size; int rank, size, sqrt_size, matrices_a_b_dimensions[4]; MPI_Comm cartesian_grid_communicator, row_communicator, column_communicator; MPI_Status status; MPI_Request request1,request2; // used to manage the cartesian grid int dimensions[2], periods[2], coordinates[2], remain_dims[2]; MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &size); MPI_Comm_rank(MPI_COMM_WORLD, &rank); /* For square mesh */ sqrt_size = (int)sqrt((double) size); if(sqrt_size * sqrt_size != size){ if( rank == 0 ) perror("need to run mpiexec with a perfect square number of processes\n"); MPI_Abort(MPI_COMM_WORLD, -1); } // create a 2D cartesian grid dimensions[0] = dimensions[1] = sqrt_size; periods[0] = periods[1] = 1; MPI_Cart_create(MPI_COMM_WORLD, 2, dimensions, periods, 1, &cartesian_grid_communicator); MPI_Cart_coords(cartesian_grid_communicator, rank, 2, coordinates); // create a row communicator remain_dims[0] = 0; remain_dims[1] = 1; MPI_Cart_sub(cartesian_grid_communicator, remain_dims, &row_communicator); // create a column communicator remain_dims[0] = 1; remain_dims[1] = 0; MPI_Cart_sub(cartesian_grid_communicator, remain_dims, &column_communicator); // getting matrices from files at rank 0 only // example: mpiexec -n 64 ./cannon matrix1 matrix2 [test] if (rank == 0){ int row, column; if ((fp = fopen (argv[1], "r")) != NULL){ fscanf(fp, "%d %d\n", &matrices_a_b_dimensions[0], &matrices_a_b_dimensions[1]); A = (double **) malloc (matrices_a_b_dimensions[0] * sizeof(double *)); for (row = 0; row < matrices_a_b_dimensions[0]; row++){ A[row] = (double *) malloc(matrices_a_b_dimensions[1] * sizeof(double)); for (column = 0; column < matrices_a_b_dimensions[1]; column++) fscanf(fp, "%lf", &A[row][column]); } fclose(fp); } else { if(rank == 0) fprintf(stderr, "error opening file for matrix A (%s)\n", argv[1]); MPI_Abort(MPI_COMM_WORLD, -1); } if((fp = fopen (argv[2], "r")) != NULL){ fscanf(fp, "%d %d\n", &matrices_a_b_dimensions[2], &matrices_a_b_dimensions[3]); B = (double **) malloc (matrices_a_b_dimensions[2] * sizeof(double *)); for(row = 0; row < matrices_a_b_dimensions[2]; row++){ B[row] = (double *) malloc(matrices_a_b_dimensions[3] * sizeof(double *)); for(column = 0; column < matrices_a_b_dimensions[3]; column++) fscanf(fp, "%lf", &B[row][column]); } fclose(fp); } else { if(rank == 0) fprintf(stderr, "error opening file for matrix B (%s)\n", argv[2]); MPI_Abort(MPI_COMM_WORLD, -1); } // need to check that the multiplication is possible given dimensions // matrices_a_b_dimensions[0] = row size of A // matrices_a_b_dimensions[1] = column size of A // matrices_a_b_dimensions[2] = row size of B // matrices_a_b_dimensions[3] = column size of B if(matrices_a_b_dimensions[1] != matrices_a_b_dimensions[2]){ if(rank == 0) fprintf(stderr, "A's column size (%d) must match B's row size (%d)\n", matrices_a_b_dimensions[1], matrices_a_b_dimensions[2]); MPI_Abort(MPI_COMM_WORLD, -1); } // this implementation is limited to cases where thematrices can be partitioned perfectly if( matrices_a_b_dimensions[0] % sqrt_size != 0 || matrices_a_b_dimensions[1] % sqrt_size != 0 || matrices_a_b_dimensions[2] % sqrt_size != 0 || 
matrices_a_b_dimensions[3] % sqrt_size != 0 ){ if(rank == 0) fprintf(stderr, "cannot distribute work evenly among processe\n" "all dimensions (A: r:%d c:%d; B: r:%d c:%d) need to be divisible by %d\n", matrices_a_b_dimensions[0],matrices_a_b_dimensions[1], matrices_a_b_dimensions[2],matrices_a_b_dimensions[3], sqrt_size ); MPI_Abort(MPI_COMM_WORLD, -1); } } // send dimensions to all peers if(rank == 0) { int i; for(i = 1; i < size; i++){ MPI_Send(matrices_a_b_dimensions, 4, MPI_INT, i, 0, cartesian_grid_communicator); } } else { MPI_Recv(matrices_a_b_dimensions, 4, MPI_INT, 0, 0, cartesian_grid_communicator, &status); } A_rows = matrices_a_b_dimensions[0]; A_columns = matrices_a_b_dimensions[1]; B_rows = matrices_a_b_dimensions[2]; B_columns = matrices_a_b_dimensions[3]; // local metadata for A A_local_block_rows = A_rows / sqrt_size; A_local_block_columns = A_columns / sqrt_size; A_local_block_size = A_local_block_rows * A_local_block_columns; A_local_block = (double *) malloc (A_local_block_size * sizeof(double)); // local metadata for B B_local_block_rows = B_rows / sqrt_size; B_local_block_columns = B_columns / sqrt_size; B_local_block_size = B_local_block_rows * B_local_block_columns; B_local_block = (double *) malloc (B_local_block_size * sizeof(double)); // local metadata for C C_local_block = (double *) malloc (A_local_block_rows * B_local_block_columns * sizeof(double)); // C needs to be initialized at 0 (accumulates partial dot-products) int i; for(i=0; i < A_local_block_rows * B_local_block_columns; i++){ C_local_block[i] = 0; } // full arrays only needed at root if(rank == 0){ A_array = (double *) malloc(sizeof(double) * A_rows * A_columns); B_array = (double *) malloc(sizeof(double) * B_rows * B_columns); C_array = (double *) malloc(sizeof(double) * A_rows * B_columns); // generate the 1D arrays of the matrices at root int row, column, i, j; for (i = 0; i < sqrt_size; i++){ for (j = 0; j < sqrt_size; j++){ for (row = 0; row < A_local_block_rows; row++){ for (column = 0; column < A_local_block_columns; column++){ A_array[((i * sqrt_size + j) * A_local_block_size) + (row * A_local_block_columns) + column] = A[i * A_local_block_rows + row][j * A_local_block_columns + column]; } } for (row = 0; row < B_local_block_rows; row++){ for (column = 0; column < B_local_block_columns; column++){ B_array[((i * sqrt_size + j) * B_local_block_size) + (row * B_local_block_columns) + column] = B[i * B_local_block_rows + row][j * B_local_block_columns + column]; } } } } // allocate output matrix C C = (double **) malloc(A_rows * sizeof(double *)); for(i=0; i<A_rows ;i++){ C[i] = (double *) malloc(B_columns * sizeof(double)); } } // send a block to each process if(rank == 0) { int i; for(i = 1; i < size; i++){ MPI_Send((A_array + (i * A_local_block_size)), A_local_block_size, MPI_DOUBLE, i, 0, cartesian_grid_communicator); MPI_Send((B_array + (i * B_local_block_size)), B_local_block_size, MPI_DOUBLE, i, 0, cartesian_grid_communicator); } for(i = 0; i < A_local_block_size; i++){ A_local_block[i] = A_array[i]; } for(i = 0; i < B_local_block_size; i++){ B_local_block[i] = B_array[i]; } } else { MPI_Recv(A_local_block, A_local_block_size, MPI_DOUBLE, 0, 0, cartesian_grid_communicator, &status); MPI_Recv(B_local_block, B_local_block_size, MPI_DOUBLE, 0, 0, cartesian_grid_communicator, &status); } //for(int r=0;r<size;r++){ // if(rank==15){ // int i; // for(i = 0; i < A_local_block_rows*B_local_block_columns;i++) { // // printf ("%7.3f ", A_local_block[i]); // } // } //} // 
MPI_Barrier(cartesian_grid_communicator); // cannon's algorithm int cannon_block_cycle; double compute_time = 0, mpi_time = 0, start; int C_index, A_row, A_column, B_column; for(cannon_block_cycle = 0; cannon_block_cycle < sqrt_size; cannon_block_cycle++){ //Asynchronus Send! start = MPI_Wtime(); MPI_Isend(A_local_block, A_local_block_size, MPI_DOUBLE, (coordinates[1] + sqrt_size - 1) % sqrt_size, 0, row_communicator, &request1); MPI_Isend(B_local_block, B_local_block_size, MPI_DOUBLE, (coordinates[0] + sqrt_size - 1) % sqrt_size, 0, column_communicator, &request2); mpi_time += MPI_Wtime() - start; // compute partial result for this block cycle start = MPI_Wtime(); for(C_index = 0, A_row = 0; A_row < A_local_block_rows; A_row++){ //MPI_Probe((coordinates[1] + 1) % sqrt_size,0,row_communicator,&status ); //MPI_Probe((coordinates[0] + 1) % sqrt_size,0,column_communicator,&status ); for(B_column = 0; B_column < B_local_block_columns; B_column++, C_index++){ for(A_column = 0; A_column < A_local_block_columns; A_column++){ C_local_block[C_index] += A_local_block[A_row * A_local_block_columns + A_column] * B_local_block[A_column * B_local_block_columns + B_column]; } } } compute_time += MPI_Wtime() - start; start = MPI_Wtime(); MPI_Wait(&request1,&status); MPI_Wait(&request2,&status); MPI_Recv(A_local_block, A_local_block_size, MPI_DOUBLE, (coordinates[1] + 1) % sqrt_size, 0, row_communicator, &status); MPI_Recv(B_local_block, B_local_block_size, MPI_DOUBLE, (coordinates[0] + 1) % sqrt_size, 0, column_communicator, &status); mpi_time += MPI_Wtime() - start; } // get C parts from other processes at rank 0 if(rank == 0) { for(i = 0; i < A_local_block_rows * B_local_block_columns; i++){ C_array[i] = C_local_block[i]; } int i; for(i = 1; i < size; i++){ MPI_Recv(C_array + (i * A_local_block_rows * B_local_block_columns), A_local_block_rows * B_local_block_columns, MPI_DOUBLE, i, 0, cartesian_grid_communicator, &status); } } else { MPI_Send(C_local_block, A_local_block_rows * B_local_block_columns, MPI_DOUBLE, 0, 0, cartesian_grid_communicator); } // generating output at rank 0 if (rank == 0) { // convert the ID array into the actual C matrix int i, j, k, row, column; for (i = 0; i < sqrt_size; i++){ // block row index for (j = 0; j < sqrt_size; j++){ // block column index for (row = 0; row < A_local_block_rows; row++){ for (column = 0; column < B_local_block_columns; column++){ C[i * A_local_block_rows + row] [j * B_local_block_columns + column] = C_array[((i * sqrt_size + j) * A_local_block_rows * B_local_block_columns) + (row * B_local_block_columns) + column]; } } } } printf("(%d,%d)x(%d,%d)=(%d,%d)\n", A_rows, A_columns, B_rows, B_columns, A_rows, B_columns); printf("Computation time: %lf\n", compute_time); printf("MPI time: %lf\n", mpi_time); if (argc == 4){ // present results on the screen printf("\nA( %d x %d ):\n", A_rows, A_columns); for(row = 0; row < A_rows; row++) { for(column = 0; column < A_columns; column++) printf ("%7.3f ", A[row][column]); printf ("\n"); } printf("\nB( %d x %d ):\n", B_rows, B_columns); for(row = 0; row < B_rows; row++){ for(column = 0; column < B_columns; column++) printf("%7.3f ", B[row][column]); printf("\n"); } printf("\nC( %d x %d ) = AxB:\n", A_rows, B_columns); for(row = 0; row < A_rows; row++){ for(column = 0; column < B_columns; column++) printf("%7.3f ",C[row][column]); printf("\n"); } printf("\nPerforming serial consistency check. 
Be patient...\n"); fflush(stdout); int pass = 1; double temp; for(i=0; i<A_rows; i++){ for(j=0; j<B_columns; j++){ temp = 0; for(k=0; k<B_rows; k++){ temp += A[i][k] * B[k][j]; } printf("%7.3f ", temp); if(temp != C[i][j]){ pass = 0; } } printf("\n"); } if (pass) printf("Consistency check: PASS\n"); else printf("Consistency check: FAIL\n"); } } // free all memory if(rank == 0){ int i; for(i = 0; i < A_rows; i++){ free(A[i]); } for(i = 0; i < B_rows; i++){ free(B[i]); } for(i = 0; i < A_rows; i++){ free(C[i]); } free(A); free(B); free(C); free(A_array); free(B_array); free(C_array); } free(A_local_block); free(B_local_block); free(C_local_block); // finalize MPI MPI_Finalize(); }
void SUMMA(MPI_Comm comm_cart, const int mb, const int nb, const int kb,
           double *A_loc, double *B_loc, double *C_loc) {
    // determine my cart coords
    int coords[2];
    MPI_Cart_coords(comm_cart, myrank, 2, coords);
    int my_col = coords[0];
    int my_row = coords[1];

    MPI_Comm row_comm;
    MPI_Comm col_comm;
    int remain_dims[2];

    // create row comms for A
    remain_dims[0] = 1;
    remain_dims[1] = 0;
    MPI_Cart_sub(comm_cart, remain_dims, &row_comm);

    // create col comms for B
    remain_dims[0] = 0;
    remain_dims[1] = 1;
    MPI_Cart_sub(comm_cart, remain_dims, &col_comm);

    double *A_loc_save = (double *) calloc(mb*nb, sizeof(double));
    double *B_loc_save = (double *) calloc(nb*kb, sizeof(double));
    double *C_loc_tmp  = (double *) calloc(mb*kb, sizeof(double));

    // each proc should save its own A_loc, B_loc
    memcpy(A_loc_save, A_loc, mb*nb*sizeof(double));
    memcpy(B_loc_save, B_loc, nb*kb*sizeof(double));

    // C_loc = 0.0
    memset(C_loc, 0, mb*kb*sizeof(double));

    int nblks = n / nb;

    // ======== YOUR CODE HERE ============================
    // Implement the main SUMMA loop here:
    // the root column (or row) should loop through nblks columns (rows).
    //
    // If a processor's column coordinate equals the root, it should broadcast
    // its local portion of A within its `row_comm` communicator.
    //
    // If a processor's row coordinate equals the root, it should broadcast
    // its local portion of B within its `col_comm` communicator.
    //
    // After broadcasting, call multiply_naive to multiply the local portions
    // which each processor has received from the others,
    // and store the result in the partial sum `C_loc_tmp`.
    //
    // Finally, accumulate the partial sums of `C_loc_tmp` into `C_loc` on each
    // iteration using the `plus_matrix` function.
    //
    // Tip: MPI_Bcast uses the same buffer pointer on all processors; initially
    // only the root processor's buffer contains the necessary data, and the
    // receivers obtain it during MPI_Bcast. Be sure not to overwrite each proc's
    // local matrix during these operations. This is why we saved the local parts
    // in `A_loc_save` and `B_loc_save` in advance.
    //
    // Sample solution outline:
    // for (int bcast_root = 0; bcast_root < nblks; ++bcast_root) {
    //
    //     int root_col = bcast_root;
    //     int root_row = bcast_root;
    //
    //     // owner of A_loc[root_col,:] will broadcast its block within row comm
    //     if (my_col == root_col) {
    //         // copy A_loc_save to A_loc
    //     }
    //     // broadcast A_loc from root_col within row_comm
    //
    //     // owner of B_loc[:,root_row] will broadcast its block within col comm
    //     if (my_row == root_row) {
    //         // copy B_loc_save to B_loc
    //     }
    //     // broadcast B_loc from root_row within col_comm
    //
    //     // multiply local blocks A_loc, B_loc using multiply_naive
    //     // and store in C_loc_tmp
    //
    //     // C_loc = C_loc + C_loc_tmp using plus_matrix
    // }
    // ====================================================

    free(A_loc_save);
    free(B_loc_save);
    free(C_loc_tmp);
}
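One possible completion of the marked loop, following the outline in the comments above. The helper names multiply_naive and plus_matrix come from those comments; their exact signatures are assumptions here, so this is a sketch rather than the exercise's reference solution.

// Sketch of the SUMMA loop for the skeleton above (assumed signatures:
// multiply_naive(m, n, k, A, B, C) computes C = A*B for row-major blocks,
// plus_matrix(m, k, C, C_tmp) accumulates C += C_tmp).
for (int bcast_root = 0; bcast_root < nblks; ++bcast_root) {
    int root_col = bcast_root;
    int root_row = bcast_root;

    // the owner column restores its saved block of A before broadcasting it along its row
    if (my_col == root_col) {
        memcpy(A_loc, A_loc_save, mb * nb * sizeof(double));
    }
    MPI_Bcast(A_loc, mb * nb, MPI_DOUBLE, root_col, row_comm);

    // the owner row restores its saved block of B before broadcasting it along its column
    if (my_row == root_row) {
        memcpy(B_loc, B_loc_save, nb * kb * sizeof(double));
    }
    MPI_Bcast(B_loc, nb * kb, MPI_DOUBLE, root_row, col_comm);

    // multiply the received blocks and accumulate the partial product into C_loc
    multiply_naive(mb, nb, kb, A_loc, B_loc, C_loc_tmp);
    plus_matrix(mb, kb, C_loc, C_loc_tmp);
}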
int main( int argc, char **argv ) { int rank, size, i; int errors=0; int dims[NUM_DIMS]; int periods[NUM_DIMS]; int coords[NUM_DIMS]; int new_coords[NUM_DIMS]; int reorder = 1; MPI_Comm comm_temp, comm_cart, new_comm; int topo_status; int ndims; int new_rank; int remain_dims[NUM_DIMS]; int newnewrank; MPI_Init( &argc, &argv ); MPI_Comm_rank( MPI_COMM_WORLD, &rank ); MPI_Comm_size( MPI_COMM_WORLD, &size ); /* Clear dims array and get dims for topology */ for(i=0;i<NUM_DIMS;i++) { dims[i] = 0; periods[i] = 0; } MPI_Dims_create ( size, NUM_DIMS, dims ); /* Make a new communicator with a topology */ MPI_Cart_create ( MPI_COMM_WORLD, 2, dims, periods, reorder, &comm_temp ); MPI_Comm_dup ( comm_temp, &comm_cart ); /* Determine the status of the new communicator */ MPI_Topo_test ( comm_cart, &topo_status ); if (topo_status != MPI_CART) { printf( "topo_status of duped comm is not MPI_CART\n" ); errors++; } /* How many dims do we have? */ MPI_Cartdim_get( comm_cart, &ndims ); if ( ndims != NUM_DIMS ) { printf( "Number of dims of duped comm (%d) should be %d\n", ndims, NUM_DIMS ); errors++; } /* Get the topology, does it agree with what we put in? */ for(i=0;i<NUM_DIMS;i++) { dims[i] = 0; periods[i] = 0; } MPI_Cart_get ( comm_cart, NUM_DIMS, dims, periods, coords ); /* Does the mapping from coords to rank work? */ MPI_Cart_rank ( comm_cart, coords, &new_rank ); if ( new_rank != rank ) { printf( "New rank of duped comm (%d) != old rank (%d)\n", new_rank, rank ); errors++; } /* Does the mapping from rank to coords work */ MPI_Cart_coords ( comm_cart, rank, NUM_DIMS, new_coords ); for (i=0;i<NUM_DIMS;i++) if ( coords[i] != new_coords[i] ) { printf( "Old coords[%d] of duped comm (%d) != new_coords (%d)\n", i, coords[i], new_coords[i] ); errors++; } /* Let's shift in each dimension and see how it works! */ /* Because it's late and I'm tired, I'm not making this */ /* automatically test itself. */ for (i=0;i<NUM_DIMS;i++) { int source, dest; MPI_Cart_shift(comm_cart, i, 1, &source, &dest); #ifdef VERBOSE printf ("[%d] Shifting %d in the %d dimension\n",rank,1,i); printf ("[%d] source = %d dest = %d\n",rank,source,dest); #endif } /* Subdivide */ remain_dims[0] = 0; for (i=1; i<NUM_DIMS; i++) remain_dims[i] = 1; MPI_Cart_sub ( comm_cart, remain_dims, &new_comm ); /* Determine the status of the new communicator */ MPI_Topo_test ( new_comm, &topo_status ); if (topo_status != MPI_CART) { printf( "topo_status of cartsub comm is not MPI_CART\n" ); errors++; } /* How many dims do we have? */ MPI_Cartdim_get( new_comm, &ndims ); if ( ndims != NUM_DIMS-1 ) { printf( "Number of dims of cartsub comm (%d) should be %d\n", ndims, NUM_DIMS-1 ); errors++; } /* Get the topology, does it agree with what we put in? */ for(i=0;i<NUM_DIMS-1;i++) { dims[i] = 0; periods[i] = 0; } MPI_Cart_get ( new_comm, ndims, dims, periods, coords ); /* Does the mapping from coords to rank work? 
*/ MPI_Comm_rank ( new_comm, &newnewrank ); MPI_Cart_rank ( new_comm, coords, &new_rank ); if ( new_rank != newnewrank ) { printf( "New rank of cartsub comm (%d) != old rank (%d)\n", new_rank, newnewrank ); errors++; } /* Does the mapping from rank to coords work */ MPI_Cart_coords ( new_comm, new_rank, NUM_DIMS -1, new_coords ); for (i=0;i<NUM_DIMS-1;i++) if ( coords[i] != new_coords[i] ) { printf( "Old coords[%d] of cartsub comm (%d) != new_coords (%d)\n", i, coords[i], new_coords[i] ); errors++; } /* We're at the end */ MPI_Comm_free( &new_comm ); MPI_Comm_free( &comm_temp ); MPI_Comm_free( &comm_cart ); Test_Waitforall( ); if (errors) printf( "[%d] done with %d ERRORS!\n", rank,errors ); MPI_Finalize(); return 0; }
int main(int argc, char *argv[]) { int ncpus, mypid, nrem, ierr; MPI_Status mpistatus; MPI_Comm comm = MPI_COMM_WORLD; // Variables needed for the Cartesian topology. int ROW = 0, COL = 1; int dims[2], periods[2], keep_dims[2]; int my2dpid, mycoords[2], srcoords[2], otherpid; MPI_Comm comm_2d, comm_row, comm_col; // Initialize MPI. MPI_Init(&argc, &argv); MPI_Comm_size(comm, &ncpus); MPI_Comm_rank(comm, &mypid); if ( argc < 3 ) { printf ("ERROR: %s requires Cartesian dimensions input\n", argv[0]); return -1; } // Set up a Cartesian virtual topology and get the rank and coordinates of the processes in the topology. dims[ROW] = atoi(argv[1]); // Row dimension of the topology dims[COL] = atoi(argv[2]); // Column dimension of the topology if (dims[ROW]*dims[COL] != ncpus){ printf("ERROR: Row dim and col dim not equal to ncpus\n"); return -1; } periods[ROW] = periods[COL] = 1; // Set the periods for wrap-around MPI_Cart_create(comm, 2, dims, periods, 1, &comm_2d); MPI_Comm_rank(comm_2d, &my2dpid); //Get my pid in the new 2D topology MPI_Cart_coords(comm_2d, my2dpid, 2, mycoords); // Get my coordinates /* Create the row-based sub-topology */ keep_dims[ROW] = 0; keep_dims[COL] = 1; MPI_Cart_sub(comm_2d, keep_dims, &comm_row); /* Create the column-based sub-topology */ keep_dims[ROW] = 1; keep_dims[COL] = 0; MPI_Cart_sub(comm_2d, keep_dims, &comm_col); // STEP 1: Have processor (0,0) read in the entire set of 2D images, divide up the images, and send corresponding images to processors in processor group: g_c_0 Do the same for the angles. if (mycoords[ROW] == 0 && mycoords[COL] == 0){ //I'm processor (0,0) FILE *fp, *fpa; char imagefname[80]="tf2d84.raw", anglesfname[80]="angles.dat"; fp = fopen(imagefname,"r"); fread(&nangs, sizeof(int), 1, fp); fread(&nx, sizeof(int), 1, fp); fread(&ny, sizeof(int), 1, fp); images = new float[nx*ny*nangs]; fread(images, sizeof(float), nx*ny*nangs, fp); fclose(fp); fpa = fopen(anglesfname,"r"); angles = new float[3*nangs]; for (int i = 0; i< 3*nangs; i++) fscanf(fpa, "%f",&angles[i]); fclose(fpa); printf("There are %d 2D images of size %d x %d\n", nangs, nx, ny); } // Broadcast variables nangs, nx, ny to all processors srcoords[ROW] = srcoords[COL] = 0; MPI_Cart_rank(comm_2d, srcoords, &otherpid); MPI_Bcast (&nangs, 1, MPI_INT, otherpid, comm_2d); MPI_Bcast (&nx, 1, MPI_INT, otherpid, comm_2d); MPI_Bcast (&ny, 1, MPI_INT, otherpid, comm_2d); // Send images and angles from Processor (0,0) to processors in group g_c_0 int *psize = new int[dims[ROW]]; int *nbase = new int[dims[ROW]]; nangsloc = setpart_gc1(comm_2d, nangs, psize, nbase); imagesloc = new float[psize[mycoords[ROW]]*nx*ny]; reprojloc = new float[psize[mycoords[ROW]]*nx*ny]; anglesloc = new float[psize[mycoords[ROW]]*3]; // printf("My coords are (%d,%d) and nangsloc = %d\n", mycoords[ROW], mycoords[COL], nangsloc); if (mycoords[COL] == 0 && mycoords[ROW] == 0) { //I'm Proc. 
(0,0) for(int ip = 0; ip < dims[ROW]; ++ip){ int begidx = nbase[ip]*nx*ny; if (ip !=0){ // Proc (0,0) sends images and angle data to other processors srcoords[COL] = 0; srcoords[ROW] = ip; MPI_Cart_rank(comm_2d, srcoords, &otherpid); MPI_Send(&images[begidx],psize[ip]*nx*ny, MPI_FLOAT, otherpid, otherpid, comm_2d); MPI_Send(&angles[nbase[ip]*3],psize[ip]*3, MPI_FLOAT, otherpid, otherpid, comm_2d); } else{ // ip = 0: Proc (0,0) needs to copy images and angles into its imagesloc and anglesloc for (int i = 0; i < psize[ip]*nx*ny; i++){ imagesloc[i] = images[begidx+i]; } for (int i = 0; i < psize[ip]*3; i++){ anglesloc[i] = angles[nbase[ip]*3 + i]; } //printf("Finished copying to Proc (0,0) local"); } } //End for loop } //End if if (mycoords[COL] == 0 && mycoords[ROW] != 0) { //I'm in g_c_0 and I'm not Processor (0,0) so I should receive data. MPI_Recv(imagesloc, psize[mycoords[ROW]]*nx*ny, MPI_FLOAT, 0, mypid, comm_2d, &mpistatus); MPI_Recv(anglesloc, psize[mycoords[ROW]]*3, MPI_FLOAT, 0, mypid, comm_2d, &mpistatus); } // Now have all the processors in group g_c_0 broadcast the images and angles along the row communicator srcoords[ROW] = 0; MPI_Cart_rank(comm_row, srcoords, &otherpid); MPI_Bcast(imagesloc, nangsloc*nx*ny, MPI_FLOAT, otherpid , comm_row); MPI_Bcast(anglesloc, nangsloc*3, MPI_FLOAT, otherpid , comm_row); // Now distribute the volume (in spherical format) among columns of processors and use nnz to determine the splitting. Note: ptrs and coord are on all processors int radius; int volsize[3], origin[3]; volsize[0] = nx; volsize[1] = nx; volsize[2] = nx; origin[0] = nx/2+1; origin[1] = nx/2+1; origin[2] = nx/2+1; radius = nx/2-1; ierr = getnnz( volsize, radius, origin, &nrays, &nnz); int * ptrs = new int[nrays+1]; int * cord = new int[3*nrays]; ierr = getcb2sph(volsize, radius, origin, nnz, ptrs, cord); int *nnzpart = new int[dims[COL]]; int *nnzbase = new int[dims[COL]+1]; nnzloc = setpart_gr1(comm_2d, nnz, nnzpart, nnzbase); int *ptrstart = new int[dims[COL]+1]; nraysloc = sphpart(comm_2d, nrays, ptrs, nnzbase, ptrstart); myptrstart = ptrstart[mycoords[COL]]; int nnzall[dims[COL]]; for (int i = 0; i<dims[COL]; i++) nnzall[i] = ptrs[ptrstart[i+1]] - ptrs[ptrstart[i]]; nnzloc = nnzall[mycoords[COL]]; // Print some stuff. printf("My coords are (%d,%d) and nangsloc = %d, nraysloc = %d, myptrstart = %d, nnzloc = %d\n", mycoords[ROW], mycoords[COL], nangsloc, nraysloc, myptrstart, nnzloc); float *bvol_loc = new float[nnzloc]; float *vol_sphloc = new float[nnzloc]; for (int i=0; i< nnzloc; i++) bvol_loc[i] = 0.0; // STEP 2: Have everyone perform the backprojection operation for their assigned images and portions of the volume. Then perform an Allreduce along the columns. float phi, theta, psi; float dm[8]; for (int i=0; i<nangsloc; i++){ phi = anglesloc[3*i+0]; theta = anglesloc[3*i+1]; psi = anglesloc[3*i+2]; dm[6] = 0; dm[7] = 0; make_proj_mat(phi, theta, psi, dm); ierr = bckpj3_Cart(volsize, nraysloc, nnzloc, dm, origin, radius, ptrs, cord, myptrstart, &imagesloc[nx*ny*i], bvol_loc); } // Now an all reduce along the columns MPI_Allreduce (bvol_loc, vol_sphloc, nnzloc, MPI_FLOAT, MPI_SUM, comm_col); // For testing purposes, we bring all the portions of the volume back together onto Proc (0,0). Note: we only need to deal with the first row of processors. 
if (mycoords[COL] != 0 && mycoords[ROW] == 0) { //Send data to Processor (0,0) srcoords[COL] = srcoords[ROW] = 0; MPI_Cart_rank(comm_2d, srcoords, &otherpid); MPI_Send(vol_sphloc, nnzloc, MPI_FLOAT, otherpid, otherpid, comm_2d); } float *onevol_sph = new float[nnz]; if (mycoords[COL] == 0 && mycoords[ROW] ==0){ //Copy data and recieve data float *vol_sph = new float[nnz]; for (int i=0; i<nnzloc; i++) vol_sph[i] = vol_sphloc[i]; for (int i=1; i<dims[COL]; i++){ srcoords[ROW] = 0; srcoords[COL] = i; MPI_Cart_rank(comm_2d, srcoords, &otherpid); MPI_Recv(&vol_sph[ptrs[ptrstart[i]]-1], nnzall[i], MPI_FLOAT, otherpid, mypid, comm_2d, &mpistatus); } //printf("Finished combining all volume parts\n"); //Now compute the back projection serially on one processor (0,0) for (int i=0; i< nnz; i++) onevol_sph[i] = 0.0; for (int i=0; i<nangs; i++){ phi = angles[3*i+0]; theta = angles[3*i+1]; psi = angles[3*i+2]; dm[6] = 0; dm[7] = 0; make_proj_mat(phi, theta, psi, dm); ierr = bckpj3(volsize, nrays, nnz, dm, origin, radius, ptrs, cord, &images[nx*ny*i], onevol_sph); } float err=0; for (int i=0; i< nnz; i++){ err = err+(onevol_sph[i]-vol_sph[i])*(onevol_sph[i]-vol_sph[i]); } err = sqrt(err); printf("Cumulative error for backprojection is %f\n", err); delete [] vol_sph; } // STEP 3: Now perform a forward projection operation for the assigned images and portions of the volume. Then perform an all_reduce along the rows. float * newimagesloc = new float[nangsloc*nx*ny]; for (int i=0; i<nangsloc*nx*ny; i++) newimagesloc[i] = 0.0; for (int i=0; i<nangsloc; i++){ phi = anglesloc[3*i+0]; theta = anglesloc[3*i+1]; psi = anglesloc[3*i+2]; dm[6] = 0; dm[7] = 0; make_proj_mat(phi, theta, psi, dm); ierr = fwdpj3_Cart(volsize, nraysloc, nnzloc, dm, origin, radius, ptrs, cord, myptrstart, vol_sphloc, &newimagesloc[nx*ny*i]); } // Now an all reduce along the rows MPI_Allreduce (newimagesloc, reprojloc, nangsloc*nx*ny, MPI_FLOAT, MPI_SUM, comm_row); delete [] newimagesloc; // For testing purposes, we bring all the 2D images together onto Proc (0,0). Note: we only need to deal with the first column of processors. if (mycoords[ROW] != 0 && mycoords[COL] == 0) { //Send data to Processor (0,0) srcoords[COL] = srcoords[ROW] = 0; MPI_Cart_rank(comm_2d, srcoords, &otherpid); MPI_Send(reprojloc, nangsloc*nx*ny, MPI_FLOAT, otherpid, otherpid, comm_2d); } if (mycoords[COL] == 0 && mycoords[ROW] ==0){ //Copy data and recieve data float *reproj = new float[nangs*nx*ny]; for (int i=0; i<nangsloc*nx*ny; i++) reproj[i] = reprojloc[i]; for (int i=1; i<dims[ROW]; i++){ srcoords[COL] = 0; srcoords[ROW] = i; MPI_Cart_rank(comm_2d, srcoords, &otherpid); MPI_Recv(&reproj[nbase[i]*nx*ny], psize[i]*nx*ny, MPI_FLOAT, otherpid, mypid, comm_2d, &mpistatus); } delete [] reprojloc; // Now compute the forward projection serially on one processor (0,0) float *allimages = new float[nangs*nx*ny]; for (int i=0; i< nangs*nx*ny; i++) allimages[i] = 0.0; for (int i=0; i<nangs; i++){ phi = angles[3*i+0]; theta = angles[3*i+1]; psi = angles[3*i+2]; dm[6] = 0; dm[7] = 0; make_proj_mat(phi, theta, psi, dm); ierr = fwdpj3(volsize, nrays, nnz, dm, origin, radius, ptrs, cord, onevol_sph, &allimages[nx*ny*i]); } // Now compute the overall error. 
int idx; float err=0, max =0; for (int i=0; i< nangs*nx*ny; i++){ //if (allimages[i]!=reproj[i] && i < 256) // printf("i= %d\n",i); err = err+(allimages[i]-reproj[i])*(allimages[i]-reproj[i]); if (fabs(allimages[i]-reproj[i]) > max){ max = fabs(allimages[i]-reproj[i]); idx = i; } } err = sqrt(err); printf("Cumulative error for forward projection is %f with max error of %f occuring at %d\n", err, max, idx); printf("Max error: compare %f and %f\n",allimages[idx], reproj[idx]); delete [] reproj; delete [] allimages; delete [] angles; delete [] images; } delete [] onevol_sph; delete [] vol_sphloc; delete [] bvol_loc; delete [] ptrs; delete [] cord; delete [] nnzpart; delete [] nnzbase; delete [] ptrstart; delete [] anglesloc; delete [] imagesloc; delete [] nbase; delete [] psize; MPI_Comm_free(&comm_2d); MPI_Comm_free(&comm_row); MPI_Comm_free(&comm_col); MPI_Finalize(); }
MatrixVectorMultiply2D(int n, double *a, double *x, double *y, MPI_Comm comm_2d) { int ROW=0, COL=1; /* Improve readability for indices */ int i, j, nlocal; double *py; /* Will store partial dot products for y */ /* Variables are as follows: - npes = # of processing elements - dims = size of matrix in x and y dimensions - keep_dims = used to filter out dimensions when creating sub-topologies */ int npes, dims[2], periods[2], keep_dims[2], keep_dims2[2]; /* Other variables are used to create sub-topologies and refer to individual processing elements */ int myrank, mycoords[2], mycolrank, myrowrank; int source_rank, dest_rank, root_rank, col_rank, coords[2], coord[1]; MPI_Status status; MPI_Comm comm_row, comm_col; /* Get information about the communicator */ MPI_Comm_size(comm_2d, &npes); MPI_Comm_rank(comm_2d, &myrank); /* Compute the size of the square grid. If a square grid is not used, changes the values here */ dims[ROW] = dims[COL] = sqrt(npes); nlocal = n/dims[ROW]; /* Allocate memory for the array that will hold the partial dot-products */ py = malloc(nlocal*sizeof(double)); MPI_Cart_coords(comm_2d, myrank, 2, mycoords); /* Get my coordinates */ /*****************************************************/ /* Create the row-based sub-topology */ keep_dims[ROW] = 0; keep_dims[COL] = 1; MPI_Cart_sub(comm_2d, keep_dims, &comm_row); MPI_Comm_rank(comm_row, &myrowrank); /* Create the column-based sub-topology */ keep_dims2[ROW] = 1; keep_dims2[COL] = 0; MPI_Cart_sub(comm_2d, keep_dims2, &comm_col); MPI_Comm_rank(comm_col, &mycolrank); /****************************************/ /* Redistribute the x vector. */ /* Step 1. The processors along the rightmost column send their data to the diagonal processors */ /* If I'm in the rightmost column but not the last row, send my block of the vector to the diagonal processor in my row */ /*****************************************************/ /* printf("STEP1: I am processor %d at position (%d, %d)", */ /* myrank, mycoords[ROW], mycoords[COL]); */ // determine if in right column if (mycoords[COL] == dims[COL]-1){ // if in right column, then check for !in_final_row if (mycoords[ROW] != dims[ROW] - 1){ // if not, then send the info to the element located in the // 2d cartesian topology at location (i,j) where i == j. // Also, i will be equal to mycoords[ROW] // get rank of dest coords[ROW] = coords[COL] = mycoords[ROW]; MPI_Cart_rank(comm_2d, coords, &dest_rank); // send to rank MPI_Send(x, nlocal, MPI_DOUBLE, dest_rank, 0, comm_2d); } } /*****************************************************/ /* If I'm on the diagonal but not in the last row, receive the block of the vector from the processor in the rightmost column of my row */ /* printf("STEP1b: I am processor %d in at position (%d, %d)", */ /* myrank, mycoords[ROW], mycoords[COL]); */ if (mycoords[ROW] == mycoords[COL] && mycoords[ROW] != dims[ROW]-1){ // determine source_rank coords[ROW] = mycoords[ROW]; coords[COL] = dims[COL]-1; MPI_Cart_rank(comm_2d, coords, &source_rank); // receive data from source MPI_Recv(x, nlocal, MPI_DOUBLE, source_rank, 0, comm_2d, &status); } /*****************************************************/ /* Step 2. 
Perform a column-wise broadcast with the diagonal process as the root */ /*******************************************************/ /* printf("STEP2: I am processor %d at position (%d, %d)", */ /* myrank, mycoords[ROW], mycoords[COL]); */ // if diagonal element, just broadcast if (mycoords[ROW] == mycoords[COL]){ MPI_Bcast(x, nlocal, MPI_DOUBLE, mycolrank, comm_col); } else { // get rank of current column's diagonal element coord[0] = mycoords[COL]; MPI_Cart_rank(comm_col, coord, &col_rank); // get column based rank MPI_Bcast(x, nlocal, MPI_DOUBLE, col_rank, comm_col); } /* Perform local matrix-vector multiply */ for (i=0; i<nlocal; i++) { py[i] = 0.0; for (j=0; j<nlocal; j++) py[i] += a[i*nlocal+j]*x[j]; } /*****************************************************/ /* Step 3. Perform the sum-reduction along the rows to add up the partial dot-products and leave the result in the rightmost column */ /*****************************************************/ /* printf("STEP3: I am processor %d in the right column at position (%d, %d)", */ /* myrank, mycoords[ROW], mycoords[COL]); */ // check if this is the results column if (mycoords[COL] == dims[COL]-1){ // receive results from reduce MPI_Reduce(py, y, nlocal, MPI_DOUBLE, MPI_SUM, myrowrank, comm_row); } else{ // pass results to the right // determine rank of right-most column processor coord[ROW] = dims[ROW]-1; MPI_Cart_rank(comm_row, coord, &root_rank); MPI_Reduce(py, y, nlocal, MPI_DOUBLE, MPI_SUM, root_rank, comm_row); } /* free local communicators */ MPI_Comm_free(&comm_row); /* Free up communicator */ MPI_Comm_free(&comm_col); /* Free up communicator */ free(py); }
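MatrixVectorMultiply2D above expects a communicator that already carries a square 2D Cartesian topology, with the matrix and vectors distributed in nlocal-sized blocks. A hedged driver sketch; the matrix size n, the periodic grid, and the allocation pattern are illustrative assumptions, and it should be run with a perfect-square number of processes:

#include <mpi.h>
#include <math.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    int npes, n = 512;
    int dims[2] = {0, 0}, periods[2] = {1, 1};
    MPI_Comm comm_2d;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &npes);
    dims[0] = dims[1] = (int) sqrt((double) npes);   /* square grid, as the routine assumes */
    MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 1, &comm_2d);

    int nlocal = n / dims[0];                        /* assumes n is divisible by the grid side */
    double *a = malloc(nlocal * nlocal * sizeof(double));  /* local block of the matrix */
    double *x = malloc(nlocal * sizeof(double));           /* local block of the input vector */
    double *y = malloc(nlocal * sizeof(double));           /* local block of the result */
    /* ... fill a and x with the local data ... */
    MatrixVectorMultiply2D(n, a, x, y, comm_2d);

    free(a); free(x); free(y);
    MPI_Comm_free(&comm_2d);
    MPI_Finalize();
    return 0;
}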
int main (int argc, char **argv) { int nprocs = -1; int rank = -1; int i, j; int *granks; char processor_name[128]; int namelen = 128; int buf[buf_size]; MPI_Status status; MPI_Comm temp; MPI_Comm intercomm = MPI_COMM_NULL; MPI_Comm dcomms[DCOMM_CALL_COUNT]; MPI_Group world_group, dgroup; int intersize, dnprocs[DCOMM_CALL_COUNT], drank[DCOMM_CALL_COUNT]; int dims[TWOD], periods[TWOD], remain_dims[TWOD]; int graph_index[] = { 2, 3, 4, 6 }; int graph_edges[] = { 1, 3, 0, 3, 0, 2 }; /* init */ MPI_Init (&argc, &argv); MPI_Comm_size (MPI_COMM_WORLD, &nprocs); MPI_Comm_rank (MPI_COMM_WORLD, &rank); MPI_Get_processor_name (processor_name, &namelen); printf ("(%d) is alive on %s\n", rank, processor_name); fflush (stdout); MPI_Barrier (MPI_COMM_WORLD); /* probably want number to be higher... */ if (nprocs < 4) { printf ("not enough tasks\n"); } else { if (DCOMM_CALL_COUNT > 0) { #ifdef RUN_COMM_DUP /* create all of the derived communicators... */ /* simplest is created by MPI_Comm_dup... */ MPI_Comm_dup (MPI_COMM_WORLD, &dcomms[0]); #else dcomms[0] = MPI_COMM_NULL; #endif } if (DCOMM_CALL_COUNT > 1) { #ifdef RUN_COMM_CREATE /* use subset of MPI_COMM_WORLD group for MPI_Comm_create... */ MPI_Comm_group (MPI_COMM_WORLD, &world_group); granks = (int *) malloc (sizeof(int) * (nprocs/2)); for (i = 0; i < nprocs/2; i++) granks [i] = 2 * i; MPI_Group_incl (world_group, nprocs/2, granks, &dgroup); MPI_Comm_create (MPI_COMM_WORLD, dgroup, &dcomms[1]); MPI_Group_free (&world_group); MPI_Group_free (&dgroup); free (granks); #else dcomms[1] = MPI_COMM_NULL; #endif } if (DCOMM_CALL_COUNT > 2) { #ifdef RUN_COMM_SPLIT /* split into thirds with inverted ranks... */ MPI_Comm_split (MPI_COMM_WORLD, rank % 3, nprocs - rank, &dcomms[2]); #else dcomms[2] = MPI_COMM_NULL; #endif } #ifdef RUN_INTERCOMM_CREATE if ((DCOMM_CALL_COUNT < 2) || (dcomms[2] == MPI_COMM_NULL)) { MPI_Comm_split (MPI_COMM_WORLD, rank % 3, nprocs - rank, &temp); } else { temp = dcomms[2]; } if (rank % 3) { MPI_Intercomm_create (temp, 0, MPI_COMM_WORLD, (((nprocs % 3) == 2) && ((rank % 3) == 2)) ? nprocs - 1 : nprocs - (rank % 3) - (nprocs % 3), INTERCOMM_CREATE_TAG, &intercomm); } if ((DCOMM_CALL_COUNT < 2) || (dcomms[2] == MPI_COMM_NULL)) { MPI_Comm_free (&temp); } #endif if (DCOMM_CALL_COUNT > 3) { #ifdef RUN_CART_CREATE /* create a 2 X nprocs/2 torus topology, allow reordering */ dims[0] = 2; dims[1] = nprocs/2; periods[0] = periods[1] = 1; MPI_Cart_create (MPI_COMM_WORLD, TWOD, dims, periods, 1, &dcomms[3]); #else dcomms[3] = MPI_COMM_NULL; #endif } if (DCOMM_CALL_COUNT > 4) { #ifdef RUN_GRAPH_CREATE /* create the graph on p.268 MPI: The Complete Reference... */ MPI_Graph_create (MPI_COMM_WORLD, GRAPH_SZ, graph_index, graph_edges, 1, &dcomms[4]); #else dcomms[4] = MPI_COMM_NULL; #endif } if (DCOMM_CALL_COUNT > 5) { #ifdef RUN_CART_SUB #ifndef RUN_CART_CREATE /* need to make cartesian communicator temporarily... */ /* create a 2 X nprocs/2 torus topology, allow reordering */ dims[0] = 2; dims[1] = nprocs/2; periods[0] = periods[1] = 1; MPI_Cart_create (MPI_COMM_WORLD, TWOD, dims, periods, 1, &dcomms[3]); #endif if (dcomms[3] != MPI_COMM_NULL) { /* create 2 1 X nprocs/2 topologies... */ remain_dims[0] = 0; remain_dims[1] = 1; MPI_Cart_sub (dcomms[3], remain_dims, &dcomms[5]); #ifndef RUN_CART_CREATE /* free up temporarily created cartesian communicator... 
*/ MPI_Comm_free (&dcomms[3]); #endif } else { dcomms[5] = MPI_COMM_NULL; } #else dcomms[5] = MPI_COMM_NULL; #endif } if (DCOMM_CALL_COUNT > 6) { #ifdef RUN_INTERCOMM_MERGE #ifndef RUN_INTERCOMM_CREATE #ifndef RUN_COMM_SPLIT /* need to make split communicator temporarily... */ /* split into thirds with inverted ranks... */ MPI_Comm_split (MPI_COMM_WORLD, rank % 3, nprocs - rank, &dcomms[2]); #endif #endif /* create an intercommunicator and merge it... */ if (rank % 3) { #ifndef RUN_INTERCOMM_CREATE MPI_Intercomm_create (dcomms[2], 0, MPI_COMM_WORLD, (((nprocs % 3) == 2) && ((rank % 3) == 2)) ? nprocs - 1 : nprocs - (rank % 3) - (nprocs % 3), INTERCOMM_CREATE_TAG, &intercomm); #endif MPI_Intercomm_merge (intercomm, ((rank % 3) == 1), &dcomms[6]); #ifndef RUN_INTERCOMM_CREATE /* we are done with intercomm... */ MPI_Comm_free (&intercomm); #endif } else { dcomms[6] = MPI_COMM_NULL; } #ifndef RUN_INTERCOMM_CREATE #ifndef RUN_COMM_SPLIT if (dcomms[2] != MPI_COMM_NULL) /* free up temporarily created split communicator... */ MPI_Comm_free (&dcomms[2]); #endif #endif #else dcomms[6] = MPI_COMM_NULL; #endif } /* get all of the sizes and ranks... */ for (i = 0; i < DCOMM_CALL_COUNT; i++) { if (dcomms[i] != MPI_COMM_NULL) { MPI_Comm_size (dcomms[i], &dnprocs[i]); MPI_Comm_rank (dcomms[i], &drank[i]); } else { dnprocs[i] = 0; drank[i] = -1; } } #ifdef RUN_INTERCOMM_CREATE /* get the intercomm remote size... */ if (rank % 3) { MPI_Comm_remote_size (intercomm, &intersize); } #endif /* do some point to point on all of the dcomms... */ for (i = 0; i < DCOMM_CALL_COUNT; i++) { if (dnprocs[i] > 1) { if (drank[i] == 0) { for (j = 1; j < dnprocs[i]; j++) { MPI_Recv (buf, buf_size, MPI_INT, j, 0, dcomms[i], &status); } } else { memset (buf, 1, buf_size*sizeof(int)); MPI_Send (buf, buf_size, MPI_INT, 0, 0, dcomms[i]); } } } #ifdef RUN_INTERCOMM_CREATE /* do some point to point on the intercomm... */ if ((rank % 3) == 1) { for (j = 0; j < intersize; j++) { MPI_Recv (buf, buf_size, MPI_INT, j, 0, intercomm, &status); } } else if ((rank % 3) == 2) { for (j = 0; j < intersize; j++) { memset (buf, 1, buf_size*sizeof(int)); MPI_Send (buf, buf_size, MPI_INT, j, 0, intercomm); } } #endif /* do a bcast on all of the dcomms... */ for (i = 0; i < DCOMM_CALL_COUNT; i++) { /* IBM's implementation gets error with comm over MPI_COMM_NULL... */ if (dnprocs[i] > 0) MPI_Bcast (buf, buf_size, MPI_INT, 0, dcomms[i]); } /* use any source receives... */ for (i = 0; i < DCOMM_CALL_COUNT; i++) { if (dnprocs[i] > 1) { if (drank[i] == 0) { for (j = 1; j < dnprocs[i]; j++) { MPI_Recv (buf, buf_size, MPI_INT, MPI_ANY_SOURCE, 0, dcomms[i], &status); } } else { memset (buf, 1, buf_size*sizeof(int)); MPI_Send (buf, buf_size, MPI_INT, 0, 0, dcomms[i]); } } } #ifdef RUN_INTERCOMM_CREATE /* do any source receives on the intercomm... */ if ((rank % 3) == 1) { for (j = 0; j < intersize; j++) { MPI_Recv (buf, buf_size, MPI_INT, MPI_ANY_SOURCE, 0, intercomm, &status); } } else if ((rank % 3) == 2) { for (j = 0; j < intersize; j++) { memset (buf, 1, buf_size*sizeof(int)); MPI_Send (buf, buf_size, MPI_INT, j, 0, intercomm); } } #endif /* do a barrier on all of the dcomms... */ for (i = 0; i < DCOMM_CALL_COUNT; i++) { /* IBM's implementation gets with communication over MPI_COMM_NULL... */ if (dnprocs[i] > 0) MPI_Barrier (dcomms[i]); } /* free all of the derived communicators... */ for (i = 0; i < DCOMM_CALL_COUNT; i++) { /* freeing MPI_COMM_NULL is explicitly defined as erroneous... 
*/ if (dnprocs[i] > 0) MPI_Comm_free (&dcomms[i]); } #ifdef RUN_INTERCOMM_CREATE if (rank % 3) /* we are done with intercomm... */ MPI_Comm_free (&intercomm); #endif } MPI_Barrier (MPI_COMM_WORLD); MPI_Finalize (); printf ("(%d) Finished normally\n", rank); }