/*
 * Recursively bisect 'box' among the partitions [ip,up).
 *
 *   ip, up : half-open range of partition indices to fill in 'p_box'.
 *   axis   : axis (0,1,2) along which this level of the recursion cuts.
 *   box    : box to split, given as box[axis][0] (begin) / box[axis][1] (end).
 *   p_box  : output; p_box[p] receives the box of partition 'p' for every
 *            p in [ip,up).  Entries outside that range are not touched.
 *
 * The partition range is halved (lower half rounded down) and the extent
 * of 'box' along 'axis' is divided in proportion to the two halves.  Each
 * half then recurses on the axis ( axis + 2 ) % 3.
 *
 * An empty range ( up <= ip ) is a no-op.
 */
static void box_partition( int ip , int up , int axis ,
                           const int box[3][2] ,
                           int p_box[][3][2] )
{
  const int np = up - ip ;
  int d ;

  if ( np <= 0 ) {
    /* Nothing to assign.  Returning here also avoids evaluating
     * np_upp / np as 0.0 / 0.0 and converting the NaN to int below.
     */
    return ;
  }

  if ( 1 == np ) {
    /* Leaf: this partition owns the whole remaining box. */
    for ( d = 0 ; d < 3 ; ++d ) {
      p_box[ip][d][0] = box[d][0] ;
      p_box[ip][d][1] = box[d][1] ;
    }
    return ;
  }

  {
    const int n         = box[ axis ][1] - box[ axis ][0] ;
    const int np_low    = np / 2 ;  /* Rounded down */
    const int np_upp    = np - np_low ;

    /* Upper share of the extent, truncated; the lower part gets the rest. */
    const int n_upp     = (int) (((double) n) * ( ((double)np_upp) / ((double)np)));
    const int n_low     = n - n_upp ;
    const int next_axis = ( axis + 2 ) % 3 ;

    int dbox[3][2] ;

    for ( d = 0 ; d < 3 ; ++d ) {
      dbox[d][0] = box[d][0] ;
      dbox[d][1] = box[d][1] ;
    }

    /* P = [ip,ip+np_low) : lower slab along 'axis' */
    dbox[ axis ][1] = box[ axis ][0] + n_low ;

    box_partition( ip, ip + np_low, next_axis,
                   (const int (*)[2]) dbox, p_box );

    /* P = [ip+np_low,up) : upper slab along 'axis'.
     * Only the 'axis' row of dbox was modified above, so the other
     * two rows still hold the values copied from 'box'.
     */
    dbox[ axis ][0] = box[ axis ][0] + n_low ;
    dbox[ axis ][1] = dbox[ axis ][0] + n_upp ;

    box_partition( ip + np_low, up, next_axis,
                   (const int (*)[2]) dbox, p_box );
  }
}
void box_partition_rcb( const BoxType & root_box , std::vector<BoxType> & part_boxes ) { const BoxBoundsLinear use_boxes ; const size_t part_count = part_boxes.size(); box_partition( 0 , part_count , root_box , & part_boxes[0] ); // Verify partitioning size_t total_cell = 0 ; for ( size_t i = 0 ; i < part_count ; ++i ) { total_cell += count( part_boxes[i] ); BoxType box_interior , box_use ; use_boxes.apply( root_box , part_boxes[i] , box_interior , box_use ); if ( count( box_use ) < count( part_boxes[i] ) || count( part_boxes[i] ) < count( box_interior ) || part_boxes[i] != intersect( part_boxes[i] , box_use ) || box_interior != intersect( part_boxes[i] , box_interior )) { std::ostringstream msg ; msg << "box_partition_rcb ERROR : " << "part_boxes[" << i << "] = " << part_boxes[i] << " use " << box_use << " interior " << box_interior << std::endl << " part ^ use " << intersect( part_boxes[i] , box_use ) << " part ^ interior " << intersect( part_boxes[i] , box_interior ); throw std::runtime_error( msg.str() ); } for ( size_t j = i + 1 ; j < part_count ; ++j ) { const BoxType tmp = intersect( part_boxes[i] , part_boxes[j] ); if ( count( tmp ) ) { throw std::runtime_error( std::string("box partition intersection") ); } } } if ( total_cell != count( root_box ) ) { throw std::runtime_error( std::string("box partition count") ); } }
/*
 * Partition 'box' into 'np' parts with box_partition (starting axis 2)
 * and check the result:
 *   - every part lies inside 'box'       (box_contain),
 *   - every pair of parts is disjoint    (box_disjoint),
 *   - the part cell counts sum to the cell count of 'box'.
 *
 * On success prints one summary line (NP, total, avg, min, max) to stdout.
 * On any failure prints a diagnostic to stdout and calls abort().
 */
static void test_box( const int box[3][2] , const int np )
{
  /* Cell count from the extents, so boxes whose lower bounds are not
   * zero are counted correctly as well.
   */
  const int ncell_box = ( box[0][1] - box[0][0] ) *
                        ( box[1][1] - box[1][0] ) *
                        ( box[2][1] - box[2][0] );
  int ncell_total = 0 ;
  int ncell_min = ncell_box ;
  int ncell_max = 0 ;
  int (*pbox)[3][2] ;
  int i , j ;

  if ( np <= 0 ) {
    /* Nothing to partition; also guards the average ( ncell_box / np ). */
    fprintf(stdout,"  INVALID NP = %d\n",np);
    abort();
  }

  pbox = (int (*)[3][2]) malloc( sizeof(int) * np * 3 * 2 );

  if ( NULL == pbox ) {
    fprintf(stdout,"  ALLOCATION FAILED NP = %d\n",np);
    abort();
  }

  box_partition( 0 , np , 2 , box , pbox );

  for ( i = 0 ; i < np ; ++i ) {
    const int ncell = ( pbox[i][0][1] - pbox[i][0][0] ) *
                      ( pbox[i][1][1] - pbox[i][1][0] ) *
                      ( pbox[i][2][1] - pbox[i][2][0] );

    if ( ! box_contain( box , (const int (*)[2]) pbox[i] ) ) {
      fprintf(stdout,"  OUT OF BOUNDS pbox[%d/%d] = ",i,np);
      box_print(stdout,(const int (*)[2]) pbox[i]);
      fprintf(stdout,"\n");
      abort();
    }

    /* Compare against later parts only; earlier pairs were already checked. */
    for ( j = i + 1 ; j < np ; ++j ) {
      if ( ! box_disjoint( (const int (*)[2]) pbox[i] ,
                           (const int (*)[2]) pbox[j] ) ) {
        fprintf(stdout,"  NOT DISJOINT pbox[%d/%d] = ",i,np);
        box_print(stdout, (const int (*)[2]) pbox[i]);
        fprintf(stdout,"\n");
        fprintf(stdout,"               pbox[%d/%d] = ",j,np);
        box_print(stdout, (const int (*)[2]) pbox[j]);
        fprintf(stdout,"\n");
        abort();
      }
    }

    ncell_total += ncell ;

    if ( ncell_max < ncell ) { ncell_max = ncell ; }
    if ( ncell < ncell_min ) { ncell_min = ncell ; }
  }

  if ( ncell_total != ncell_box ) {
    fprintf(stdout,"  WRONG CELL COUNT NP = %d\n",np);
    abort();
  }

  fprintf(stdout,"NP = %d, total = %d, avg = %d, min = %d, max = %d\n",
          np,ncell_box,ncell_box/np,ncell_min,ncell_max);

  free( pbox );
}
/*
 * Allocate an array of 'np' boxes, partition 'root_box' into it with
 * box_partition (starting axis 2), then build the communication maps of
 * partition 'my_p' with box_partition_maps using 'ghost' as ghost width.
 *
 * The box array is obtained from malloc and returned through '*pbox';
 * this function does not release it.  The four map pointers are passed
 * through unchanged to box_partition_maps.
 */
void box_partition_rcb( const int np ,
                        const int my_p ,
                        const int root_box[][2] ,
                        const int ghost ,
                        int (**pbox)[3][2] ,
                        int ** map_local_id ,
                        int ** map_recv_pc ,
                        int ** map_send_pc ,
                        int ** map_send_id )
{
  int (* const boxes)[3][2] =
    (int (*)[3][2]) malloc( sizeof(int) * np * 3 * 2 );

  /* Publish the array first, then fill it in. */
  *pbox = boxes ;

  box_partition( 0 , np , 2 , root_box , boxes );

  box_partition_maps( np , my_p ,
                      (const int (*)[3][2]) boxes ,
                      ghost ,
                      map_local_id ,
                      map_recv_pc ,
                      map_send_pc ,
                      map_send_id );
}
int main(int argc, char** argv) { miniFE::Parameters params; miniFE::get_parameters(argc, argv, params); int numprocs = 1, myproc = 0; miniFE::initialize_mpi(argc, argv, numprocs, myproc); Kokkos::initialize(argc,argv); if(myproc==0) { std::cout << "MiniFE Mini-App, Kokkos Peer Implementation" << std::endl; } miniFE::timer_type start_time = miniFE::mytimer(); #ifdef MINIFE_DEBUG outstream(numprocs, myproc); #endif //make sure each processor has the same parameters: miniFE::broadcast_parameters(params); Box global_box = { 0, params.nx, 0, params.ny, 0, params.nz }; std::vector<Box> local_boxes(numprocs); box_partition(0, numprocs, 2, global_box, &local_boxes[0]); Box& my_box = local_boxes[myproc]; MINIFE_GLOBAL_ORDINAL num_my_ids = miniFE::get_num_ids<MINIFE_GLOBAL_ORDINAL>(my_box); MINIFE_GLOBAL_ORDINAL min_ids = num_my_ids; #ifdef HAVE_MPI MPI_Datatype mpi_dtype = miniFE::TypeTraits<MINIFE_GLOBAL_ORDINAL>::mpi_type(); MPI_Allreduce(&num_my_ids, &min_ids, 1, mpi_dtype, MPI_MIN, MPI_COMM_WORLD); #endif if (min_ids == 0) { std::cout<<"One or more processors have 0 equations. Not currently supported. Exiting."<<std::endl; miniFE::finalize_mpi(); return 1; } std::ostringstream osstr; osstr << "miniFE." << params.nx << "x" << params.ny << "x" << params.nz; #ifdef HAVE_MPI osstr << ".P"<<numprocs; #endif osstr << "."; if (params.name != "") osstr << params.name << "."; YAML_Doc doc("miniFE", MINIFE_VERSION, ".", osstr.str()); if (myproc == 0) { add_params_to_yaml(doc, params); add_configuration_to_yaml(doc, numprocs); add_timestring_to_yaml(doc); } //Most of the program is performed in the 'driver' function, which is //templated on < Scalar, LocalOrdinal, GlobalOrdinal >. //To run miniFE with float instead of double, or 'long long' instead of int, //etc., change these template-parameters by changing the macro definitions in //the makefile or on the make command-line. 
int return_code = miniFE::driver< MINIFE_SCALAR, MINIFE_LOCAL_ORDINAL, MINIFE_GLOBAL_ORDINAL>(global_box, my_box, params, doc); miniFE::timer_type total_time = miniFE::mytimer() - start_time; if (myproc == 0) { doc.add("Total Program Time",total_time); doc.generateYAML(); } Kokkos::finalize(); miniFE::finalize_mpi(); return return_code; }
void BoxFixture::generate_boxes( const BOX root_box, BOX local_box ) { const unsigned p_rank = m_bulk_data.parallel_rank(); const unsigned p_size = m_bulk_data.parallel_size(); const unsigned ngx = root_box[0][1] - root_box[0][0] ; const unsigned ngy = root_box[1][1] - root_box[1][0] ; BOX * const p_box = new BOX[ p_size ]; box_partition( 0 , p_size , 2 , root_box , & p_box[0] ); local_box[0][0] = p_box[ p_rank ][0][0] ; local_box[0][1] = p_box[ p_rank ][0][1] ; local_box[1][0] = p_box[ p_rank ][1][0] ; local_box[1][1] = p_box[ p_rank ][1][1] ; local_box[2][0] = p_box[ p_rank ][2][0] ; local_box[2][1] = p_box[ p_rank ][2][1] ; // Create elements: std::vector<unsigned> local_count ; const stk_classic::mesh::PartVector no_parts ; for ( int k = local_box[2][0] ; k < local_box[2][1] ; ++k ) { for ( int j = local_box[1][0] ; j < local_box[1][1] ; ++j ) { for ( int i = local_box[0][0] ; i < local_box[0][1] ; ++i ) { const EntityId n0= 1 + (i+0) + (j+0) * (ngx+1) + (k+0) * (ngx+1) * (ngy+1); const EntityId n1= 1 + (i+1) + (j+0) * (ngx+1) + (k+0) * (ngx+1) * (ngy+1); const EntityId n2= 1 + (i+1) + (j+1) * (ngx+1) + (k+0) * (ngx+1) * (ngy+1); const EntityId n3= 1 + (i+0) + (j+1) * (ngx+1) + (k+0) * (ngx+1) * (ngy+1); const EntityId n4= 1 + (i+0) + (j+0) * (ngx+1) + (k+1) * (ngx+1) * (ngy+1); const EntityId n5= 1 + (i+1) + (j+0) * (ngx+1) + (k+1) * (ngx+1) * (ngy+1); const EntityId n6= 1 + (i+1) + (j+1) * (ngx+1) + (k+1) * (ngx+1) * (ngy+1); const EntityId n7= 1 + (i+0) + (j+1) * (ngx+1) + (k+1) * (ngx+1) * (ngy+1); const EntityId elem_id = 1 + i + j * ngx + k * ngx * ngy; Entity & node0 = m_bulk_data.declare_entity( 0 , n0 , no_parts ); Entity & node1 = m_bulk_data.declare_entity( 0 , n1 , no_parts ); Entity & node2 = m_bulk_data.declare_entity( 0 , n2 , no_parts ); Entity & node3 = m_bulk_data.declare_entity( 0 , n3 , no_parts ); Entity & node4 = m_bulk_data.declare_entity( 0 , n4 , no_parts ); Entity & node5 = m_bulk_data.declare_entity( 0 , n5 , no_parts ); Entity & 
node6 = m_bulk_data.declare_entity( 0 , n6 , no_parts ); Entity & node7 = m_bulk_data.declare_entity( 0 , n7 , no_parts ); Entity & elem = m_bulk_data.declare_entity( 3 , elem_id , no_parts ); m_bulk_data.declare_relation( elem , node0 , 0 ); m_bulk_data.declare_relation( elem , node1 , 1 ); m_bulk_data.declare_relation( elem , node2 , 2 ); m_bulk_data.declare_relation( elem , node3 , 3 ); m_bulk_data.declare_relation( elem , node4 , 4 ); m_bulk_data.declare_relation( elem , node5 , 5 ); m_bulk_data.declare_relation( elem , node6 , 6 ); m_bulk_data.declare_relation( elem , node7 , 7 ); } } } delete[] p_box ; }
/*
 * Consistency test of the send / receive maps built by box_partition_maps
 * for a partitioning of 'root_box' into 'np' boxes, with ghost width 1.
 *
 * For every rank a value array is filled so that each local entry holds
 * the global index ( x + y * nx_global + z * nx_global * ny_global ) of
 * its cell, or -1 when the cell lies outside 'root_box'.  Then, for every
 * ordered pair of distinct ranks, the number of entries one rank expects
 * to receive must equal the number the other sends, and the values at
 * matching positions must agree.
 *
 * On any mismatch a diagnostic is written to stderr and abort() is called.
 */
static void test_maps( const int root_box[][2] , const int np )
{
  const int ghost = 1 ;
  const int nx_global = root_box[0][1] - root_box[0][0] ;
  const int ny_global = root_box[1][1] - root_box[1][0] ;

  int (*pbox)[3][2] ;
  int **local_values ;
  int **map_local_id ;
  int **map_recv_pc ;
  int **map_send_pc ;
  int **map_send_id ;
  int rank , shift ;

  pbox = (int (*)[3][2]) malloc( sizeof(int) * np * 3 * 2 );

  box_partition( 0 , np , 2 , root_box , pbox );

  local_values = (int **) malloc( sizeof(int*) * np );
  map_local_id = (int **) malloc( sizeof(int*) * np );
  map_recv_pc  = (int **) malloc( sizeof(int*) * np );
  map_send_pc  = (int **) malloc( sizeof(int*) * np );
  map_send_id  = (int **) malloc( sizeof(int*) * np );

  /* Set each local value to the global equation number */

  for ( rank = 0 ; rank < np ; ++rank ) {
    const int (*mybox)[2] = (const int (*)[2]) pbox[rank] ;
    const int nx = mybox[0][1] - mybox[0][0] ;
    const int ny = mybox[1][1] - mybox[1][0] ;
    const int nz = mybox[2][1] - mybox[2][0] ;
    int ix , iy , iz ;

    /* Generate the partition maps for this rank */
    box_partition_maps( np , rank , (const int (*)[3][2]) pbox , ghost ,
                        & map_local_id[rank] ,
                        & map_recv_pc[rank] ,
                        & map_send_pc[rank] ,
                        & map_send_id[rank] );

    /* One value per entry counted by the last receive prefix entry. */
    local_values[rank] =
      (int *) malloc( sizeof(int) * map_recv_pc[rank][np] );

    /* Sweep the local box including its ghost layer. */
    for ( iz = -ghost ; iz < nz + ghost ; ++iz ) {
    for ( iy = -ghost ; iy < ny + ghost ; ++iy ) {
    for ( ix = -ghost ; ix < nx + ghost ; ++ix ) {
      const int local_id =
        box_map_local(mybox,ghost,map_local_id[rank],ix,iy,iz);

      /* Negative ids are skipped. */
      if ( local_id < 0 ) { continue ; }

      {
        const int gx = ix + mybox[0][0] ;
        const int gy = iy + mybox[1][0] ;
        const int gz = iz + mybox[2][0] ;

        const int inside_root =
          root_box[0][0] <= gx && gx < root_box[0][1] &&
          root_box[1][0] <= gy && gy < root_box[1][1] &&
          root_box[2][0] <= gz && gz < root_box[2][1] ;

        local_values[rank][ local_id ] = inside_root
          ? gx + gy * nx_global + gz * nx_global * ny_global
          : -1 ;
      }
    }
    }
    }
  }

  /* Pair-wise compare the local values */
  /* rank      == receiving processor rank */
  /* sender    == sending   processor rank */
  /* shift     == receiving processor data entry for message from 'sender' */
  /* send_slot == sending   processor data entry for message to 'rank' */

  for ( rank = 0 ; rank < np ; ++rank ) {
    for ( shift = 1 ; shift < np ; ++shift ) {
      const int sender    = ( rank + shift ) % np ;
      const int send_slot = ( rank + np - sender ) % np ;
      const int recv_begin = map_recv_pc[rank][shift] ;
      const int send_begin = map_send_pc[sender][send_slot] ;
      const int nrecv = map_recv_pc[rank][shift+1]       - recv_begin ;
      const int nsend = map_send_pc[sender][send_slot+1] - send_begin ;
      int k ;

      if ( nrecv != nsend ) {
        fprintf(stderr,"P%d recv %d from P%d\n",rank,nrecv,sender);
        fprintf(stderr,"P%d send %d to   P%d\n",sender,nsend,rank);
        abort();
      }

      for ( k = 0 ; k < nrecv ; ++k ) {
        const int got  = local_values[rank][ recv_begin + k ] ;
        const int sent =
          local_values[sender][ map_send_id[sender][ send_begin + k ] ] ;

        if ( got != sent ) {
          fprintf(stderr,"P%d recv[%d] = %d , from P%d\n",rank,k,got,sender);
          fprintf(stderr,"P%d send[%d] = %d , to   P%d\n",sender,k,sent,rank);
          abort();
        }
      }
    }
  }

  for ( rank = 0 ; rank < np ; ++rank ) {
    free( map_local_id[rank] );
    free( map_recv_pc[rank] );
    free( map_send_pc[rank] );
    free( map_send_id[rank] );
    free( local_values[rank] );
  }
  free( map_send_id );
  free( map_send_pc );
  free( map_recv_pc );
  free( map_local_id );
  free( local_values );
  free( pbox );
}
/*
 * Partition 'root_box' into 'np' boxes stored in the caller-provided
 * array 'pbox'.
 *
 * Forwards to box_partition( 0 , np , 2 , root_box , pbox ); no memory
 * is allocated here.
 * NOTE(review): presumably box_partition fills pbox[0] .. pbox[np-1], so
 * 'pbox' needs room for at least 'np' boxes -- confirm against
 * box_partition.
 */
void box_partition_rcb( const int np , const int root_box[3][2] , int pbox[][3][2] ) { box_partition( 0 , np , 2 , root_box , pbox ); }
int main(int argc, char** argv) { miniFE::Parameters params; miniFE::get_parameters(argc, argv, params); int numprocs = 1, myproc = 0; miniFE::initialize_mpi(argc, argv, numprocs, myproc); miniFE::timer_type start_time = miniFE::mytimer(); #ifdef MINIFE_DEBUG outstream(numprocs, myproc); #endif if(myproc==0) { std::cout << "MiniFE Mini-App, OpenMP Peer Implementation" << std::endl; std::cout << "Creating OpenMP Thread Pool..." << std::endl; } int value = 0; const int thread_count = omp_get_max_threads(); #pragma omp parallel for reduction(+:value) for(int i = 0; i < thread_count; ++i) { value += 1; } double global_threadcount; double local_threadcount = value; #ifdef HAVE_MPI MPI_Allreduce(&local_threadcount,&global_threadcount,1,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD); #else global_threadcount = local_threadcount; #endif if(myproc==0) { std::cout << "Counted: " << global_threadcount << " threads." << std::endl; std::cout << "Running MiniFE Mini-App..." << std::endl; } //make sure each processor has the same parameters: miniFE::broadcast_parameters(params); Box global_box = { 0, params.nx, 0, params.ny, 0, params.nz }; std::vector<Box> local_boxes(numprocs); box_partition(0, numprocs, 2, global_box, &local_boxes[0]); Box& my_box = local_boxes[myproc]; MINIFE_GLOBAL_ORDINAL num_my_ids = miniFE::get_num_ids<MINIFE_GLOBAL_ORDINAL>(my_box); MINIFE_GLOBAL_ORDINAL min_ids = num_my_ids; #ifdef HAVE_MPI MPI_Datatype mpi_dtype = miniFE::TypeTraits<MINIFE_GLOBAL_ORDINAL>::mpi_type(); MPI_Allreduce(&num_my_ids, &min_ids, 1, mpi_dtype, MPI_MIN, MPI_COMM_WORLD); #endif if (min_ids == 0) { std::cout<<"One or more processors have 0 equations. Not currently supported. Exiting."<<std::endl; miniFE::finalize_mpi(); return 1; } std::ostringstream osstr; osstr << "miniFE." 
<< params.nx << "x" << params.ny << "x" << params.nz; #ifdef HAVE_MPI osstr << ".P" << numprocs; #endif #ifdef _OPENMP osstr << ".T" << omp_get_max_threads(); #endif osstr << "."; if (params.name != "") osstr << params.name << "."; YAML_Doc doc("miniFE", MINIFE_VERSION, ".", osstr.str()); if (myproc == 0) { add_params_to_yaml(doc, params); add_configuration_to_yaml(doc, numprocs, params.numthreads); add_timestring_to_yaml(doc); } //Most of the program is performed in the 'driver' function, which is //templated on < Scalar, LocalOrdinal, GlobalOrdinal >. //To run miniFE with float instead of double, or 'long long' instead of int, //etc., change these template-parameters by changing the macro definitions in //the makefile or on the make command-line. int return_code = miniFE::driver< MINIFE_SCALAR, MINIFE_LOCAL_ORDINAL, MINIFE_GLOBAL_ORDINAL>(global_box, my_box, params, doc); miniFE::timer_type total_time = miniFE::mytimer() - start_time; #ifdef MINIFE_REPORT_RUSAGE struct rusage get_mem; getrusage(RUSAGE_SELF, &get_mem); long long int rank_rss = get_mem.ru_maxrss; long long int global_rss = 0; long long int max_rss = 0; #ifdef HAVE_MPI MPI_Reduce(&rank_rss, &global_rss, 1, MPI_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD); MPI_Reduce(&rank_rss, &max_rss, 1, MPI_LONG_LONG, MPI_MAX, 0, MPI_COMM_WORLD); if (myproc == 0) { doc.add("Global All-RSS (kB)", global_rss); doc.add("Global Max-RSS (kB)", max_rss); } #else doc.add("RSS (kB)", rank_rss); #endif #endif if (myproc == 0) { doc.add("Total Program Time",total_time); doc.generateYAML(); } miniFE::finalize_mpi(); return return_code; }