void run_rma_test(int nprocs_per_node) { int myrank, nprocs; int mem_rank; MPI_Win win; int *baseptr; MPI_Aint local_size; MPI_Comm_rank(MPI_COMM_WORLD, &myrank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); if (nprocs < nprocs_per_node * 2) { if (!myrank) printf("should start program with at least %d processes\n", nprocs_per_node * 2); MPI_Finalize(); exit(EXIT_FAILURE); } mem_rank = nprocs_per_node + nprocs_per_node / 2; local_size = (myrank == mem_rank) ? COUNT : 0; MPI_Win_create_dynamic(MPI_INFO_NULL, MPI_COMM_WORLD, &win); MPI_Win_lock_all(0, win); int type_size; MPI_Type_size(MPI_INT, &type_size); size_t nbytes = COUNT * type_size; assert(MPI_Alloc_mem(nbytes, MPI_INFO_NULL, &baseptr) == MPI_SUCCESS); assert(MPI_Win_attach(win, baseptr, nbytes) == MPI_SUCCESS); MPI_Aint ldisp; MPI_Aint *disps = malloc(nprocs * sizeof(MPI_Aint)); assert(MPI_Get_address(baseptr, &ldisp) == MPI_SUCCESS); assert(MPI_Allgather(&ldisp, 1, MPI_AINT, disps, nprocs, MPI_AINT, MPI_COMM_WORLD) == MPI_SUCCESS); if (myrank == 0) { for (size_t idx = 0; idx < COUNT; ++idx) { baseptr[idx] = idx * COUNT + 1; } } MPI_Barrier(MPI_COMM_WORLD); if (myrank == mem_rank) { assert(MPI_Get(baseptr, 10, MPI_INT, 0, disps[0], 10, MPI_INT, win) == MPI_SUCCESS); assert(MPI_Win_flush(0, win) == MPI_SUCCESS); for (size_t idx = 0; idx < COUNT; ++idx) { assert(baseptr[idx] == idx * 10 + 1); } } MPI_Barrier(MPI_COMM_WORLD); MPI_Win_unlock_all(win); MPI_Barrier(MPI_COMM_WORLD); MPI_Win_free(&win); MPI_Free_mem(baseptr); printf("Test finished\n"); }
void declareBindings (void) { /* === Point-to-point === */ void* buf; int count; MPI_Datatype datatype; int dest; int tag; MPI_Comm comm; MPI_Send (buf, count, datatype, dest, tag, comm); // L12 int source; MPI_Status status; MPI_Recv (buf, count, datatype, source, tag, comm, &status); // L15 MPI_Get_count (&status, datatype, &count); MPI_Bsend (buf, count, datatype, dest, tag, comm); MPI_Ssend (buf, count, datatype, dest, tag, comm); MPI_Rsend (buf, count, datatype, dest, tag, comm); void* buffer; int size; MPI_Buffer_attach (buffer, size); // L22 MPI_Buffer_detach (buffer, &size); MPI_Request request; MPI_Isend (buf, count, datatype, dest, tag, comm, &request); // L25 MPI_Ibsend (buf, count, datatype, dest, tag, comm, &request); MPI_Issend (buf, count, datatype, dest, tag, comm, &request); MPI_Irsend (buf, count, datatype, dest, tag, comm, &request); MPI_Irecv (buf, count, datatype, source, tag, comm, &request); MPI_Wait (&request, &status); int flag; MPI_Test (&request, &flag, &status); // L32 MPI_Request_free (&request); MPI_Request* array_of_requests; int index; MPI_Waitany (count, array_of_requests, &index, &status); // L36 MPI_Testany (count, array_of_requests, &index, &flag, &status); MPI_Status* array_of_statuses; MPI_Waitall (count, array_of_requests, array_of_statuses); // L39 MPI_Testall (count, array_of_requests, &flag, array_of_statuses); int incount; int outcount; int* array_of_indices; MPI_Waitsome (incount, array_of_requests, &outcount, array_of_indices, array_of_statuses); // L44--45 MPI_Testsome (incount, array_of_requests, &outcount, array_of_indices, array_of_statuses); // L46--47 MPI_Iprobe (source, tag, comm, &flag, &status); // L48 MPI_Probe (source, tag, comm, &status); MPI_Cancel (&request); MPI_Test_cancelled (&status, &flag); MPI_Send_init (buf, count, datatype, dest, tag, comm, &request); MPI_Bsend_init (buf, count, datatype, dest, tag, comm, &request); MPI_Ssend_init (buf, count, datatype, dest, tag, comm, &request); MPI_Rsend_init (buf, count, datatype, dest, tag, comm, &request); MPI_Recv_init (buf, count, datatype, source, tag, comm, &request); MPI_Start (&request); MPI_Startall (count, array_of_requests); void* sendbuf; int sendcount; MPI_Datatype sendtype; int sendtag; void* recvbuf; int recvcount; MPI_Datatype recvtype; MPI_Datatype recvtag; MPI_Sendrecv (sendbuf, sendcount, sendtype, dest, sendtag, recvbuf, recvcount, recvtype, source, recvtag, comm, &status); // L67--69 MPI_Sendrecv_replace (buf, count, datatype, dest, sendtag, source, recvtag, comm, &status); // L70--71 MPI_Datatype oldtype; MPI_Datatype newtype; MPI_Type_contiguous (count, oldtype, &newtype); // L74 int blocklength; { int stride; MPI_Type_vector (count, blocklength, stride, oldtype, &newtype); // L78 } { MPI_Aint stride; MPI_Type_hvector (count, blocklength, stride, oldtype, &newtype); // L82 } int* array_of_blocklengths; { int* array_of_displacements; MPI_Type_indexed (count, array_of_blocklengths, array_of_displacements, oldtype, &newtype); // L87--88 } { MPI_Aint* array_of_displacements; MPI_Type_hindexed (count, array_of_blocklengths, array_of_displacements, oldtype, &newtype); // L92--93 MPI_Datatype* array_of_types; MPI_Type_struct (count, array_of_blocklengths, array_of_displacements, array_of_types, &newtype); // L95--96 } void* location; MPI_Aint address; MPI_Address (location, &address); // L100 MPI_Aint extent; MPI_Type_extent (datatype, &extent); // L102 MPI_Type_size (datatype, &size); MPI_Aint displacement; MPI_Type_lb (datatype, &displacement); // L105 MPI_Type_ub 
(datatype, &displacement); MPI_Type_commit (&datatype); MPI_Type_free (&datatype); MPI_Get_elements (&status, datatype, &count); void* inbuf; void* outbuf; int outsize; int position; MPI_Pack (inbuf, incount, datatype, outbuf, outsize, &position, comm); // L114 int insize; MPI_Unpack (inbuf, insize, &position, outbuf, outcount, datatype, comm); // L116--117 MPI_Pack_size (incount, datatype, comm, &size); /* === Collectives === */ MPI_Barrier (comm); // L121 int root; MPI_Bcast (buffer, count, datatype, root, comm); // L123 MPI_Gather (sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, root, comm); // L124--125 int* recvcounts; int* displs; MPI_Gatherv (sendbuf, sendcount, sendtype, recvbuf, recvcounts, displs, recvtype, root, comm); // L128--130 MPI_Scatter (sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, root, comm); // L131--132 int* sendcounts; MPI_Scatterv (sendbuf, sendcounts, displs, sendtype, recvbuf, recvcount, recvtype, root, comm); // L134--135 MPI_Allgather (sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, comm); // L136--137 MPI_Allgatherv (sendbuf, sendcount, sendtype, recvbuf, recvcounts, displs, recvtype, comm); // L138--140 MPI_Alltoall (sendbuf, sendcount, sendtype, recvbuf, recvcount, recvtype, comm); // L141--142 int* sdispls; int* rdispls; MPI_Alltoallv (sendbuf, sendcounts, sdispls, sendtype, recvbuf, recvcounts, rdispls, recvtype, comm); // L145--147 MPI_Op op; MPI_Reduce (sendbuf, recvbuf, count, datatype, op, root, comm); // L149 #if 0 MPI_User_function function; int commute; MPI_Op_create (function, commute, &op); // L153 #endif MPI_Op_free (&op); // L155 MPI_Allreduce (sendbuf, recvbuf, count, datatype, op, comm); MPI_Reduce_scatter (sendbuf, recvbuf, recvcounts, datatype, op, comm); MPI_Scan (sendbuf, recvbuf, count, datatype, op, comm); /* === Groups, contexts, and communicators === */ MPI_Group group; MPI_Group_size (group, &size); // L162 int rank; MPI_Group_rank (group, &rank); // L164 MPI_Group group1; int n; int* ranks1; MPI_Group group2; int* ranks2; MPI_Group_translate_ranks (group1, n, ranks1, group2, ranks2); // L170 int result; MPI_Group_compare (group1, group2, &result); // L172 MPI_Group newgroup; MPI_Group_union (group1, group2, &newgroup); // L174 MPI_Group_intersection (group1, group2, &newgroup); MPI_Group_difference (group1, group2, &newgroup); int* ranks; MPI_Group_incl (group, n, ranks, &newgroup); // L178 MPI_Group_excl (group, n, ranks, &newgroup); extern int ranges[][3]; MPI_Group_range_incl (group, n, ranges, &newgroup); // L181 MPI_Group_range_excl (group, n, ranges, &newgroup); MPI_Group_free (&group); MPI_Comm_size (comm, &size); MPI_Comm_rank (comm, &rank); MPI_Comm comm1; MPI_Comm comm2; MPI_Comm_compare (comm1, comm2, &result); MPI_Comm newcomm; MPI_Comm_dup (comm, &newcomm); MPI_Comm_create (comm, group, &newcomm); int color; int key; MPI_Comm_split (comm, color, key, &newcomm); // L194 MPI_Comm_free (&comm); MPI_Comm_test_inter (comm, &flag); MPI_Comm_remote_size (comm, &size); MPI_Comm_remote_group (comm, &group); MPI_Comm local_comm; int local_leader; MPI_Comm peer_comm; int remote_leader; MPI_Comm newintercomm; MPI_Intercomm_create (local_comm, local_leader, peer_comm, remote_leader, tag, &newintercomm); // L204--205 MPI_Comm intercomm; MPI_Comm newintracomm; int high; MPI_Intercomm_merge (intercomm, high, &newintracomm); // L209 int keyval; #if 0 MPI_Copy_function copy_fn; MPI_Delete_function delete_fn; void* extra_state; MPI_Keyval_create (copy_fn, delete_fn, &keyval, extra_state); // L215 
#endif MPI_Keyval_free (&keyval); // L217 void* attribute_val; MPI_Attr_put (comm, keyval, attribute_val); // L219 MPI_Attr_get (comm, keyval, attribute_val, &flag); MPI_Attr_delete (comm, keyval); /* === Environmental inquiry === */ char* name; int resultlen; MPI_Get_processor_name (name, &resultlen); // L226 MPI_Errhandler errhandler; #if 0 MPI_Handler_function function; MPI_Errhandler_create (function, &errhandler); // L230 #endif MPI_Errhandler_set (comm, errhandler); // L232 MPI_Errhandler_get (comm, &errhandler); MPI_Errhandler_free (&errhandler); int errorcode; char* string; MPI_Error_string (errorcode, string, &resultlen); // L237 int errorclass; MPI_Error_class (errorcode, &errorclass); // L239 MPI_Wtime (); MPI_Wtick (); int argc; char** argv; MPI_Init (&argc, &argv); // L244 MPI_Finalize (); MPI_Initialized (&flag); MPI_Abort (comm, errorcode); }
void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count, MPI_Datatype datatype, int file_ptr_type, ADIO_Offset offset, ADIO_Status *status, int *error_code) { /* Uses a generalized version of the extended two-phase method described * in "An Extended Two-Phase Method for Accessing Sections of * Out-of-Core Arrays", Rajeev Thakur and Alok Choudhary, * Scientific Programming, (5)4:301--317, Winter 1996. * http://www.mcs.anl.gov/home/thakur/ext2ph.ps */ ADIOI_Access *my_req; /* array of nprocs access structures, one for each other process has this process's request */ ADIOI_Access *others_req; /* array of nprocs access structures, one for each other process whose request is written by this process. */ int i, filetype_is_contig, nprocs, myrank, do_collect = 0; int contig_access_count = 0, buftype_is_contig, interleave_count = 0; int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs; ADIO_Offset orig_fp, start_offset, end_offset, off; ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *end_offsets = NULL; ADIO_Offset *len_list = NULL; int **buf_idx = NULL, *striping_info = NULL; int old_error, tmp_error; MPI_Comm_size(fd->comm, &nprocs); MPI_Comm_rank(fd->comm, &myrank); orig_fp = fd->fp_ind; /* IO patten identification if cb_write isn't disabled */ if (fd->hints->cb_write != ADIOI_HINT_DISABLE) { /* For this process's request, calculate the list of offsets and lengths in the file and determine the start and end offsets. */ /* Note: end_offset points to the last byte-offset that will be accessed. * e.g., if start_offset=0 and 100 bytes to be read, end_offset=99 */ ADIOI_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset, &offset_list, &len_list, &start_offset, &end_offset, &contig_access_count); /* each process communicates its start and end offsets to other * processes. The result is an array each of start and end offsets * stored in order of process rank. */ st_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset)); end_offsets = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset)); MPI_Allgather(&start_offset, 1, ADIO_OFFSET, st_offsets, 1, ADIO_OFFSET, fd->comm); MPI_Allgather(&end_offset, 1, ADIO_OFFSET, end_offsets, 1, ADIO_OFFSET, fd->comm); /* are the accesses of different processes interleaved? */ for (i = 1; i < nprocs; i++) if ((st_offsets[i] < end_offsets[i-1]) && (st_offsets[i] <= end_offsets[i])) interleave_count++; /* This is a rudimentary check for interleaving, but should suffice for the moment. */ /* Two typical access patterns can benefit from collective write. * 1) the processes are interleaved, and * 2) the req size is small. 
*/ if (interleave_count > 0) { do_collect = 1; } else { do_collect = ADIOI_LUSTRE_Docollect(fd, contig_access_count, len_list, nprocs); } } ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); /* Decide if collective I/O should be done */ if ((!do_collect && fd->hints->cb_write == ADIOI_HINT_AUTO) || fd->hints->cb_write == ADIOI_HINT_DISABLE) { /* use independent accesses */ if (fd->hints->cb_write != ADIOI_HINT_DISABLE) { ADIOI_Free(offset_list); ADIOI_Free(len_list); ADIOI_Free(st_offsets); ADIOI_Free(end_offsets); } fd->fp_ind = orig_fp; ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); if (buftype_is_contig && filetype_is_contig) { if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { off = fd->disp + (ADIO_Offset)(fd->etype_size) * offset; ADIO_WriteContig(fd, buf, count, datatype, ADIO_EXPLICIT_OFFSET, off, status, error_code); } else ADIO_WriteContig(fd, buf, count, datatype, ADIO_INDIVIDUAL, 0, status, error_code); } else { ADIO_WriteStrided(fd, buf, count, datatype, file_ptr_type, offset, status, error_code); } return; } /* Get Lustre hints information */ ADIOI_LUSTRE_Get_striping_info(fd, &striping_info, 1); /* calculate what portions of the access requests of this process are * located in which process */ ADIOI_LUSTRE_Calc_my_req(fd, offset_list, len_list, contig_access_count, striping_info, nprocs, &count_my_req_procs, &count_my_req_per_proc, &my_req, &buf_idx); /* based on everyone's my_req, calculate what requests of other processes * will be accessed by this process. * count_others_req_procs = number of processes whose requests (including * this process itself) will be accessed by this process * count_others_req_per_proc[i] indicates how many separate contiguous * requests of proc. i will be accessed by this process. */ ADIOI_Calc_others_req(fd, count_my_req_procs, count_my_req_per_proc, my_req, nprocs, myrank, &count_others_req_procs, &others_req); ADIOI_Free(count_my_req_per_proc); /* exchange data and write in sizes of no more than stripe_size. */ ADIOI_LUSTRE_Exch_and_write(fd, buf, datatype, nprocs, myrank, others_req, my_req, offset_list, len_list, contig_access_count, striping_info, buf_idx, error_code); /* If this collective write is followed by an independent write, * it's possible to have those subsequent writes on other processes * race ahead and sneak in before the read-modify-write completes. * We carry out a collective communication at the end here so no one * can start independent i/o before collective I/O completes. 
* * need to do some gymnastics with the error codes so that if something * went wrong, all processes report error, but if a process has a more * specific error code, we can still have that process report the * additional information */ old_error = *error_code; if (*error_code != MPI_SUCCESS) *error_code = MPI_ERR_IO; /* optimization: if only one process performing i/o, we can perform * a less-expensive Bcast */ #ifdef ADIOI_MPE_LOGGING MPE_Log_event(ADIOI_MPE_postwrite_a, 0, NULL); #endif if (fd->hints->cb_nodes == 1) MPI_Bcast(error_code, 1, MPI_INT, fd->hints->ranklist[0], fd->comm); else { tmp_error = *error_code; MPI_Allreduce(&tmp_error, error_code, 1, MPI_INT, MPI_MAX, fd->comm); } #ifdef ADIOI_MPE_LOGGING MPE_Log_event(ADIOI_MPE_postwrite_b, 0, NULL); #endif if ((old_error != MPI_SUCCESS) && (old_error != MPI_ERR_IO)) *error_code = old_error; if (!buftype_is_contig) ADIOI_Delete_flattened(datatype); /* free all memory allocated for collective I/O */ /* free others_req */ for (i = 0; i < nprocs; i++) { if (others_req[i].count) { ADIOI_Free(others_req[i].offsets); ADIOI_Free(others_req[i].lens); ADIOI_Free(others_req[i].mem_ptrs); } } ADIOI_Free(others_req); /* free my_req here */ for (i = 0; i < nprocs; i++) { if (my_req[i].count) { ADIOI_Free(my_req[i].offsets); ADIOI_Free(my_req[i].lens); } } ADIOI_Free(my_req); for (i = 0; i < nprocs; i++) { ADIOI_Free(buf_idx[i]); } ADIOI_Free(buf_idx); ADIOI_Free(offset_list); ADIOI_Free(len_list); ADIOI_Free(st_offsets); ADIOI_Free(end_offsets); ADIOI_Free(striping_info); #ifdef HAVE_STATUS_SET_BYTES if (status) { int bufsize, size; /* Don't set status if it isn't needed */ MPI_Type_size(datatype, &size); bufsize = size * count; MPIR_Status_set_bytes(status, datatype, bufsize); } /* This is a temporary way of filling in status. The right way is to * keep track of how much data was actually written during collective I/O. */ #endif fd->fp_sys_posn = -1; /* set it to null. */ }
void all_gather(const T& elem, std::vector<T>& results) { #ifdef HAS_MPI // Get the mpi rank and size size_t mpi_size(size()); if(results.size() != mpi_size) results.resize(mpi_size); // Serialize the local map graphlab::charstream cstrm(128); graphlab::oarchive oarc(cstrm); oarc << elem; cstrm.flush(); char* send_buffer = cstrm->c_str(); int send_buffer_size = (int)cstrm->size(); assert(send_buffer_size >= 0); // compute the sizes std::vector<int> recv_sizes(mpi_size, -1); // Compute the sizes int error = MPI_Allgather(&send_buffer_size, // Send buffer 1, // send count MPI_INT, // send type &(recv_sizes[0]), // recvbuffer 1, // recvcount MPI_INT, // recvtype MPI_COMM_WORLD); assert(error == MPI_SUCCESS); for(size_t i = 0; i < recv_sizes.size(); ++i) assert(recv_sizes[i] >= 0); // Construct offsets std::vector<int> recv_offsets(recv_sizes); int sum = 0, tmp = 0; for(size_t i = 0; i < recv_offsets.size(); ++i) { tmp = recv_offsets[i]; recv_offsets[i] = sum; sum += tmp; } // if necessary realloac recv_buffer std::vector<char> recv_buffer(sum); // recv all the maps error = MPI_Allgatherv(send_buffer, // send buffer send_buffer_size, // how much to send MPI_BYTE, // send type &(recv_buffer[0]), // recv buffer &(recv_sizes[0]), // amount to recv // for each cpuess &(recv_offsets[0]), // where to place data MPI_BYTE, MPI_COMM_WORLD); assert(error == MPI_SUCCESS); // Update the local map namespace bio = boost::iostreams; typedef bio::stream<bio::array_source> icharstream; icharstream strm(&(recv_buffer[0]), recv_buffer.size()); graphlab::iarchive iarc(strm); for(size_t i = 0; i < results.size(); ++i) { iarc >> results[i]; } #else logstream(LOG_FATAL) << "MPI not installed!" << std::endl; #endif } // end of mpi all gather
int uctSort(NimGameState rootState, int maximumIterations, bool useRanks) { int world_rank; MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); // Get the number of processes int world_size; MPI_Comm_size(MPI_COMM_WORLD, &world_size); MctsNode root(0, NULL, &rootState); for(int i=0; i<maximumIterations; i++) { MctsNode* node = &root; NimGameState state = rootState.clone(); // std::cout<< "Copy Repr " << node->representation() << std::endl; while(node->actionsNotTaken.empty() && !node->childNodes.empty()) { // std::cout<< "Entering Selection Step" << std::endl; node = node->selectNextChildNode(); state.performAction(node->previousAction); } if(!node->actionsNotTaken.empty()) { // std::cout<< "Entering Expansion Step" << std::endl; std::random_shuffle(node->actionsNotTaken.begin(), node->actionsNotTaken.end()); int action = node->actionsNotTaken.back(); // std::cout<< "Action: " << action << std::endl; state.performAction(action); // std::cout<< state.representation() << std::endl; node = node->addChildNode(action, &state); // std::cout<< node->representation() << std::endl; } while(!state.getAvailableActions().empty()) { // std::cout<< "Entering Simulation Step" << std::endl; std::random_shuffle(node->actionsNotTaken.begin(), node->actionsNotTaken.end()); int action = node->actionsNotTaken.back(); state.performAction(action); } while(node->parentNode != NULL) { // std::cout<< "Entering Backpropagation Step" << std::endl; node->update(state.getValue(node->lastActivePlayer)); node = node->parentNode; } } if(useRanks && world_size > 1) { // Root synchronization std::string serialized = MctsNodeSerializer::Serialize(root); serialized += "#"; if (world_rank == 0) { //std::cout << "Sending:" << serialized.length() << "/" << DEFAULT_MESSAGE_SIZE << std::endl; //std::cout << "Rank 0, root looks before merge: " << serialized << std::endl; } if (serialized.length() > DEFAULT_MESSAGE_SIZE) { std::cout << "Error:" << serialized.length() << "/" << DEFAULT_MESSAGE_SIZE << std::endl; std::cout << "Seralized tree is too big!" << std::endl; return -1; } if (serialized.length() < DEFAULT_MESSAGE_SIZE) { serialized.resize(DEFAULT_MESSAGE_SIZE, '@'); } char *rcv_buffer = new char[DEFAULT_MESSAGE_SIZE * world_size]; MPI_Allgather((void*)serialized.c_str(), DEFAULT_MESSAGE_SIZE, MPI::CHAR, rcv_buffer, DEFAULT_MESSAGE_SIZE, MPI::CHAR, MPI_COMM_WORLD); std::string received(rcv_buffer, (unsigned long) (DEFAULT_MESSAGE_SIZE * world_size)); delete rcv_buffer; //long ct = std::count(received.begin(), received.end(), '@'); received.erase(std::remove(received.begin(), received.end(), '@'), received.end()); std::stringstream receivedDataStream(received); //std::cout << "All data: " << received << std::endl; std::string serializedTree; while (std::getline(receivedDataStream, serializedTree, '#')) { serializedTree = serializedTree.substr(0, serializedTree.length() - 1); if (world_rank == 0) { //std::cout << "Rank 0, deserialized new tree before unboxing: " << serializedTree << std::endl; //std::cout << serialized.length() << std::endl; //std::cout << "There was " << ct << "/" << DEFAULT_MESSAGE_SIZE*world_size << "empty chars in data for iteration " << i << std::endl; //std::cout << "Attempting to deserialize data: '" << serializedTree << "'" << std::endl; //std::cout << "Attempting to deserialize data.. 
'" << std::endl; MctsNode remoteTree = MctsNodeDeserializer::Deserialize(serializedTree); //std::cout << "Rank 0, root before merge: " << MctsNodeSerializer::Serialize(root) << std::endl; //std::cout << "Rank 0, newTree before merge: " << MctsNodeSerializer::Serialize(root) << std::endl; //std::cout << "Data deserialized" << std::endl; //std::cout << "Attempting to merge" << std::endl; MctsTreeMerger::MergeTrees(&root, &remoteTree); std::cout << "Rank 0, after merge: " << MctsNodeSerializer::Serialize(root) << std::endl; //std::cout << "Merge is done" << std::endl; } else { MctsNode remoteTree = MctsNodeDeserializer::Deserialize(serializedTree); MctsTreeMerger::MergeTrees(&root, &remoteTree); } } } std::sort(root.childNodes.begin(), root.childNodes.end(), compareNodesByVisists); MctsNode lastItem = root.childNodes.back(); // if(world_rank == 0) // { // //std::cout << MctsNodeSerializer::Serialize(root) << std::endl; // for(std::vector<MctsNode>::iterator it = root.childNodes.begin(); it != root.childNodes.end(); ++it) // { // std::cout << it->representation() << std::endl; // } // } // std::cout<< "RootState @exit: "<< rootState.representation() << std::endl; return lastItem.previousAction; }
/* data1, odata1 and odata2 are packed in the format (for communication): data[0] = is_max, no of is data[1] = size of is[0] ... data[is_max] = size of is[is_max-1] data[is_max + 1] = data(is[0]) ... data[is_max+1+sum(size of is[k]), k=0,...,i-1] = data(is[i]) ... data2 is packed in the format (for creating output is[]): data[0] = is_max, no of is data[1] = size of is[0] ... data[is_max] = size of is[is_max-1] data[is_max + 1] = data(is[0]) ... data[is_max + 1 + Mbs*i) = data(is[i]) ... */ static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Once(Mat C,PetscInt is_max,IS is[]) { Mat_MPISBAIJ *c = (Mat_MPISBAIJ*)C->data; PetscErrorCode ierr; PetscMPIInt size,rank,tag1,tag2,*len_s,nrqr,nrqs,*id_r1,*len_r1,flag,len,*iwork; const PetscInt *idx_i; PetscInt idx,isz,col,*n,*data1,**data1_start,*data2,*data2_i,*data,*data_i; PetscInt Mbs,i,j,k,*odata1,*odata2; PetscInt proc_id,**odata2_ptr,*ctable=0,*btable,len_max,len_est; PetscInt proc_end=0,len_unused,nodata2; PetscInt ois_max; /* max no of is[] in each of processor */ char *t_p; MPI_Comm comm; MPI_Request *s_waits1,*s_waits2,r_req; MPI_Status *s_status,r_status; PetscBT *table; /* mark indices of this processor's is[] */ PetscBT table_i; PetscBT otable; /* mark indices of other processors' is[] */ PetscInt bs=C->rmap->bs,Bn = c->B->cmap->n,Bnbs = Bn/bs,*Bowners; IS garray_local,garray_gl; PetscFunctionBegin; ierr = PetscObjectGetComm((PetscObject)C,&comm); CHKERRQ(ierr); size = c->size; rank = c->rank; Mbs = c->Mbs; ierr = PetscObjectGetNewTag((PetscObject)C,&tag1); CHKERRQ(ierr); ierr = PetscObjectGetNewTag((PetscObject)C,&tag2); CHKERRQ(ierr); /* create tables used in step 1: table[i] - mark c->garray of proc [i] step 3: table[i] - mark indices of is[i] when whose=MINE table[0] - mark incideces of is[] when whose=OTHER */ len = PetscMax(is_max, size); CHKERRQ(ierr); ierr = PetscMalloc2(len,&table,(Mbs/PETSC_BITS_PER_BYTE+1)*len,&t_p); CHKERRQ(ierr); for (i=0; i<len; i++) { table[i] = t_p + (Mbs/PETSC_BITS_PER_BYTE+1)*i; } ierr = MPIU_Allreduce(&is_max,&ois_max,1,MPIU_INT,MPI_MAX,comm); CHKERRQ(ierr); /* 1. 
Send this processor's is[] to other processors */ /*---------------------------------------------------*/ /* allocate spaces */ ierr = PetscMalloc1(is_max,&n); CHKERRQ(ierr); len = 0; for (i=0; i<is_max; i++) { ierr = ISGetLocalSize(is[i],&n[i]); CHKERRQ(ierr); len += n[i]; } if (!len) { is_max = 0; } else { len += 1 + is_max; /* max length of data1 for one processor */ } ierr = PetscMalloc1(size*len+1,&data1); CHKERRQ(ierr); ierr = PetscMalloc1(size,&data1_start); CHKERRQ(ierr); for (i=0; i<size; i++) data1_start[i] = data1 + i*len; ierr = PetscMalloc4(size,&len_s,size,&btable,size,&iwork,size+1,&Bowners); CHKERRQ(ierr); /* gather c->garray from all processors */ ierr = ISCreateGeneral(comm,Bnbs,c->garray,PETSC_COPY_VALUES,&garray_local); CHKERRQ(ierr); ierr = ISAllGather(garray_local, &garray_gl); CHKERRQ(ierr); ierr = ISDestroy(&garray_local); CHKERRQ(ierr); ierr = MPI_Allgather(&Bnbs,1,MPIU_INT,Bowners+1,1,MPIU_INT,comm); CHKERRQ(ierr); Bowners[0] = 0; for (i=0; i<size; i++) Bowners[i+1] += Bowners[i]; if (is_max) { /* hash table ctable which maps c->row to proc_id) */ ierr = PetscMalloc1(Mbs,&ctable); CHKERRQ(ierr); for (proc_id=0,j=0; proc_id<size; proc_id++) { for (; j<C->rmap->range[proc_id+1]/bs; j++) ctable[j] = proc_id; } /* hash tables marking c->garray */ ierr = ISGetIndices(garray_gl,&idx_i); CHKERRQ(ierr); for (i=0; i<size; i++) { table_i = table[i]; ierr = PetscBTMemzero(Mbs,table_i); CHKERRQ(ierr); for (j = Bowners[i]; j<Bowners[i+1]; j++) { /* go through B cols of proc[i]*/ ierr = PetscBTSet(table_i,idx_i[j]); CHKERRQ(ierr); } } ierr = ISRestoreIndices(garray_gl,&idx_i); CHKERRQ(ierr); } /* if (is_max) */ ierr = ISDestroy(&garray_gl); CHKERRQ(ierr); /* evaluate communication - mesg to who, length, and buffer space */ for (i=0; i<size; i++) len_s[i] = 0; /* header of data1 */ for (proc_id=0; proc_id<size; proc_id++) { iwork[proc_id] = 0; *data1_start[proc_id] = is_max; data1_start[proc_id]++; for (j=0; j<is_max; j++) { if (proc_id == rank) { *data1_start[proc_id] = n[j]; } else { *data1_start[proc_id] = 0; } data1_start[proc_id]++; } } for (i=0; i<is_max; i++) { ierr = ISGetIndices(is[i],&idx_i); CHKERRQ(ierr); for (j=0; j<n[i]; j++) { idx = idx_i[j]; *data1_start[rank] = idx; data1_start[rank]++; /* for local proccessing */ proc_end = ctable[idx]; for (proc_id=0; proc_id<=proc_end; proc_id++) { /* for others to process */ if (proc_id == rank) continue; /* done before this loop */ if (proc_id < proc_end && !PetscBTLookup(table[proc_id],idx)) continue; /* no need for sending idx to [proc_id] */ *data1_start[proc_id] = idx; data1_start[proc_id]++; len_s[proc_id]++; } } /* update header data */ for (proc_id=0; proc_id<size; proc_id++) { if (proc_id== rank) continue; *(data1 + proc_id*len + 1 + i) = len_s[proc_id] - iwork[proc_id]; iwork[proc_id] = len_s[proc_id]; } ierr = ISRestoreIndices(is[i],&idx_i); CHKERRQ(ierr); } nrqs = 0; nrqr = 0; for (i=0; i<size; i++) { data1_start[i] = data1 + i*len; if (len_s[i]) { nrqs++; len_s[i] += 1 + is_max; /* add no. 
of header msg */ } } for (i=0; i<is_max; i++) { ierr = ISDestroy(&is[i]); CHKERRQ(ierr); } ierr = PetscFree(n); CHKERRQ(ierr); ierr = PetscFree(ctable); CHKERRQ(ierr); /* Determine the number of messages to expect, their lengths, from from-ids */ ierr = PetscGatherNumberOfMessages(comm,NULL,len_s,&nrqr); CHKERRQ(ierr); ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,len_s,&id_r1,&len_r1); CHKERRQ(ierr); /* Now post the sends */ ierr = PetscMalloc2(size,&s_waits1,size,&s_waits2); CHKERRQ(ierr); k = 0; for (proc_id=0; proc_id<size; proc_id++) { /* send data1 to processor [proc_id] */ if (len_s[proc_id]) { ierr = MPI_Isend(data1_start[proc_id],len_s[proc_id],MPIU_INT,proc_id,tag1,comm,s_waits1+k); CHKERRQ(ierr); k++; } } /* 2. Receive other's is[] and process. Then send back */ /*-----------------------------------------------------*/ len = 0; for (i=0; i<nrqr; i++) { if (len_r1[i] > len) len = len_r1[i]; } ierr = PetscFree(len_r1); CHKERRQ(ierr); ierr = PetscFree(id_r1); CHKERRQ(ierr); for (proc_id=0; proc_id<size; proc_id++) len_s[proc_id] = iwork[proc_id] = 0; ierr = PetscMalloc1(len+1,&odata1); CHKERRQ(ierr); ierr = PetscMalloc1(size,&odata2_ptr); CHKERRQ(ierr); ierr = PetscBTCreate(Mbs,&otable); CHKERRQ(ierr); len_max = ois_max*(Mbs+1); /* max space storing all is[] for each receive */ len_est = 2*len_max; /* estimated space of storing is[] for all receiving messages */ ierr = PetscMalloc1(len_est+1,&odata2); CHKERRQ(ierr); nodata2 = 0; /* nodata2+1: num of PetscMalloc(,&odata2_ptr[]) called */ odata2_ptr[nodata2] = odata2; len_unused = len_est; /* unused space in the array odata2_ptr[nodata2]-- needs to be >= len_max */ k = 0; while (k < nrqr) { /* Receive messages */ ierr = MPI_Iprobe(MPI_ANY_SOURCE,tag1,comm,&flag,&r_status); CHKERRQ(ierr); if (flag) { ierr = MPI_Get_count(&r_status,MPIU_INT,&len); CHKERRQ(ierr); proc_id = r_status.MPI_SOURCE; ierr = MPI_Irecv(odata1,len,MPIU_INT,proc_id,r_status.MPI_TAG,comm,&r_req); CHKERRQ(ierr); ierr = MPI_Wait(&r_req,&r_status); CHKERRQ(ierr); /* Process messages */ /* make sure there is enough unused space in odata2 array */ if (len_unused < len_max) { /* allocate more space for odata2 */ ierr = PetscMalloc1(len_est+1,&odata2); CHKERRQ(ierr); odata2_ptr[++nodata2] = odata2; len_unused = len_est; } ierr = MatIncreaseOverlap_MPISBAIJ_Local(C,odata1,OTHER,odata2,&otable); CHKERRQ(ierr); len = 1 + odata2[0]; for (i=0; i<odata2[0]; i++) len += odata2[1 + i]; /* Send messages back */ ierr = MPI_Isend(odata2,len,MPIU_INT,proc_id,tag2,comm,s_waits2+k); CHKERRQ(ierr); k++; odata2 += len; len_unused -= len; len_s[proc_id] = len; /* num of messages sending back to [proc_id] by this proc */ } } ierr = PetscFree(odata1); CHKERRQ(ierr); ierr = PetscBTDestroy(&otable); CHKERRQ(ierr); /* 3. Do local work on this processor's is[] */ /*-------------------------------------------*/ /* make sure there is enough unused space in odata2(=data) array */ len_max = is_max*(Mbs+1); /* max space storing all is[] for this processor */ if (len_unused < len_max) { /* allocate more space for odata2 */ ierr = PetscMalloc1(len_est+1,&odata2); CHKERRQ(ierr); odata2_ptr[++nodata2] = odata2; } data = odata2; ierr = MatIncreaseOverlap_MPISBAIJ_Local(C,data1_start[rank],MINE,data,table); CHKERRQ(ierr); ierr = PetscFree(data1_start); CHKERRQ(ierr); /* 4. 
Receive work done on other processors, then merge */ /*------------------------------------------------------*/ /* get max number of messages that this processor expects to recv */ ierr = MPIU_Allreduce(len_s,iwork,size,MPI_INT,MPI_MAX,comm); CHKERRQ(ierr); ierr = PetscMalloc1(iwork[rank]+1,&data2); CHKERRQ(ierr); ierr = PetscFree4(len_s,btable,iwork,Bowners); CHKERRQ(ierr); k = 0; while (k < nrqs) { /* Receive messages */ ierr = MPI_Iprobe(MPI_ANY_SOURCE,tag2,comm,&flag,&r_status); CHKERRQ(ierr); if (flag) { ierr = MPI_Get_count(&r_status,MPIU_INT,&len); CHKERRQ(ierr); proc_id = r_status.MPI_SOURCE; ierr = MPI_Irecv(data2,len,MPIU_INT,proc_id,r_status.MPI_TAG,comm,&r_req); CHKERRQ(ierr); ierr = MPI_Wait(&r_req,&r_status); CHKERRQ(ierr); if (len > 1+is_max) { /* Add data2 into data */ data2_i = data2 + 1 + is_max; for (i=0; i<is_max; i++) { table_i = table[i]; data_i = data + 1 + is_max + Mbs*i; isz = data[1+i]; for (j=0; j<data2[1+i]; j++) { col = data2_i[j]; if (!PetscBTLookupSet(table_i,col)) data_i[isz++] = col; } data[1+i] = isz; if (i < is_max - 1) data2_i += data2[1+i]; } } k++; } } ierr = PetscFree(data2); CHKERRQ(ierr); ierr = PetscFree2(table,t_p); CHKERRQ(ierr); /* phase 1 sends are complete */ ierr = PetscMalloc1(size,&s_status); CHKERRQ(ierr); if (nrqs) { ierr = MPI_Waitall(nrqs,s_waits1,s_status); CHKERRQ(ierr); } ierr = PetscFree(data1); CHKERRQ(ierr); /* phase 2 sends are complete */ if (nrqr) { ierr = MPI_Waitall(nrqr,s_waits2,s_status); CHKERRQ(ierr); } ierr = PetscFree2(s_waits1,s_waits2); CHKERRQ(ierr); ierr = PetscFree(s_status); CHKERRQ(ierr); /* 5. Create new is[] */ /*--------------------*/ for (i=0; i<is_max; i++) { data_i = data + 1 + is_max + Mbs*i; ierr = ISCreateGeneral(PETSC_COMM_SELF,data[1+i],data_i,PETSC_COPY_VALUES,is+i); CHKERRQ(ierr); } for (k=0; k<=nodata2; k++) { ierr = PetscFree(odata2_ptr[k]); CHKERRQ(ierr); } ierr = PetscFree(odata2_ptr); CHKERRQ(ierr); PetscFunctionReturn(0); }
/*----------------------------------------------------------------------------- * Function: Parallel_daypx * Purpose: scalar a * y + x = y and gathers the complete solution vector * onto each processor * In args: a, loc_x, loc_y, y, n, comm * Out args: loc_y, y */ void Parallel_daypx(double a, double loc_x[], double loc_y[], double y[], int n, MPI_Comm comm) { daypx(a, loc_x, loc_y, n); MPI_Allgather(loc_y, n, MPI_DOUBLE, y, n, MPI_DOUBLE, comm); } /* Parallel_daypx */
/* communicate integers and doubles using point to point communication */ int COM (MPI_Comm comm, int tag, COMDATA *send, int nsend, COMDATA **recv, int *nrecv) /* recv is contiguous => free (*recv) releases all memory */ { COMDATA *cd; int rank, ncpu, send_size, (*send_sizes) [3], *send_position, *send_rank, send_count, *send_rank_all, *send_count_all, *send_rank_disp, *recv_rank, (*recv_sizes) [3], recv_count, i, j, k, l; char **send_data, **recv_data; MPI_Request *req; MPI_Status *sta; void *p; MPI_Comm_rank (comm, &rank); MPI_Comm_size (comm, &ncpu); ERRMEM (send_sizes = MEM_CALLOC (ncpu * sizeof (int [3]))); ERRMEM (send_position = MEM_CALLOC (ncpu * sizeof (int))); ERRMEM (send_rank = malloc (ncpu * sizeof (int))); ERRMEM (send_data = malloc (ncpu * sizeof (char*))); /* compute send sizes */ for (i = 0, cd = send; i < nsend; i ++, cd ++) { send_sizes [cd->rank][0] += cd->ints; send_sizes [cd->rank][1] += cd->doubles; MPI_Pack_size (cd->ints, MPI_INT, comm, &j); MPI_Pack_size (cd->doubles, MPI_DOUBLE, comm, &k); send_sizes [cd->rank][2] += (j + k); } /* allocate send buffers */ for (send_size = i = 0; i < ncpu; i ++) { if (send_sizes [i][2]) { ERRMEM (send_data [i] = malloc (send_sizes [i][2])); send_position [i] = 0; send_size += send_sizes [i][2]; } } /* pack ints */ for (i = 0, cd = send; i < nsend; i ++, cd ++) { if (cd->ints) { MPI_Pack (cd->i, cd->ints, MPI_INT, send_data [cd->rank], send_sizes [cd->rank][2], &send_position [cd->rank], comm); } } /* pack doubles */ for (i = 0, cd = send; i < nsend; i ++, cd ++) { if (cd->doubles) { MPI_Pack (cd->d, cd->doubles, MPI_DOUBLE, send_data [cd->rank], send_sizes [cd->rank][2], &send_position [cd->rank], comm); } } #if DEBUG for (i = 0; i < ncpu; i ++) { ASSERT_DEBUG (send_position [i] <= send_sizes [i][2], "Incorrect packing"); } #endif /* compute send ranks and move data */ for (send_count = i = 0; i < ncpu; i ++) { if (send_sizes [i][2]) { send_rank [send_count] = i; send_data [send_count] = send_data [i]; send_sizes [send_count][0] = send_sizes [i][0]; send_sizes [send_count][1] = send_sizes [i][1]; send_sizes [send_count][2] = send_sizes [i][2]; send_count ++; } } ERRMEM (send_count_all = malloc (ncpu * sizeof (int))); ERRMEM (recv_rank = malloc (ncpu * sizeof (int))); /* gather all send ranks */ MPI_Allgather (&send_count, 1, MPI_INT, send_count_all, 1, MPI_INT, comm); ERRMEM (send_rank_disp = malloc (ncpu * sizeof (int))); for (send_rank_disp [0] = l = i = 0; i < ncpu; i ++) { l += send_count_all [i]; if (i < ncpu-1) send_rank_disp [i+1] = l; } ERRMEM (send_rank_all = malloc (l * sizeof (int))); MPI_Allgatherv (send_rank, send_count, MPI_INT, send_rank_all, send_count_all, send_rank_disp, MPI_INT, comm); /* compute receive ranks */ for (recv_count = k = i = 0; i < l; i += send_count_all [k], k ++) { for (j = 0; j < send_count_all [k]; j ++) { if (send_rank_all [i+j] == rank) /* 'k'th rank is sending here */ { recv_rank [recv_count] = k; recv_count ++; break; } } } ERRMEM (recv_sizes = malloc (recv_count * sizeof (int [3]))); ERRMEM (req = malloc (recv_count * sizeof (MPI_Request))); ERRMEM (sta = malloc (recv_count * sizeof (MPI_Status))); /* communicate receive sizes */ for (i = 0; i < recv_count; i ++) { MPI_Irecv (recv_sizes [i], 3, MPI_INT, recv_rank [i], tag, comm, &req [i]); } MPI_Barrier (comm); for (i = 0; i < send_count; i ++) { MPI_Rsend (send_sizes [i], 3, MPI_INT, send_rank [i], tag, comm); } MPI_Waitall (recv_count, req, sta); /* contiguous receive size */ j = recv_count * sizeof (COMDATA); for (i = 0; i < 
recv_count; i ++) { j += recv_sizes [i][0] * sizeof (int) + recv_sizes [i][1] * sizeof (double); } /* prepare receive buffers */ ERRMEM (recv_data = malloc (recv_count * sizeof (char*))); ERRMEM ((*recv) = malloc (j)); p = (*recv) + recv_count; *nrecv = recv_count; for (i = 0, cd = *recv; i < recv_count; i ++, cd ++) { cd->rank = recv_rank [i]; cd->ints = recv_sizes [i][0]; cd->doubles = recv_sizes [i][1]; cd->i = p; p = (cd->i + cd->ints); cd->d = p; p = (cd->d + cd->doubles); ERRMEM (recv_data [i] = malloc (recv_sizes [i][2])); } /* communicate data */ for (i = 0; i < recv_count; i ++) { MPI_Irecv (recv_data [i], recv_sizes [i][2], MPI_PACKED, recv_rank [i], tag, comm, &req [i]); } MPI_Barrier (comm); for (i = 0; i < send_count; i ++) { MPI_Rsend (send_data [i], send_sizes [i][2], MPI_PACKED, send_rank [i], tag, comm); } MPI_Waitall (recv_count, req, sta); /* unpack data */ for (i = j = 0; i < recv_count; i ++, j = 0) { MPI_Unpack (recv_data [i], recv_sizes [i][2], &j, (*recv) [i].i, (*recv) [i].ints, MPI_INT, comm); MPI_Unpack (recv_data [i], recv_sizes [i][2], &j, (*recv) [i].d, (*recv) [i].doubles, MPI_DOUBLE, comm); } /* cleanup */ free (send_rank_disp); free (send_sizes); free (send_position); free (send_rank); for (i = 0; i < send_count; i ++) free (send_data [i]); free (send_data); free (send_count_all); free (send_rank_all); free (recv_rank); free (recv_sizes); for (i = 0; i < recv_count; i ++) free (recv_data [i]); free (recv_data); free (req); free (sta); return send_size; }
int main(int argc, char *argv[]) { int i,j,n; double **mat, *x, **mymat, *myx; double sum,temp,diff,bb; double e; int iter=0; int proc_num, my_rank; MPI_Init(&argc, &argv); // get the number of procs and rank in the comm MPI_Comm_size(MPI_COMM_WORLD, &proc_num); MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); double start_time, end_time, total_time; MPI_Status status; if(my_rank == 0) { // Proc 0 is in charge of reading matrix and distribute data if (argc != 3) { printf("Usage: mpirun -np 4 hw1_3b_mpi <filename> <error>\n"); return 1; } printf("\nInput File: %s\n", argv[1]); e = (double)atof(argv[2]); printf("error= %f\n", e); /* Opening the input file */ FILE *fp = fopen(argv[1], "r"); if (fp == NULL) { printf("Error in opening a file: %s", argv[1]); return 0; } /* Reading the maxtrix dimension */ fscanf(fp, "%d",&n); printf("n= %d\n", n); // store in row major mat = (double**) malloc( n * sizeof(double*)); for (i= 0; i<n; i++) mat[i]= (double*) malloc((n+1) * sizeof(double)); /* Reading the input matrix */ for(i=0; i<n; i++) { for(j=0; j<n+1; j++) { fscanf(fp, "%lf", &mat[i][j]); } } fclose(fp); } /* Solving the given matrix iteratively */ /* if(my_rank == 1){ int dowait = 1; while(dowait){ ; } } */ // Broadcast matrix dimension 'n' and convergence 'e' MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&e, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); int myrows = n / proc_num; // mymat store's each proc's share of matrix columns mymat = (double**) malloc( myrows * sizeof(double*)); for (i= 0; i<myrows; i++) mymat[i]= (double*) malloc((n+1) * sizeof(double)); myx = (double*) malloc((myrows) * sizeof(double)); x = (double*) malloc((n) * sizeof(double)); for(i=0; i<myrows; i++) { myx[i]=0.0; } // start time, total time should include distributing the data // to other processes as part of the parallization start_time = MPI_Wtime(); // make every proc has myrows rows of the mat if(my_rank == 0){ int dest = 0; for(i = myrows; i < n; i++){ dest = i / myrows; MPI_Send(&mat[i][0], n + 1, MPI_DOUBLE, dest, i, MPI_COMM_WORLD); } for(i = 0; i < myrows; i++){ for(j = 0; j < n + 1; j++) mymat[i][j] = mat[i][j]; } } else{ for(i = 0; i < myrows; i++){ MPI_Recv(&mymat[i][0], n + 1, MPI_DOUBLE, 0, my_rank * myrows + i, MPI_COMM_WORLD, &status); } } iter=0; double allbb; double compute_time = MPI_Wtime(); do { bb=0; // all proc get all x MPI_Allgather(myx, myrows, MPI_DOUBLE, x, myrows, MPI_DOUBLE, MPI_COMM_WORLD); for(i=0;i<myrows;i++){ sum=0; for(j=0;j<n;j++){ if(j!=i+myrows*my_rank){ sum=sum+mymat[i][j]*x[j]; } } temp=(mymat[i][n]-sum) / mymat[i][i+myrows*my_rank]; diff=fabs(x[i]-temp); if(diff>bb){ bb=diff; } myx[i]=temp; } // each process get same bb value so all can go out of loop MPI_Allreduce( &bb, &allbb, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD ); iter++; } while(allbb>=e); // gather final x for print MPI_Allgather(myx, myrows, MPI_DOUBLE, x, myrows, MPI_DOUBLE, MPI_COMM_WORLD); if(my_rank ==0 ){ // record end time of computation end_time = MPI_Wtime(); total_time = end_time - start_time; printf("Total time:%lf; Computation time is:%lf\n", total_time, end_time - compute_time); } #if DEBUG /* prints the solution */ printf("\nAnswer >>", i, x[i]); for(i=0; i<n; i++) { printf("\nx[%d]=%f", i, x[i]); } printf("\n"); #endif if(my_rank == 0) { print_x(iter, n, x); printf("\ndone\n"); } // free allocated memory for (i=0; i<myrows; i++) { free(mymat[i]); } free(mymat); free(myx); free(x); if(my_rank == 0) { for (i=0; i<n; i++) { free(mat[i]); } free(mat); } MPI_Finalize(); return 0; }
/* * Compute the aggregator-related parameters that are required in 2-phase * collective IO of ADIO. * The parameters are * . the number of aggregators (proxies) : fd->hints->cb_nodes * . the ranks of the aggregators : fd->hints->ranklist * If MP_IONODEFILE is defined, POE determines all tasks on every node listed * in the node file and defines MP_IOTASKLIST with them, making them all * aggregators. Alternatively, the user can explictly set MP_IOTASKLIST * themselves. The format of the MP_IOTASKLIST is a colon-delimited list of * task ids, the first entry being the total number of aggregators, for example * to specify 4 aggregators on task ids 0,8,16,24 the value would be: * 4:0:8:16:24. If there is no MP_IONODEFILE, or MP_IOTASKLIST, then the * default aggregator selection is 1 task per node for every node of the job - * additionally, an environment variable MP_IOAGGR_CNT can be specified, which * defines the total number of aggregators, spread evenly across all the nodes. * The romio_cb_nodes and romio_cb_config_list hint user settings are ignored. */ int ADIOI_PE_gen_agg_ranklist(ADIO_File fd) { int numAggs = 0; char *ioTaskList = getenv( "MP_IOTASKLIST" ); char *ioAggrCount = getenv("MP_IOAGGR_CNT"); int i,j; int inTERcommFlag = 0; int myRank,commSize; MPI_Comm_rank(fd->comm, &myRank); MPI_Comm_size(fd->comm, &commSize); MPI_Comm_test_inter(fd->comm, &inTERcommFlag); if (inTERcommFlag) { FPRINTF(stderr,"ERROR: ATTENTION: inTERcomms are not supported in MPI-IO - aborting....\n"); perror("ADIOI_PE_gen_agg_ranklist:"); MPI_Abort(MPI_COMM_WORLD, 1); } if (ioTaskList) { int ioTaskListLen = strlen(ioTaskList); int ioTaskListPos = 0; char tmpBuf[8]; /* Big enough for 1M tasks (7 digits task ID). */ tmpBuf[7] = '\0'; for (i=0; i<7; i++) { tmpBuf[i] = *ioTaskList++; /* Maximum is 7 digits for 1 million. */ ioTaskListPos++; if (*ioTaskList == ':') { /* If the next char is a ':' ends it. 
*/ tmpBuf[i+1] = '\0'; break; } } numAggs = atoi(tmpBuf); if (numAggs == 0) FPRINTF(stderr,"ERROR: ATTENTION: Number of aggregators specified in MP_IOTASKLIST set at 0 - default aggregator selection will be used.\n"); else if (!((numAggs > 0 ) && (numAggs <= commSize))) { FPRINTF(stderr,"ERROR: ATTENTION: The number of aggregators (%s) specified in MP_IOTASKLIST is outside the communicator task range of %d.\n",tmpBuf,commSize); numAggs = commSize; } fd->hints->ranklist = (int *) ADIOI_Malloc (numAggs * sizeof(int)); int aggIndex = 0; while (aggIndex < numAggs) { ioTaskList++; /* Advance past the ':' */ ioTaskListPos++; int allDigits=1; for (i=0; i<7; i++) { if (*ioTaskList < '0' || *ioTaskList > '9') allDigits=0; tmpBuf[i] = *ioTaskList++; ioTaskListPos++; if ( (*ioTaskList == ':') || (*ioTaskList == '\0') ) { tmpBuf[i+1] = '\0'; break; } } if (allDigits) { int newAggRank = atoi(tmpBuf); if (!((newAggRank >= 0 ) && (newAggRank < commSize))) { FPRINTF(stderr,"ERROR: ATTENTION: The aggregator '%s' specified in MP_IOTASKLIST is not within the communicator task range of 0 to %d - it will be ignored.\n",tmpBuf,commSize-1); } else { int aggAlreadyAdded = 0; for (i=0;i<aggIndex;i++) if (fd->hints->ranklist[i] == newAggRank) { aggAlreadyAdded = 1; break; } if (!aggAlreadyAdded) fd->hints->ranklist[aggIndex++] = newAggRank; else FPRINTF(stderr,"ERROR: ATTENTION: The aggregator '%d' is specified multiple times in MP_IOTASKLIST - duplicates are ignored.\n",newAggRank); } } else { FPRINTF(stderr,"ERROR: ATTENTION: The aggregator '%s' specified in MP_IOTASKLIST is not a valid integer task id - it will be ignored.\n",tmpBuf); } /* At the end check whether the list is shorter than specified. */ if (ioTaskListPos == ioTaskListLen) { if (aggIndex == 0) { FPRINTF(stderr,"ERROR: ATTENTION: No aggregators were correctly specified in MP_IOTASKLIST - default aggregator selection will be used.\n"); ADIOI_Free(fd->hints->ranklist); } else if (aggIndex < numAggs) FPRINTF(stderr,"ERROR: ATTENTION: %d aggregators were specified in MP_IOTASKLIST but only %d were correctly specified - setting the number of aggregators to %d.\n",numAggs, aggIndex,aggIndex); numAggs = aggIndex; } } } if (numAggs == 0) { MPID_Comm *mpidCommData; MPID_Comm_get_ptr(fd->comm,mpidCommData); int localSize = mpidCommData->local_size; // get my node rank int myNodeRank = mpidCommData->intranode_table[mpidCommData->rank]; int *allNodeRanks = (int *) ADIOI_Malloc (localSize * sizeof(int)); allNodeRanks[myRank] = myNodeRank; MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT, allNodeRanks, 1, MPI_INT, fd->comm); #ifdef AGG_DEBUG printf("MPID_Comm data: local_size is %d\nintranode_table entries:\n",mpidCommData->local_size); for (i=0;i<localSize;i++) { printf("%d ",mpidCommData->intranode_table[i]); } printf("\ninternode_table entries:\n"); for (i=0;i<localSize;i++) { printf("%d ",mpidCommData->internode_table[i]); } printf("\n"); printf("\nallNodeRanks entries:\n"); for (i=0;i<localSize;i++) { printf("%d ",allNodeRanks[i]); } printf("\n"); #endif if (ioAggrCount) { int cntType = -1; if ( strcasecmp(ioAggrCount, "ALL") ) { if ( (cntType = atoi(ioAggrCount)) <= 0 ) { /* Input is other non-digit or less than 1 the assume */ /* 1 aggregator per node. Note: atoi(-1) reutns -1. */ /* No warning message given here -- done earlier. 
*/ cntType = -1; } } else { /* ALL is specified set aggr count to localSize */ cntType = -2; } switch(cntType) { case -1: /* 1 aggr/node case */ { int rankListIndex = 0; fd->hints->ranklist = (int *) ADIOI_Malloc (localSize * sizeof(int)); for (i=0;i<localSize;i++) { if (allNodeRanks[i] == 0) { fd->hints->ranklist[rankListIndex++] = i; numAggs++; } } } break; case -2: /* ALL tasks case */ fd->hints->ranklist = (int *) ADIOI_Malloc (localSize * sizeof(int)); for (i=0;i<localSize;i++) { fd->hints->ranklist[i] = i; numAggs++; } break; default: /* Specific aggr count case -- MUST be less than localSize, otherwise set to localSize */ if (cntType > localSize) cntType = localSize; numAggs = cntType; // Round-robin thru allNodeRanks - pick the 0's, then the 1's, etc int currentNodeRank = 0; // node rank currently being selected as aggregator int rankListIndex = 0; int currentAllNodeIndex = 0; fd->hints->ranklist = (int *) ADIOI_Malloc (numAggs * sizeof(int)); while (rankListIndex < numAggs) { int foundEntry = 0; while (!foundEntry && (currentAllNodeIndex < localSize)) { if (allNodeRanks[currentAllNodeIndex] == currentNodeRank) { fd->hints->ranklist[rankListIndex++] = currentAllNodeIndex; foundEntry = 1; } currentAllNodeIndex++; } if (!foundEntry) { currentNodeRank++; currentAllNodeIndex = 0; } } // while break; } // switch(cntType) } // if (ioAggrCount) else { // default is 1 aggregator per node // take the 0 entries from allNodeRanks int rankListIndex = 0; fd->hints->ranklist = (int *) ADIOI_Malloc (localSize * sizeof(int)); for (i=0;i<localSize;i++) { if (allNodeRanks[i] == 0) { fd->hints->ranklist[rankListIndex++] = i; numAggs++; } } } ADIOI_Free(allNodeRanks); } if ( getenv("MP_I_SHOW_AGGRS") ) { if (myRank == 0) { printf("Agg rank list of %d generated:\n", numAggs); for (i=0;i<numAggs;i++) { printf("%d ",fd->hints->ranklist[i]); } printf("\n"); } } fd->hints->cb_nodes = numAggs; return 0; }
/* create a repetitive point to point communication pattern; * ranks and sizes must not change during the communication; * pointers to send and receive buffers data must not change */ void* COM_Pattern (MPI_Comm comm, int tag, COMDATA *send, int nsend, COMDATA **recv, int *nrecv) /* recv is contiguous => free (*recv) releases all memory */ { COMPATTERN *pattern; COMDATA *cd; int rank, ncpu, *send_rank_all, *send_count_all, *send_rank_disp, i, j, k, l; void *p; MPI_Comm_rank (comm, &rank); MPI_Comm_size (comm, &ncpu); ERRMEM (pattern = malloc (sizeof (COMPATTERN))); ERRMEM (pattern->rankmap = MEM_CALLOC (ncpu * sizeof (int))); ERRMEM (pattern->send_sizes = MEM_CALLOC (ncpu * sizeof (int [3]))); ERRMEM (pattern->send_position = MEM_CALLOC (ncpu * sizeof (int))); ERRMEM (pattern->send_rank = malloc (ncpu * sizeof (int))); ERRMEM (pattern->send_data = malloc (ncpu * sizeof (char*))); pattern->nsend = nsend; pattern->send = send; pattern->comm = comm; pattern->tag = tag; /* compute send sizes */ for (i = 0, cd = send; i < nsend; i ++, cd ++) { pattern->send_sizes [cd->rank][0] += cd->ints; pattern->send_sizes [cd->rank][1] += cd->doubles; MPI_Pack_size (cd->ints, MPI_INT, comm, &j); MPI_Pack_size (cd->doubles, MPI_DOUBLE, comm, &k); pattern->send_sizes [cd->rank][2] += (j + k); } /* allocate send buffers and prepare rank map */ for (pattern->send_size = i = j = 0; i < ncpu; i ++) { if (pattern->send_sizes [i][2]) { ERRMEM (pattern->send_data [i] = malloc (pattern->send_sizes [i][2])); pattern->rankmap [i] = j; pattern->send_size += pattern->send_sizes [i][2]; j ++; } } /* compute send ranks and move data */ for (pattern->send_count = i = 0; i < ncpu; i ++) { if (pattern->send_sizes [i][2]) { pattern->send_rank [pattern->send_count] = i; pattern->send_data [pattern->send_count] = pattern->send_data [i]; pattern->send_sizes [pattern->send_count][0] = pattern->send_sizes [i][0]; pattern->send_sizes [pattern->send_count][1] = pattern->send_sizes [i][1]; pattern->send_sizes [pattern->send_count][2] = pattern->send_sizes [i][2]; pattern->send_count ++; } } ERRMEM (send_count_all = malloc (ncpu * sizeof (int))); ERRMEM (pattern->recv_rank = malloc (ncpu * sizeof (int))); /* gather all send ranks */ MPI_Allgather (&pattern->send_count, 1, MPI_INT, send_count_all, 1, MPI_INT, comm); ERRMEM (send_rank_disp = malloc (ncpu * sizeof (int))); for (send_rank_disp [0] = l = i = 0; i < ncpu; i ++) { l += send_count_all [i]; if (i < ncpu-1) send_rank_disp [i+1] = l; } ERRMEM (send_rank_all = malloc (l * sizeof (int))); MPI_Allgatherv (pattern->send_rank, pattern->send_count, MPI_INT, send_rank_all, send_count_all, send_rank_disp, MPI_INT, comm); /* compute receive ranks */ for (pattern->recv_count = k = i = 0; i < l; i += send_count_all [k], k ++) { for (j = 0; j < send_count_all [k]; j ++) { if (send_rank_all [i+j] == rank) /* 'k'th rank is sending here */ { pattern->recv_rank [pattern->recv_count] = k; pattern->recv_count ++; break; } } } ERRMEM (pattern->recv_sizes = malloc (pattern->recv_count * sizeof (int [3]))); ERRMEM (pattern->recv_req = malloc (pattern->recv_count * sizeof (MPI_Request))); ERRMEM (pattern->recv_sta = malloc (pattern->recv_count * sizeof (MPI_Status))); ERRMEM (pattern->send_req = malloc (pattern->send_count * sizeof (MPI_Request))); ERRMEM (pattern->send_sta = malloc (pattern->send_count * sizeof (MPI_Status))); /* communicate receive sizes */ for (i = 0; i < pattern->recv_count; i ++) { MPI_Irecv (pattern->recv_sizes [i], 3, MPI_INT, pattern->recv_rank [i], tag, comm, 
&pattern->recv_req [i]); } MPI_Barrier (comm); for (i = 0; i < pattern->send_count; i ++) { MPI_Rsend (pattern->send_sizes [i], 3, MPI_INT, pattern->send_rank [i], tag, comm); } MPI_Waitall (pattern->recv_count, pattern->recv_req, pattern->recv_sta); /* contiguous receive size */ j = pattern->recv_count * sizeof (COMDATA); for (i = 0; i < pattern->recv_count; i ++) { j += pattern->recv_sizes [i][0] * sizeof (int) + pattern->recv_sizes [i][1] * sizeof (double); } /* prepare receive buffers */ ERRMEM (pattern->recv_data = malloc (pattern->recv_count * sizeof (char*))); ERRMEM (pattern->recv = malloc (j)); p = pattern->recv + pattern->recv_count; pattern->nrecv = pattern->recv_count; for (i = 0, cd = pattern->recv; i < pattern->recv_count; i ++, cd ++) { cd->rank = pattern->recv_rank [i]; cd->ints = pattern->recv_sizes [i][0]; cd->doubles = pattern->recv_sizes [i][1]; cd->i = p; p = (cd->i + cd->ints); cd->d = p; p = (cd->d + cd->doubles); ERRMEM (pattern->recv_data [i] = malloc (pattern->recv_sizes [i][2])); } /* truncate */ if (pattern->send_count) { ERRMEM (pattern->send_sizes = realloc (pattern->send_sizes, pattern->send_count * sizeof (int [3]))); ERRMEM (pattern->send_position = realloc (pattern->send_position, pattern->send_count * sizeof (int))); ERRMEM (pattern->send_rank = realloc (pattern->send_rank, pattern->send_count * sizeof (int))); ERRMEM (pattern->send_data = realloc (pattern->send_data, pattern->send_count * sizeof (char*))); } if (pattern->recv_count) ERRMEM (pattern->recv_rank = realloc (pattern->recv_rank, pattern->recv_count * sizeof (int))); /* cleanup */ free (send_rank_disp); free (send_count_all); free (send_rank_all); /* output */ *nrecv = pattern->nrecv; *recv = pattern->recv; return pattern; }
void splitSources(std::vector<double>& expandSources, std::vector<double>& directSources, std::vector<ot::TreeNode>& fgtList, std::vector<double>& sources, const unsigned int minPtsInFgt, const unsigned int FgtLev, MPI_Comm comm) { PetscLogEventBegin(splitSourcesEvent, 0, 0, 0, 0); int numPts = ((sources.size())/4); #ifdef DEBUG assert(!(sources.empty())); assert(fgtList.empty()); #endif { unsigned int px = static_cast<unsigned int>(sources[0]*(__DTPMD__)); unsigned int py = static_cast<unsigned int>(sources[1]*(__DTPMD__)); unsigned int pz = static_cast<unsigned int>(sources[2]*(__DTPMD__)); ot::TreeNode ptOct(px, py, pz, __MAX_DEPTH__, __DIM__, __MAX_DEPTH__); ot::TreeNode newFgt = ptOct.getAncestor(FgtLev); fgtList.push_back(newFgt); } for(int i = 1; i < numPts; ++i) { unsigned int px = static_cast<unsigned int>(sources[4*i]*(__DTPMD__)); unsigned int py = static_cast<unsigned int>(sources[(4*i)+1]*(__DTPMD__)); unsigned int pz = static_cast<unsigned int>(sources[(4*i)+2]*(__DTPMD__)); ot::TreeNode ptOct(px, py, pz, __MAX_DEPTH__, __DIM__, __MAX_DEPTH__); ot::TreeNode newFgt = ptOct.getAncestor(FgtLev); if(fgtList[fgtList.size() - 1] == newFgt) { fgtList[fgtList.size() - 1].addWeight(1); } else { fgtList.push_back(newFgt); } }//end for i #ifdef DEBUG assert(!(fgtList.empty())); #endif int rank; int npes; MPI_Comm_rank(comm, &rank); MPI_Comm_size(comm, &npes); int localFlag = 0; if( (rank > 0) && (rank < (npes - 1)) && ((fgtList.size()) == 1) ) { localFlag = 1; } int globalFlag; MPI_Allreduce(&localFlag, &globalFlag, 1, MPI_INT, MPI_SUM, comm); int prevRank = rank - 1; int nextRank = rank + 1; if(globalFlag > 0) { int gatherSendBuf = 0; if( (rank > 0) && (rank < (npes - 1)) && (fgtList.size() == 1) ) { gatherSendBuf = sources.size(); } int* gatherList = new int[npes]; MPI_Allgather((&gatherSendBuf), 1, MPI_INT, gatherList, 1, MPI_INT, comm); if(rank > 0) { while(gatherList[prevRank] > 0) { --prevRank; }//end while } if(rank < (npes - 1)) { while(gatherList[nextRank] > 0) { ++nextRank; }//end while } int* sendFgtCnts = new int[npes]; int* recvFgtCnts = new int[npes]; int* sendSourceCnts = new int[npes]; int* recvSourceCnts = new int[npes]; for(int i = 0; i < npes; ++i) { sendFgtCnts[i] = 0; recvFgtCnts[i] = 0; sendSourceCnts[i] = 0; recvSourceCnts[i] = 0; }//end i if(gatherSendBuf > 0) { sendFgtCnts[prevRank] = 1; sendSourceCnts[prevRank] = gatherSendBuf; } for(int i = rank + 1; i < nextRank; ++i) { recvFgtCnts[i] = 1; recvSourceCnts[i] = gatherList[i]; }//end i delete [] gatherList; int* sendFgtDisps = new int[npes]; int* recvFgtDisps = new int[npes]; sendFgtDisps[0] = 0; recvFgtDisps[0] = 0; for(int i = 1; i < npes; ++i) { sendFgtDisps[i] = sendFgtDisps[i - 1] + sendFgtCnts[i - 1]; recvFgtDisps[i] = recvFgtDisps[i - 1] + recvFgtCnts[i - 1]; }//end i std::vector<ot::TreeNode> tmpFgtList(recvFgtDisps[npes - 1] + recvFgtCnts[npes - 1]); ot::TreeNode* recvFgtBuf = NULL; if(!(tmpFgtList.empty())) { recvFgtBuf = (&(tmpFgtList[0])); } MPI_Alltoallv( (&(fgtList[0])), sendFgtCnts, sendFgtDisps, par::Mpi_datatype<ot::TreeNode>::value(), recvFgtBuf, recvFgtCnts, recvFgtDisps, par::Mpi_datatype<ot::TreeNode>::value(), comm); if(gatherSendBuf > 0) { fgtList.clear(); } else { for(int i = 0; i < tmpFgtList.size(); ++i) { if(tmpFgtList[i] == fgtList[fgtList.size() - 1]) { fgtList[fgtList.size() - 1].addWeight(tmpFgtList[i].getWeight()); } else { fgtList.push_back(tmpFgtList[i]); } }//end i } delete [] sendFgtCnts; delete [] recvFgtCnts; delete [] sendFgtDisps; delete [] recvFgtDisps; int* 
sendSourceDisps = new int[npes]; int* recvSourceDisps = new int[npes]; sendSourceDisps[0] = 0; recvSourceDisps[0] = 0; for(int i = 1; i < npes; ++i) { sendSourceDisps[i] = sendSourceDisps[i - 1] + sendSourceCnts[i - 1]; recvSourceDisps[i] = recvSourceDisps[i - 1] + recvSourceCnts[i - 1]; }//end i std::vector<double> tmpSources(recvSourceDisps[npes - 1] + recvSourceCnts[npes - 1]); double* recvSourceBuf = NULL; if(!(tmpSources.empty())) { recvSourceBuf = (&(tmpSources[0])); } MPI_Alltoallv( (&(sources[0])), sendSourceCnts, sendSourceDisps, MPI_DOUBLE, recvSourceBuf, recvSourceCnts, recvSourceDisps, MPI_DOUBLE, comm); if(gatherSendBuf > 0) { sources.clear(); } else { if(!(tmpSources.empty())) { sources.insert(sources.end(), tmpSources.begin(), tmpSources.end()); } } delete [] sendSourceCnts; delete [] recvSourceCnts; delete [] sendSourceDisps; delete [] recvSourceDisps; } if(!(fgtList.empty())) { ot::TreeNode prevFgt; ot::TreeNode nextFgt; ot::TreeNode firstFgt = fgtList[0]; ot::TreeNode lastFgt = fgtList[fgtList.size() - 1]; MPI_Request recvPrevReq; MPI_Request recvNextReq; MPI_Request sendFirstReq; MPI_Request sendLastReq; if(rank > 0) { MPI_Irecv(&prevFgt, 1, par::Mpi_datatype<ot::TreeNode>::value(), prevRank, 1, comm, &recvPrevReq); MPI_Isend(&firstFgt, 1, par::Mpi_datatype<ot::TreeNode>::value(), prevRank, 2, comm, &sendFirstReq); } if(rank < (npes - 1)) { MPI_Irecv(&nextFgt, 1, par::Mpi_datatype<ot::TreeNode>::value(), nextRank, 2, comm, &recvNextReq); MPI_Isend(&lastFgt, 1, par::Mpi_datatype<ot::TreeNode>::value(), nextRank, 1, comm, &sendLastReq); } if(rank > 0) { MPI_Status status; MPI_Wait(&recvPrevReq, &status); MPI_Wait(&sendFirstReq, &status); } if(rank < (npes - 1)) { MPI_Status status; MPI_Wait(&recvNextReq, &status); MPI_Wait(&sendLastReq, &status); } bool removeFirst = false; bool addToLast = false; if(rank > 0) { if(prevFgt == firstFgt) { removeFirst = true; } } if(rank < (npes - 1)) { if(nextFgt == lastFgt) { addToLast = true; } } MPI_Request recvPtsReq; if(addToLast) { numPts = ((sources.size())/4); sources.resize(4*(numPts + (nextFgt.getWeight()))); fgtList[fgtList.size() - 1].addWeight(nextFgt.getWeight()); MPI_Irecv((&(sources[4*numPts])), (4*(nextFgt.getWeight())), MPI_DOUBLE, nextRank, 3, comm, &recvPtsReq); } if(removeFirst) { MPI_Send((&(sources[0])), (4*(firstFgt.getWeight())), MPI_DOUBLE, prevRank, 3, comm); fgtList.erase(fgtList.begin()); } if(addToLast) { MPI_Status status; MPI_Wait(&recvPtsReq, &status); } if(removeFirst) { sources.erase(sources.begin(), sources.begin() + (4*(firstFgt.getWeight()))); } } #ifdef DEBUG assert(expandSources.empty()); assert(directSources.empty()); #endif std::vector<ot::TreeNode> dummyList; int sourceIdx = 0; for(size_t i = 0; i < fgtList.size(); ++i) { if((fgtList[i].getWeight()) < minPtsInFgt) { directSources.insert(directSources.end(), (sources.begin() + sourceIdx), (sources.begin() + sourceIdx + (4*(fgtList[i].getWeight())))); } else { dummyList.push_back(fgtList[i]); expandSources.insert(expandSources.end(), (sources.begin() + sourceIdx), (sources.begin() + sourceIdx + (4*(fgtList[i].getWeight())))); } sourceIdx += (4*(fgtList[i].getWeight())); }//end i swap(dummyList, fgtList); #ifdef DEBUG assert((sources.size()) == ((directSources.size()) + (expandSources.size()))); #endif PetscLogEventEnd(splitSourcesEvent, 0, 0, 0, 0); }
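/* [editor's sketch, not from the original sources] splitSources merges octants
   duplicated across partition boundaries by exchanging the first and last entry
   with the previous and next rank via tagged Isend/Irecv pairs. The same
   neighbor exchange can be written with MPI_Sendrecv, where MPI_PROC_NULL makes
   the end ranks fall out naturally without special-casing: */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
  int rank, size, first, last, prev_last = -1, next_first = -1;
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  int prev = (rank > 0) ? rank - 1 : MPI_PROC_NULL;
  int next = (rank < size - 1) ? rank + 1 : MPI_PROC_NULL;
  first = 10 * rank;            /* stand-ins for fgtList.front()/back() */
  last  = 10 * rank + 9;

  /* send my last element right while receiving my left neighbor's last */
  MPI_Sendrecv(&last, 1, MPI_INT, next, 1,
               &prev_last, 1, MPI_INT, prev, 1,
               MPI_COMM_WORLD, MPI_STATUS_IGNORE);
  /* send my first element left while receiving my right neighbor's first */
  MPI_Sendrecv(&first, 1, MPI_INT, prev, 2,
               &next_first, 1, MPI_INT, next, 2,
               MPI_COMM_WORLD, MPI_STATUS_IGNORE);

  /* a rank would now compare prev_last against its own first (and next_first
     against its last) to decide whether boundary items are duplicates to merge */
  printf("rank %d: prev_last=%d next_first=%d\n", rank, prev_last, next_first);
  MPI_Finalize();
  return 0;
}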
template <class U> template <class T> PCrsMatrix<U>::PCrsMatrix(const CrsMatrix<T>& s, const MPI_Comm ncomm) { mpi_init(ncomm); int iamsender=(s.rows()>0?1:0); //! we rely on the fact that only one node has this cond. fulfilled. // now we let everyone know who is the sender std::valarray<int> slist(mysize); MPI_Allgather(&iamsender, 1, MPI_INT, &slist[0], 1, MPI_INT, mycomm); int sender=-1; for (int i=0; i< mysize; ++i) if (slist[i]==1) { if(sender==-1) sender=i; else ERROR("More than one process qualified as sender!"); } if (sender==-1) ERROR("No process qualified as sender!"); // now we get the matrix size and resize it. typename PCrsMatrix<U>::index_type dim[2]; if (iamsender) { dim[0]=s.rows(); dim[1]=s.cols(); } MPI_Bcast(dim,2,mpi_index,sender,mycomm); resize(dim[0],dim[1]); // now we copy the options, as if it were an array of char... MPI_Bcast(&(const_cast<CrsMatrix<T> &>(s).pops),sizeof(pops),MPI_CHAR,sender,mycomm); setops(s.pops); // now we can send out the row indices to the nodes. unsigned long nmyrows=nroots[myrank+1]-nroots[myrank]; MPI_Request rreq; MPI_Irecv(&pmat.rpoints[0],nmyrows+1,mpi_index,sender,101,mycomm,&rreq); if (iamsender) { for (int i=0; i<mysize; ++i) MPI_Send(&(const_cast<CrsMatrix<T> &>(s).rpoints[nroots[i]]),nroots[i+1]-nroots[i]+1,mpi_index,i,101,mycomm); }; //wait for receive MPI_Status rstatus; MPI_Wait(&rreq, &rstatus); //then shift the indices as necessary, since we are getting chunks of data for (typename PCrsMatrix<U>::index_type i=1;i<=nmyrows; ++i) pmat.rpoints[i]-=pmat.rpoints[0]; pmat.rpoints[0]=0; pmat.presize(pmat.rpoints[nmyrows]); //very well. now we can share the column indices and the data! MPI_Request rreq_i, rreq_d; MPI_Irecv(&pmat.indices[0],pmat.rpoints[nmyrows],mpi_index,sender,202,mycomm,&rreq_i); if (iamsender) { for (int i=0; i<mysize; ++i) MPI_Send(&(const_cast<CrsMatrix<T> &>(s).indices[s.rpoints[nroots[i]]]), s.rpoints[nroots[i+1]]-s.rpoints[nroots[i]],mpi_index,i,202,mycomm); }; MPI_Irecv(&pmat.values[0],pmat.rpoints[nmyrows],mpi_data,sender,303,mycomm,&rreq_d); if (iamsender) { for (int i=0; i<mysize; ++i) MPI_Send(&(const_cast<CrsMatrix<T> &>(s).values[s.rpoints[nroots[i]]]), s.rpoints[nroots[i+1]]-s.rpoints[nroots[i]],mpi_data,i,303,mycomm); }; MPI_Wait(&rreq_i, &rstatus); MPI_Wait(&rreq_d, &rstatus); }
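/* [editor's sketch, not from the original sources] The constructor above ships
   each rank a contiguous slice of the global rpoints array and then rebases it
   so the local slice starts at zero. That rebasing step in isolation (names
   here are illustrative only): */
#include <stdio.h>

/* Shift a received row-pointer chunk so local row 0 starts at offset 0.
   rpoints has nrows+1 entries; on entry rpoints[0] is the global offset of the
   first local nonzero, on exit rpoints[0] == 0 and rpoints[nrows] is the local
   nonzero count. */
static void rebase_rpoints(long *rpoints, long nrows)
{
  long base = rpoints[0];
  for (long i = 1; i <= nrows; ++i) rpoints[i] -= base;
  rpoints[0] = 0;
}

int main(void)
{
  long rp[4] = {100, 103, 103, 110};   /* global offsets for a 3-row slice */
  rebase_rpoints(rp, 3);
  printf("%ld %ld %ld %ld\n", rp[0], rp[1], rp[2], rp[3]);   /* 0 3 3 10 */
  return 0;
}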
/*------------------------------------------------*/ int main (int argc, char **argv) { int cols, rows, iter, particles, x, y; int *pic; PartStr *p, *changes, *totalChanges; int rank, num, i, numChanges, numTotalChanges; int *changesPerNode, *buffDispl; MPI_Init (&argc, &argv); MPI_Comm_rank (MPI_COMM_WORLD, &rank); MPI_Comm_size (MPI_COMM_WORLD, &num); if (argc < 5) // use default values unless the user supplies all four arguments (cols rows particles iter) { cols = PIC_SIZE + 2; rows = PIC_SIZE + 2; iter = MAX_ITER; particles = PARTICLES; } else { cols = atoi (argv[1]) + 2; rows = atoi (argv[2]) + 2; particles = atoi (argv[3]); iter = atoi (argv[4]); } // initialize the random number generator srand(rank); // srand(time(0)); // this should be used instead if the program runs on multiple hosts int particlesPerNode = particles / num; if (rank == num - 1) particlesPerNode = particles - particlesPerNode * (num - 1); // in case particles cannot be split evenly // printf("%i has %i\n", rank, particlesPerNode); pic = (int *) malloc (sizeof (int) * cols * rows); p = (PartStr *) malloc (sizeof (PartStr) * particlesPerNode); changes = (PartStr *) malloc (sizeof (PartStr) * particlesPerNode); totalChanges = (PartStr *) malloc (sizeof (PartStr) * particles); // gathered changes from all ranks can total up to `particles` changesPerNode = (int *) malloc (sizeof (int) * num); buffDispl = (int *) malloc (sizeof (int) * num); assert (pic != 0 && p != 0 && changes != 0 && totalChanges != 0 && changesPerNode != 0 && buffDispl != 0); // MPI user type declaration int lengths[2] = { 1, 1 }; MPI_Datatype types[2] = { MPI_INT, MPI_INT }; MPI_Aint add1, add2; MPI_Aint displ[2]; MPI_Datatype Point; MPI_Address (p, &add1); MPI_Address (&(p[0].y), &add2); displ[0] = 0; displ[1] = add2 - add1; MPI_Type_struct (2, lengths, displ, types, &Point); MPI_Type_commit (&Point); dla_init_plist (pic, rows, cols, p, particlesPerNode, 1); while (--iter) { dla_evolve_plist (pic, rows, cols, p, &particlesPerNode, changes, &numChanges); // printf("%i changed %i on iter %i : ",rank, numChanges, iter); // for(i=0;i<numChanges;i++) printf("(%i, %i) ", changes[i].x, changes[i].y); // printf("\n"); //exchange information with other nodes MPI_Allgather (&numChanges, 1, MPI_INT, changesPerNode, 1, MPI_INT, MPI_COMM_WORLD); //calculate offsets numTotalChanges = 0; for (i = 0; i < num; i++) { buffDispl[i] = numTotalChanges; numTotalChanges += changesPerNode[i]; } // if(rank==0) // { // for(i=0;i<num;i++) // printf("%i tries to send %i\n",i,changesPerNode[i]); // printf("-----------\n"); // } if(numTotalChanges >0) { MPI_Allgatherv (changes, numChanges, Point, totalChanges, changesPerNode, buffDispl, Point, MPI_COMM_WORLD); apply_changes (pic, rows, cols, totalChanges, numTotalChanges); // if(rank==0) // { // printf("Total changes %i : ", numTotalChanges); // for(i=0;i<numTotalChanges;i++) printf("(%i, %i) ", totalChanges[i].x, totalChanges[i].y); // // printf("\n"); // printf("-----------\n"); // } } } /* Print to stdout a PBM picture of the simulation space */ if (rank == 0) { printf ("P1\n%i %i\n", cols - 2, rows - 2); for (y = 1; y < rows - 1; y++) { for (x = 1; x < cols - 1; x++) { if (pic[y * cols + x] < 0) printf ("1 "); else printf ("0 "); } printf ("\n"); } } MPI_Reduce(&particlesPerNode, &particles, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD); if(rank==0) fprintf(stderr, "Remaining particles %i\n", particles); free (pic); free (p); free (changes); free (totalChanges); free (changesPerNode); free (buffDispl); MPI_Type_free (&Point); MPI_Finalize (); return 0; }
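/* [editor's note] MPI_Address and MPI_Type_struct above are MPI-1 calls that
   were deprecated in MPI-2 and removed in MPI-3. A sketch of the same Point
   datatype built with the current API (MPI_Get_address /
   MPI_Type_create_struct); PartStr is assumed to be the { int x; int y; }
   struct used by the program: */
#include <mpi.h>

typedef struct { int x; int y; } PartStr;

static MPI_Datatype make_point_type(const PartStr *probe)
{
  int lengths[2] = { 1, 1 };
  MPI_Datatype types[2] = { MPI_INT, MPI_INT };
  MPI_Aint base, addr_y, displ[2];
  MPI_Datatype point;

  MPI_Get_address(probe, &base);
  MPI_Get_address(&probe->y, &addr_y);
  displ[0] = 0;
  displ[1] = addr_y - base;          /* offset of y relative to the struct start */
  MPI_Type_create_struct(2, lengths, displ, types, &point);
  MPI_Type_commit(&point);
  return point;                      /* caller eventually calls MPI_Type_free */
}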
int MeshData::getNodesWithKnownValues(pMesh theMesh){ pEntity node, edge, face; int i,ID; double coord1[3],coord2[3],x1,y1,x2,y2; // search for flagged nodes VIter vit = M_vertexIter( theMesh ); while ( (node = VIter_next(vit)) ){ int flag = (!node->getClassification())?0:GEN_tag(node->getClassification()); if ( !pSimPar->isNodeFree(flag) ){ ID = get_AppToPETSc_Ordering(EN_id(node)); dirichlet[ID] = pSimPar->getBC_Value(flag); } } VIter_delete(vit); // search for flagged edges EIter eit = M_edgeIter( theMesh ); while ( (edge = EIter_next(eit)) ){ if (!theMesh->getRefinementDepth(edge)){ int flag = EN_getFlag(edge); /* * This is a way to use the simulator to evaluate an elliptic equation without screwing up the input data procedure. */ V_coord(edge->get(0,0),coord1); x1 = coord1[0]; y1 = coord1[1]; V_coord(edge->get(0,1),coord2); x2 = coord2[0]; y2 = coord2[1]; #ifdef CRUMPTON_EXAMPLE if (flag >= 2000 && flag <3000 && flag != 2005 ){ // take only external boundary edges // boundary conditions: // u(x,y) = [2*sin(y)+cos(y)]alpha*x + sin(y), x <= 0 // exp(x)sin(y), x > 0 (sin keeps u continuous at x = 0, matching the code below) ID = get_AppToPETSc_Ordering(EN_id(edge->get(0,0))); dirichlet[ID] = (x1 <= .0)?((2.*sin(y1)+cos(y1))*ALPHA*x1 + sin(y1)):exp(x1)*sin(y1); ID = get_AppToPETSc_Ordering(EN_id(edge->get(0,1))); dirichlet[ID] = (x2 <= .0)?((2.*sin(y2)+cos(y2))*ALPHA*x2 + sin(y2)):exp(x2)*sin(y2); } #else if ( !pSimPar->isNodeFree(flag) ) for (i=0; i<2; i++){ ID = get_AppToPETSc_Ordering(EN_id(edge->get(0,i))); dirichlet[ID] = pSimPar->getBC_Value(flag); } #endif } } EIter_delete(eit); // search for flagged faces (on boundary only) // ------------------------------------------------------- if (theMesh->getDim()==3){ // External boundary condition defined // ------------------------------------------------------- if (pSimPar->SimulationHas_BC_ExternalDefinition()){ double coords[3], x, y, z; FIter fit = M_faceIter( theMesh ); while ( (face = FIter_next(fit)) ){ if (!theMesh->getRefinementDepth(face)){ int flag = EN_getFlag(face); if ( !pSimPar->isNodeFree(flag) ){ for (i=0; i<3; i++){ ID = EN_id(face->get(0,i)); pVertex v = (pVertex)theMesh->getVertex(ID); ID = get_AppToPETSc_Ordering(ID); V_coord(v,coords); x = coords[0]; y = coords[1]; z = coords[2]; dirichlet[ID] = pSimPar->exact_solution(x,y,z); } } } } FIter_delete(fit); } else{ // conventional (Dirichlet) boundary condition: specified in numeric.dat // -------------------------------------------------------------------- FIter fit = M_faceIter( theMesh ); while ( (face = FIter_next(fit)) ){ if (!theMesh->getRefinementDepth(face)){ int flag = EN_getFlag(face); if ( !pSimPar->isNodeFree(flag) ){ for (i=0; i<3; i++){ ID = get_AppToPETSc_Ordering(EN_id(face->get(0,i))); dirichlet[ID] = pSimPar->getBC_Value(flag); } } } } FIter_delete(fit); } } if (!P_getSumInt(dirichlet.size()) ){ throw Exception(__LINE__,__FILE__,"Prescribed (dirichlet) nodes were not found. 
Simulation cannot proceed.\n"); } // go ahead only if parallel if (P_size()==1) return 0; // now, all partitions must know all prescribed nodes and their flags // first of all, let every partition know how many prescribed nodes exist on each one // (a count of zero is legal in MPI_Allgatherv, so nLPN may be 0 on partitions without prescribed nodes) int nLPN = dirichlet.size(); int *recvLP = new int[P_size()]; MPI_Allgather ( &nLPN, 1, MPI_INT, recvLP, 1, MPI_INT, MPI_COMM_WORLD ); // number of global prescribed nodes // Note that nGPN is not necessarily the real number of global prescribed nodes // Nodes on partition boundary can be counted twice or more int nGPN=0; for (i=0; i<P_size(); i++) nGPN += recvLP[i]; // sPIDs = send prescribed IDs sPFlags = send prescribed flags i=0; int *sPIDs = new int[nLPN]; double *sPFlags = new double[nLPN]; for(MIter mit = dirichlet.begin(); mit != dirichlet.end(); mit++){ sPIDs[i] = mit->first; sPFlags[i] = mit->second; i++; } // rcounts says how many values each rank will send int *rcounts = recvLP; // displs says where to start to read in recv buffer int *displs = new int[P_size()]; displs[0] = 0; for (i=1; i<P_size(); i++) displs[i] = displs[i-1]+recvLP[i-1]; int *rPIDs = new int[nGPN]; // get all prescribed nodes MPI_Allgatherv(sPIDs,nLPN,MPI_INT,rPIDs,rcounts,displs,MPI_INT,MPI_COMM_WORLD); double *rPFlags = new double[nGPN]; // get flags from all prescribed nodes MPI_Allgatherv(sPFlags,nLPN,MPI_DOUBLE,rPFlags,rcounts,displs,MPI_DOUBLE,MPI_COMM_WORLD); for (i=0; i<nGPN; i++) dirichlet.insert( pair<int,double>(rPIDs[i],rPFlags[i]) ); delete[] sPIDs; sPIDs=0; delete[] sPFlags; sPFlags=0; delete[] rPIDs; rPIDs=0; delete[] rPFlags; rPFlags=0; delete[] rcounts; rcounts=0; delete[] displs; displs=0; return 0; }
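/* [editor's sketch, not from the original sources] The pattern above (Allgather
   the per-rank counts, build the displacement array as an exclusive prefix sum,
   then Allgatherv the payload) recurs throughout this file; distilled: */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
  int rank, size;
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  int mycount = rank + 1;                    /* variable per-rank count */
  int *mydata = malloc(mycount * sizeof(int));
  for (int i = 0; i < mycount; ++i) mydata[i] = rank;

  int *counts = malloc(size * sizeof(int));
  int *displs = malloc(size * sizeof(int));
  MPI_Allgather(&mycount, 1, MPI_INT, counts, 1, MPI_INT, MPI_COMM_WORLD);

  displs[0] = 0;                             /* exclusive prefix sum of counts */
  for (int i = 1; i < size; ++i) displs[i] = displs[i - 1] + counts[i - 1];
  int total = displs[size - 1] + counts[size - 1];

  int *all = malloc(total * sizeof(int));
  MPI_Allgatherv(mydata, mycount, MPI_INT, all, counts, displs, MPI_INT, MPI_COMM_WORLD);

  if (rank == 0) { for (int i = 0; i < total; ++i) printf("%d ", all[i]); printf("\n"); }
  free(mydata); free(counts); free(displs); free(all);
  MPI_Finalize();
  return 0;
}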
int main (int argc, char **argv) { int mpi_rank, mpi_size; int n, ln; double *A, *b, *xA, *xB, *xC, *r, *X, *x; double lmax, max; struct timeval before, after; MPI_Init (&argc, &argv); MPI_Comm_size (MPI_COMM_WORLD, &mpi_size); MPI_Comm_rank (MPI_COMM_WORLD, &mpi_rank); /* Get the argument that indicates the problem size */ n = JACOBI_DEF_SIZE; if (argc == 2) n = atoi (argv[1]); if (mpi_rank == ROOT_NODE) fprintf (stdout, "n = %d\n", n); ln = n / mpi_size; /* Initialize the random seed */ /*srandom((unsigned int) getpid ());*/ /* Allocate memory */ A = (double *) calloc (ln * n, sizeof(double)); if (A == NULL) { perror ("calloc"); MPI_Finalize (); return EXIT_FAILURE; } b = (double *) calloc (ln, sizeof(double)); if (b == NULL) { perror ("calloc"); MPI_Finalize (); return EXIT_FAILURE; } xA = (double *) calloc (ln, sizeof(double)); if (xA == NULL) { perror ("calloc"); MPI_Finalize (); return EXIT_FAILURE; } xB = (double *) calloc (ln, sizeof(double)); if (xB == NULL) { perror ("calloc"); MPI_Finalize (); return EXIT_FAILURE; } xC = (double *) calloc (ln, sizeof(double)); if (xC == NULL) { perror ("calloc"); MPI_Finalize (); return EXIT_FAILURE; } X = (double *) calloc (n, sizeof(double)); if (X == NULL) { perror ("calloc"); MPI_Finalize (); return EXIT_FAILURE; } r = (double *) calloc (ln, sizeof(double)); if (r == NULL) { perror ("calloc"); MPI_Finalize (); return EXIT_FAILURE; } generate_matrix (A, ln, n, mpi_rank); generate_vector (b, ln); gettimeofday(&before, NULL); x = jacobi (xA, xB, xC, A, b, ln, n, mpi_rank, mpi_size); gettimeofday(&after, NULL); MPI_Allgather (x, ln, MPI_DOUBLE, X, ln, MPI_DOUBLE, MPI_COMM_WORLD); /* Compute the residual */ compute_residual (r, A, X, b, ln, n); /* Compute the maximum absolute value of the residual */ lmax = find_max_abs (r, ln); MPI_Reduce (&lmax, &max, 1, MPI_DOUBLE, MPI_MAX, ROOT_NODE, MPI_COMM_WORLD); if (mpi_rank == ROOT_NODE) display_info (A, x, b, r, n, max, &before, &after); /* Free the memory */ free (A); free (b); free (xA); free (xB); free (xC); free (X); free (r); /* Return success */ MPI_Finalize (); return 0; }
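/* [editor's note] The Jacobi driver above assumes mpi_size divides n; otherwise
   ln * mpi_size < n and the tail rows are silently dropped by the equal-count
   MPI_Allgather. A common remedy is a block distribution that spreads the
   remainder, which then requires MPI_Allgatherv instead; the owner arithmetic: */
#include <mpi.h>

/* rows owned by `rank` when n rows are split over `size` ranks */
static int local_rows(int n, int size, int rank)
{
  return n / size + (rank < n % size ? 1 : 0);
}

/* first global row owned by `rank`: the remainder ranks each absorb one extra */
static int first_row(int n, int size, int rank)
{
  int q = n / size, r = n % size;
  return rank * q + (rank < r ? rank : r);
}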
void MeshData::reorderVerticesIds(pMesh theMesh, int (*pFunc_numRemoteCopies)(pEntity)){ // rank p must take all remote nodes set<int> remoteNodesSet; set<int>::iterator Iter; int numUniqueNodes = 0; int i = 0; pEntity node; VIter vit = M_vertexIter(theMesh); while ( (node=VIter_next(vit)) ){ // if ( pFunc_numRemoteCopies(node) ){ // remoteNodesSet.insert(EN_id(node)); // } // else{ numUniqueNodes++; // } } i = 0; int numRemoteNodes = remoteNodesSet.size(); int *remoteNodes = new int[numRemoteNodes]; // transfer nodes from set to an array for (Iter=remoteNodesSet.begin(); Iter != remoteNodesSet.end(); Iter++) remoteNodes[i++] = *Iter; // each rank must know how many nodes it will receive from all other ranks int *numAllRemoteNodes = new int[P_size()]; MPI_Allgather(&numRemoteNodes,1,MPI_INT,numAllRemoteNodes,1,MPI_INT,MPI_COMM_WORLD); int total = 0; for (i=0; i<P_size(); i++) total += numAllRemoteNodes[i]; int *allRemoteNodes = new int[total]; int *displs = new int[P_size()]; displs[0] = 0; for (i=1; i<P_size(); i++) displs[i] = displs[i-1] + numAllRemoteNodes[i-1]; // receive remote nodes from all other ranks MPI_Allgatherv (remoteNodes,numRemoteNodes,MPI_INT,allRemoteNodes,numAllRemoteNodes,displs,MPI_INT,MPI_COMM_WORLD); delete[] displs; remoteNodesSet.clear(); // how many nodes from allRemoteNodes will be checked by rank p total = 0; for (i=0;i<P_pid(); i++) total += numAllRemoteNodes[i]; delete[] numAllRemoteNodes; // transfer to a set container only those remote nodes from allRemoteNodes from ranks // lower than rank p for (i=0;i<total;i++) remoteNodesSet.insert(allRemoteNodes[i]); delete[] allRemoteNodes; // duplicated numbers in different ranks must be avoided so as not to disturb PETSc! // each rank will check which remote nodes from rank p belong also to ranks lower than it // store on another set container those nodes that do not belong to ranks lower than rank p set<int> remoteNodesLowSet; for (i=0;i<numRemoteNodes;i++){ Iter = remoteNodesSet.find(remoteNodes[i]); if ( Iter == remoteNodesSet.end()) remoteNodesLowSet.insert(remoteNodes[i]); } remoteNodesSet.clear(); delete[] remoteNodes; // N means how many nodes rank p should map. int N = numUniqueNodes + remoteNodesLowSet.size(); // rank 0: mapping = 1,2,3,...,N0 // rank 1: mapping = N0+1,N0+2,N0+3,...,N0+N1 // rank 2: mapping = N0+N1+1,N0+N1+2,N0+N1+3,...,N0+N1+N2 this->numGN = P_getSumInt(N); // each rank should know all Ni's int *recvNs = new int[P_size()]; MPI_Allgather(&N,1,MPI_INT,recvNs,1,MPI_INT,MPI_COMM_WORLD); // start mapping int *apOrdering = new int[N]; int *petscOrdering = new int[N]; int from=1, ID, j=0; for (i=0; i<P_pid(); i++) from += recvNs[i]; i = from; vit = M_vertexIter(theMesh); while ( (node=VIter_next(vit)) ){ ID = EN_id(node); // if ( pFunc_numRemoteCopies(node) ){ // Iter = remoteNodesLowSet.find( ID ); // if ( Iter != remoteNodesLowSet.end() ){ // apOrdering[j] = ID; // petscOrdering[j] = i++; // j++; // } // } // else{ apOrdering[j] = ID; petscOrdering[j] = i++; j++; // } } remoteNodesLowSet.clear(); delete[] recvNs; // PETSc will be used to do the parallel job AOCreateMapping(PETSC_COMM_WORLD,N,apOrdering,petscOrdering,&ao); delete[] apOrdering; delete[] petscOrdering; }
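/* [editor's sketch, not from the original sources] reorderVerticesIds derives
   each rank's starting ID by Allgathering every Ni and summing those below it.
   MPI_Exscan computes that exclusive prefix directly (the result on rank 0 is
   undefined, where the offset is simply 0): */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
  int rank, N, from = 0;
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  N = rank + 1;                         /* stand-in for the local node count */
  MPI_Exscan(&N, &from, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
  if (rank == 0) from = 0;              /* Exscan leaves rank 0 undefined */
  /* 1-based, mirroring the `from = 1` convention used above */
  printf("rank %d numbers its nodes from %d\n", rank, from + 1);
  MPI_Finalize();
  return 0;
}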
int Zoltan_PHG_Gather_To_All_Procs( ZZ *zz, HGraph *phg, /* Input: Local part of distributed hypergraph */ PHGPartParams *hgp, /* Input: Hypergraph parameters */ PHGComm *scomm, /* Input: Serial PHGComm for use by shg. */ HGraph **gathered_hg /* Output: gathered hypergraph returned on each proc */ ) { /* * Function to gather distributed hypergraph onto each processor for * coarsest partitioning. * First, hypergraph arrays for the hypergraph on a column of processors * are built using MPI_Allgathers down the processor columns. * These hypergraph arrays contain complete info about a subset of vertices. * Second, the column hypergraphs are gathered along processor rows. * Each processor then has a complete description of the hypergraph. */ char *yo = "Zoltan_PHG_Gather_To_All_Procs"; int ierr = ZOLTAN_OK; int i, tmp, sum; int *each = NULL, *disp = NULL; /* Size and displacement arrays for MPI_Allgatherv */ int *send_buf = NULL; /* Buffer of values to be sent */ int send_size; /* Size of buffer send_buf */ int *col_vedge = NULL; /* vedge array for the proc-column hypergraph */ int *col_vindex = NULL; /* vindex array for the proc-column hypergraph */ int *col_hvertex = NULL; /* hvertex array for the proc-column hypergraph */ int *col_hindex = NULL; /* hindex array for the proc-column hypergraph */ int col_nVtx; /* Number of vertices in processor column */ int col_nEdge; /* Number of edges in processor column */ int col_nPin; /* Number of pins in processor column */ int *recv_size = NULL; /* nPins for each proc in col or row */ HGraph *shg; /* Pointer to the serial hypergraph to be returned by this function. */ int myProc_x = phg->comm->myProc_x; int nProc_x = phg->comm->nProc_x; int nProc_y = phg->comm->nProc_y; int max_nProc_xy = MAX(nProc_x, nProc_y); if (phg->comm->nProc == 1) { ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Do not call this routine on one proc."); return ZOLTAN_FATAL; } #ifdef KDDKDD_CHECK Zoltan_HG_Print(zz, phg, NULL, stdout, "GatherBefore");/* NULL parts for now; add non-NULL later */ #endif /****************************************************************** * 0. Allocate the hypergraph to be returned. * Set values that we already know. ******************************************************************/ shg = *gathered_hg = (HGraph *) ZOLTAN_MALLOC(sizeof(HGraph)); if (!shg) MEMORY_ERROR; Zoltan_HG_HGraph_Init(shg); shg->nVtx = phg->dist_x[nProc_x]; /* TODO64 - can this exceed 2B? 
*/ shg->nEdge = phg->dist_y[nProc_y]; shg->dist_x = (ZOLTAN_GNO_TYPE *) ZOLTAN_MALLOC(2 * sizeof(ZOLTAN_GNO_TYPE)); shg->dist_y = (ZOLTAN_GNO_TYPE *) ZOLTAN_MALLOC(2 * sizeof(ZOLTAN_GNO_TYPE)); if (!shg->dist_x || !shg->dist_y) MEMORY_ERROR; shg->dist_x[0] = shg->dist_y[0] = 0; shg->dist_x[1] = shg->nVtx; shg->dist_y[1] = shg->nEdge; shg->comm = scomm; shg->EdgeWeightDim = phg->EdgeWeightDim; shg->VtxWeightDim = phg->VtxWeightDim; if (shg->VtxWeightDim && shg->nVtx) shg->vwgt = (float *) ZOLTAN_MALLOC(shg->nVtx * shg->VtxWeightDim * sizeof(float)); if (shg->EdgeWeightDim && shg->nEdge) shg->ewgt = (float *) ZOLTAN_MALLOC(shg->nEdge * shg->EdgeWeightDim * sizeof(float)); /* Fixed vertices */ shg->bisec_split = phg->bisec_split; if (hgp->UseFixedVtx) shg->fixed_part = (int *) ZOLTAN_MALLOC(shg->nVtx * sizeof(int)); if (hgp->UsePrefPart) shg->pref_part = (int *) ZOLTAN_MALLOC(shg->nVtx * sizeof(int)); /* Allocate arrays for use in gather operations */ recv_size = (int *) ZOLTAN_MALLOC(3 * max_nProc_xy * sizeof(int)); each = recv_size + max_nProc_xy; disp = each + max_nProc_xy; /* TODO64 - phg->dist_y[nProc_y] could exceed 2 Billion, NO? */ send_size = MAX(phg->dist_x[myProc_x+1] - phg->dist_x[myProc_x], phg->dist_y[nProc_y]); send_buf = (int *) ZOLTAN_MALLOC(send_size * sizeof(int)); if ((shg->VtxWeightDim && shg->nVtx && !shg->vwgt) || (shg->EdgeWeightDim && shg->nEdge && !shg->ewgt) || !recv_size || (send_size && !send_buf)) MEMORY_ERROR; /************************************************************* * 1. Gather all non-zeros for vertices in processor column * *************************************************************/ if (nProc_y == 1) { /* * Don't need a gather; just set pointers appropriately for row-gather * in Step 2 below. */ col_nVtx = phg->nVtx; col_nEdge = phg->nEdge; col_nPin = phg->nPins; col_vindex = phg->vindex; col_vedge = phg->vedge; col_hindex = phg->hindex; col_hvertex = phg->hvertex; for (i = 0; i < shg->EdgeWeightDim * shg->nEdge; i++) shg->ewgt[i] = phg->ewgt[i]; } else { /* Gather local size info for each proc in column */ MPI_Allgather(&(phg->nPins), 1, MPI_INT, recv_size, 1, MPI_INT, phg->comm->col_comm); /* Compute number of vtx, edge, and nnz in column */ col_nVtx = (int)(phg->dist_x[myProc_x+1] - phg->dist_x[myProc_x]); col_nEdge = phg->dist_y[nProc_y]; /* SCHEMEA */ col_nPin = 0; for (i = 0; i < nProc_y; i++) { col_nPin += recv_size[i]; } /* Allocate arrays for column hypergraph */ col_hindex = (int *) ZOLTAN_CALLOC((col_nEdge+1), sizeof(int)); col_hvertex = (int *) ZOLTAN_MALLOC(col_nPin * sizeof(int)); col_vindex = (int *) ZOLTAN_CALLOC((col_nVtx+1), sizeof(int)); col_vedge = (int *) ZOLTAN_MALLOC(col_nPin * sizeof(int)); if (!col_vindex || !col_hindex || (col_nPin && (!col_vedge || !col_hvertex))) MEMORY_ERROR; /* Gather hvertex data for all procs in column */ /* SCHEMEA uses same vertex LNO on each proc in column. */ /* SCHEMEB would require conversion from vertex LNO to GNO here. */ disp[0] = 0; for (i = 1; i < nProc_y; i++) disp[i] = disp[i-1] + recv_size[i-1]; MPI_Allgatherv(phg->hvertex, phg->nPins, MPI_INT, col_hvertex, recv_size, disp, MPI_INT, phg->comm->col_comm); /* SCHEMEA uses same vertex LNO on each proc in column. */ /* SCHEMEB would require conversion from vertex GNO to LNO here */ /* Gather hindex data for all procs in column */ for (i = 0; i < phg->nEdge; i++) send_buf[i] = phg->hindex[i+1] - phg->hindex[i]; /* SCHEMEA can assume a recv for each edge; * SCHEMEB needs to gather the number of edges recv'd from each proc. 
*/ for (i = 0; i < nProc_y; i++) each[i] = phg->dist_y[i+1] - phg->dist_y[i]; disp[0] = 0; /* Can't use dist_y because it may not be sizeof(int) */ for (i=1; i < nProc_y; i++){ disp[i] = disp[i-1] + each[i-1]; } /* SCHEMEA can use phg->dist_y for displacement array. * SCHEMEB requires separate displacement array. */ MPI_Allgatherv(send_buf, phg->nEdge, MPI_INT, col_hindex, each, disp, MPI_INT, phg->comm->col_comm); /* Perform prefix sum on col_hindex */ sum = 0; for (i = 0; i < col_nEdge; i++) { tmp = col_hindex[i]; col_hindex[i] = sum; sum += tmp; } col_hindex[col_nEdge] = sum; /* Sanity check */ if (col_hindex[col_nEdge] != col_nPin) { printf("%d Sanity check failed: " "col_hindex[col_nEdge] %d != col_nPin %d\n", zz->Proc, col_hindex[col_nEdge], col_nPin); exit(-1); } /* Gather edge weights, if any. */ if (shg->EdgeWeightDim) { /* Can use nearly the same each array. */ /* Need to compute new disp array. */ disp[0] = 0; each[0] *= phg->EdgeWeightDim; for (i = 1; i < nProc_y; i++) { each[i] *= phg->EdgeWeightDim; disp[i] = disp[i-1] + each[i-1]; } MPI_Allgatherv(phg->ewgt, phg->nEdge*phg->EdgeWeightDim, MPI_FLOAT, shg->ewgt, each, disp, MPI_FLOAT, phg->comm->col_comm); } Zoltan_HG_Mirror(col_nEdge, col_hindex, col_hvertex, col_nVtx, col_vindex, col_vedge); } /* End column-gather */ /************************************************************* * 2. Gather all non-zeros for edges in processor rows * * All processors in a processor column now have the same * * hypergraph; we now gather it across rows. * *************************************************************/ if (nProc_x == 1) { /* * Don't need a gather across the row; just set pointers appropriately * in shg. */ shg->vindex = col_vindex; shg->vedge = col_vedge; shg->hindex = col_hindex; shg->hvertex = col_hvertex; /* Copy vwgt and fixed arrays so shg owns this memory */ for (i = 0; i < shg->VtxWeightDim*shg->nVtx; i++) shg->vwgt[i] = phg->vwgt[i]; if (hgp->UseFixedVtx) for (i = 0; i < shg->nVtx; i++) shg->fixed_part[i] = phg->fixed_part[i]; if (hgp->UsePrefPart) for (i = 0; i < shg->nVtx; i++) shg->pref_part[i] = phg->pref_part[i]; } else { /* Gather info about size within the row */ MPI_Allgather(&col_nPin, 1, MPI_INT, recv_size, 1, MPI_INT, phg->comm->row_comm); tmp = 0; for (i = 0; i < nProc_x; i++) tmp += recv_size[i]; shg->nPins = tmp; shg->vindex = (int *) ZOLTAN_CALLOC((shg->nVtx+1), sizeof(int)); shg->vedge = (int *) ZOLTAN_MALLOC(shg->nPins * sizeof(int)); shg->hindex = (int *) ZOLTAN_CALLOC((shg->nEdge+1), sizeof(int)); shg->hvertex = (int *) ZOLTAN_MALLOC(shg->nPins * sizeof(int)); if (!shg->vindex || !shg->hindex || (shg->nPins && (!shg->vedge || !shg->hvertex))) MEMORY_ERROR; /* Gather vedge data for all procs in row */ /* SCHEMEA can send local edge numbers; SCHEMEB requires edge LNO to GNO conversion. */ disp[0] = 0; for (i = 1; i < nProc_x; i++) disp[i] = disp[i-1] + recv_size[i-1]; MPI_Allgatherv(col_vedge, col_nPin, MPI_INT, shg->vedge, recv_size, disp, MPI_INT, phg->comm->row_comm); /* Gather vindex data for all procs in row */ for (i = 0; i < col_nVtx; i++) send_buf[i] = col_vindex[i+1] - col_vindex[i]; /* SCHEMEA can assume a recv for each vertex; * SCHEMEB would need to gather the number of vtxs recv'd from each proc. 
*/ for (i = 0; i < nProc_x; i++) each[i] = (int)(phg->dist_x[i+1] - phg->dist_x[i]); disp[0] = 0; /* Can't use dist_x, may not be sizeof(int) */ for (i = 1; i < nProc_x; i++) disp[i] = disp[i-1] + each[i-1]; /* SCHEMEA can use phg->dist_x as displacement array; * SCHEMEB requires separate displacement array. */ MPI_Allgatherv(send_buf, col_nVtx, MPI_INT, shg->vindex, each, disp, MPI_INT, phg->comm->row_comm); /* Perform prefix sum on shg->vindex */ sum = 0; for (i = 0; i < shg->nVtx; i++) { tmp = shg->vindex[i]; shg->vindex[i] = sum; sum += tmp; } shg->vindex[shg->nVtx] = sum; /* Sanity check */ if (shg->vindex[shg->nVtx] != shg->nPins) { printf("%d Sanity check failed: " "shg->vindex %d != nPins %d\n", zz->Proc, shg->vindex[shg->nVtx], shg->nPins); exit(-1); } /* Gather fixed array, if any */ if (hgp->UseFixedVtx){ #ifdef DEBUG_ uprintf(phg->comm, "Debug in PHG_gather before gather. phg->fixed ="); for (i=0; i<phg->nVtx; i++){ printf(" %d ", phg->fixed_part[i]); } printf("\n"); #endif /* Can use the same each array. */ /* Need to compute new disp array. */ disp[0] = 0; for (i = 1; i < nProc_x; i++) { disp[i] = disp[i-1] + each[i-1]; } /* fixed_part is an int array, so gather with MPI_INT */ MPI_Allgatherv(phg->fixed_part, phg->nVtx, MPI_INT, shg->fixed_part, each, disp, MPI_INT, phg->comm->row_comm); #ifdef DEBUG_ uprintf(phg->comm, "Debug in PHG_gather after gather. shg->fixed ="); for (i=0; i<shg->nVtx; i++){ printf(" %d ", shg->fixed_part[i]); } printf("\n"); #endif } /* Gather pref part array, if any */ if (hgp->UsePrefPart){ /* Can use the same each array. */ /* Need to compute new disp array. */ disp[0] = 0; for (i = 1; i < nProc_x; i++) { disp[i] = disp[i-1] + each[i-1]; } /* pref_part is an int array, so gather with MPI_INT */ MPI_Allgatherv(phg->pref_part, phg->nVtx, MPI_INT, shg->pref_part, each, disp, MPI_INT, phg->comm->row_comm); } /* Gather vertex weights, if any. */ if (shg->VtxWeightDim) { /* Can use nearly the same each array. */ /* Need to compute new disp array. */ disp[0] = 0; each[0] *= phg->VtxWeightDim; for (i = 1; i < nProc_x; i++) { each[i] *= phg->VtxWeightDim; disp[i] = disp[i-1] + each[i-1]; } MPI_Allgatherv(phg->vwgt, phg->nVtx*phg->VtxWeightDim, MPI_FLOAT, shg->vwgt, each, disp, MPI_FLOAT, phg->comm->row_comm); } Zoltan_HG_Mirror(shg->nVtx, shg->vindex, shg->vedge, shg->nEdge, shg->hindex, shg->hvertex); } /* End row gather */ #ifdef KDDKDD_CHECK Zoltan_HG_Print(zz, shg, NULL, stdout, "GatherAfter");/* NULL parts for now; add non-NULL later */ Zoltan_PHG_Plot_2D_Distrib(zz, phg); Zoltan_PHG_Plot_2D_Distrib(zz, shg); #endif End: if (ierr < 0) { Zoltan_HG_HGraph_Free(*gathered_hg); ZOLTAN_FREE(gathered_hg); } Zoltan_Multifree(__FILE__, __LINE__, 2, &send_buf, &recv_size); if (nProc_x > 1 && nProc_y > 1) Zoltan_Multifree(__FILE__, __LINE__, 4, &col_vedge, &col_vindex, &col_hvertex, &col_hindex); return ierr; }
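/* [editor's sketch, not from the original sources] Both gather phases above
   convert gathered per-row counts into CSR-style offsets with the same in-place
   exclusive prefix sum; isolated: */
#include <stdio.h>

/* counts[0..n-1] on entry; on exit counts[i] is the offset of row i and
   counts[n] the total (counts must have room for n+1 entries) */
static void counts_to_offsets(int *counts, int n)
{
  int sum = 0;
  for (int i = 0; i < n; ++i) {
    int tmp = counts[i];
    counts[i] = sum;
    sum += tmp;
  }
  counts[n] = sum;
}

int main(void)
{
  int c[5] = {2, 0, 3, 1};              /* 4 counts + room for the total */
  counts_to_offsets(c, 4);
  for (int i = 0; i <= 4; ++i) printf("%d ", c[i]);   /* 0 2 2 5 6 */
  printf("\n");
  return 0;
}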
PetscErrorCode DMSetUp_DA_2D(DM da) { DM_DA *dd = (DM_DA*)da->data; const PetscInt M = dd->M; const PetscInt N = dd->N; PetscInt m = dd->m; PetscInt n = dd->n; const PetscInt dof = dd->w; const PetscInt s = dd->s; DMDABoundaryType bx = dd->bx; DMDABoundaryType by = dd->by; DMDAStencilType stencil_type = dd->stencil_type; PetscInt *lx = dd->lx; PetscInt *ly = dd->ly; MPI_Comm comm; PetscMPIInt rank,size; PetscInt xs,xe,ys,ye,x,y,Xs,Xe,Ys,Ye,start,end,IXs,IXe,IYs,IYe; PetscInt up,down,left,right,i,n0,n1,n2,n3,n5,n6,n7,n8,*idx,nn,*idx_cpy; const PetscInt *idx_full; PetscInt xbase,*bases,*ldims,j,x_t,y_t,s_t,base,count; PetscInt s_x,s_y; /* s proportionalized to w */ PetscInt sn0 = 0,sn2 = 0,sn6 = 0,sn8 = 0; Vec local,global; VecScatter ltog,gtol; IS to,from,ltogis; PetscErrorCode ierr; PetscFunctionBegin; if (stencil_type == DMDA_STENCIL_BOX && (bx == DMDA_BOUNDARY_MIRROR || by == DMDA_BOUNDARY_MIRROR)) SETERRQ(PetscObjectComm((PetscObject)da),PETSC_ERR_SUP,"Mirror boundary and box stencil"); ierr = PetscObjectGetComm((PetscObject)da,&comm);CHKERRQ(ierr); #if !defined(PETSC_USE_64BIT_INDICES) if (((Petsc64bitInt) M)*((Petsc64bitInt) N)*((Petsc64bitInt) dof) > (Petsc64bitInt) PETSC_MPI_INT_MAX) SETERRQ3(comm,PETSC_ERR_INT_OVERFLOW,"Mesh of %D by %D by %D (dof) is too large for 32 bit indices",M,N,dof); #endif if (dof < 1) SETERRQ1(comm,PETSC_ERR_ARG_OUTOFRANGE,"Must have 1 or more degrees of freedom per node: %D",dof); if (s < 0) SETERRQ1(comm,PETSC_ERR_ARG_OUTOFRANGE,"Stencil width cannot be negative: %D",s); ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); if (m != PETSC_DECIDE) { if (m < 1) SETERRQ1(comm,PETSC_ERR_ARG_OUTOFRANGE,"Non-positive number of processors in X direction: %D",m); else if (m > size) SETERRQ2(comm,PETSC_ERR_ARG_OUTOFRANGE,"Too many processors in X direction: %D %d",m,size); } if (n != PETSC_DECIDE) { if (n < 1) SETERRQ1(comm,PETSC_ERR_ARG_OUTOFRANGE,"Non-positive number of processors in Y direction: %D",n); else if (n > size) SETERRQ2(comm,PETSC_ERR_ARG_OUTOFRANGE,"Too many processors in Y direction: %D %d",n,size); } if (m == PETSC_DECIDE || n == PETSC_DECIDE) { if (n != PETSC_DECIDE) { m = size/n; } else if (m != PETSC_DECIDE) { n = size/m; } else { /* try for squarish distribution */ m = (PetscInt)(0.5 + PetscSqrtReal(((PetscReal)M)*((PetscReal)size)/((PetscReal)N))); if (!m) m = 1; while (m > 0) { n = size/m; if (m*n == size) break; m--; } if (M > N && m < n) {PetscInt _m = m; m = n; n = _m;} } if (m*n != size) SETERRQ(comm,PETSC_ERR_PLIB,"Unable to create partition, check the size of the communicator and input m and n "); } else if (m*n != size) SETERRQ(comm,PETSC_ERR_ARG_OUTOFRANGE,"Given Bad partition"); if (M < m) SETERRQ2(comm,PETSC_ERR_ARG_OUTOFRANGE,"Partition in x direction is too fine! %D %D",M,m); if (N < n) SETERRQ2(comm,PETSC_ERR_ARG_OUTOFRANGE,"Partition in y direction is too fine! 
%D %D",N,n); /* Determine locally owned region xs is the first local node number, x is the number of local nodes */ if (!lx) { ierr = PetscMalloc(m*sizeof(PetscInt), &dd->lx);CHKERRQ(ierr); lx = dd->lx; for (i=0; i<m; i++) { lx[i] = M/m + ((M % m) > i); } } x = lx[rank % m]; xs = 0; for (i=0; i<(rank % m); i++) { xs += lx[i]; } #if defined(PETSC_USE_DEBUG) left = xs; for (i=(rank % m); i<m; i++) { left += lx[i]; } if (left != M) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Sum of lx across processors not equal to M: %D %D",left,M); #endif /* Determine locally owned region ys is the first local node number, y is the number of local nodes */ if (!ly) { ierr = PetscMalloc(n*sizeof(PetscInt), &dd->ly);CHKERRQ(ierr); ly = dd->ly; for (i=0; i<n; i++) { ly[i] = N/n + ((N % n) > i); } } y = ly[rank/m]; ys = 0; for (i=0; i<(rank/m); i++) { ys += ly[i]; } #if defined(PETSC_USE_DEBUG) left = ys; for (i=(rank/m); i<n; i++) { left += ly[i]; } if (left != N) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Sum of ly across processors not equal to N: %D %D",left,N); #endif /* check if the scatter requires more than one process neighbor or wraps around the domain more than once */ if ((x < s) && ((m > 1) || (bx == DMDA_BOUNDARY_PERIODIC))) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Local x-width of domain x %D is smaller than stencil width s %D",x,s); if ((y < s) && ((n > 1) || (by == DMDA_BOUNDARY_PERIODIC))) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Local y-width of domain y %D is smaller than stencil width s %D",y,s); xe = xs + x; ye = ys + y; /* determine ghost region (Xs) and region scattered into (IXs) */ if (xs-s > 0) { Xs = xs - s; IXs = xs - s; } else { if (bx) { Xs = xs - s; } else { Xs = 0; } IXs = 0; } if (xe+s <= M) { Xe = xe + s; IXe = xe + s; } else { if (bx) { Xs = xs - s; Xe = xe + s; } else { Xe = M; } IXe = M; } if (bx == DMDA_BOUNDARY_PERIODIC || bx == DMDA_BOUNDARY_MIRROR) { IXs = xs - s; IXe = xe + s; Xs = xs - s; Xe = xe + s; } if (ys-s > 0) { Ys = ys - s; IYs = ys - s; } else { if (by) { Ys = ys - s; } else { Ys = 0; } IYs = 0; } if (ye+s <= N) { Ye = ye + s; IYe = ye + s; } else { if (by) { Ye = ye + s; } else { Ye = N; } IYe = N; } if (by == DMDA_BOUNDARY_PERIODIC || by == DMDA_BOUNDARY_MIRROR) { IYs = ys - s; IYe = ye + s; Ys = ys - s; Ye = ye + s; } /* stencil length in each direction */ s_x = s; s_y = s; /* determine starting point of each processor */ nn = x*y; ierr = PetscMalloc2(size+1,PetscInt,&bases,size,PetscInt,&ldims);CHKERRQ(ierr); ierr = MPI_Allgather(&nn,1,MPIU_INT,ldims,1,MPIU_INT,comm);CHKERRQ(ierr); bases[0] = 0; for (i=1; i<=size; i++) { bases[i] = ldims[i-1]; } for (i=1; i<=size; i++) { bases[i] += bases[i-1]; } base = bases[rank]*dof; /* allocate the base parallel and sequential vectors */ dd->Nlocal = x*y*dof; ierr = VecCreateMPIWithArray(comm,dof,dd->Nlocal,PETSC_DECIDE,0,&global);CHKERRQ(ierr); dd->nlocal = (Xe-Xs)*(Ye-Ys)*dof; ierr = VecCreateSeqWithArray(PETSC_COMM_SELF,dof,dd->nlocal,0,&local);CHKERRQ(ierr); /* generate appropriate vector scatters */ /* local to global inserts non-ghost point region into global */ ierr = VecGetOwnershipRange(global,&start,&end);CHKERRQ(ierr); ierr = ISCreateStride(comm,x*y*dof,start,1,&to);CHKERRQ(ierr); ierr = PetscMalloc(x*y*sizeof(PetscInt),&idx);CHKERRQ(ierr); left = xs - Xs; right = left + x; down = ys - Ys; up = down + y; count = 0; for (i=down; i<up; i++) { for (j=left; j<right; j++) { idx[count++] = i*(Xe-Xs) + j; } } ierr = 
ISCreateBlock(comm,dof,count,idx,PETSC_OWN_POINTER,&from);CHKERRQ(ierr); ierr = VecScatterCreate(local,from,global,to,&ltog);CHKERRQ(ierr); ierr = PetscLogObjectParent(da,ltog);CHKERRQ(ierr); ierr = ISDestroy(&from);CHKERRQ(ierr); ierr = ISDestroy(&to);CHKERRQ(ierr); /* global to local must include ghost points within the domain, but not ghost points outside the domain that aren't periodic */ if (stencil_type == DMDA_STENCIL_BOX) { count = (IXe-IXs)*(IYe-IYs); ierr = PetscMalloc(count*sizeof(PetscInt),&idx);CHKERRQ(ierr); left = IXs - Xs; right = left + (IXe-IXs); down = IYs - Ys; up = down + (IYe-IYs); count = 0; for (i=down; i<up; i++) { for (j=left; j<right; j++) { idx[count++] = i*(Xe-Xs) + j; } } ierr = ISCreateBlock(comm,dof,count,idx,PETSC_OWN_POINTER,&to);CHKERRQ(ierr); } else { /* must drop into cross shape region */ /*
          ---------|
         |  top    |
      |---         ---| up
      |   middle      |
      |               |
       ----         ---- down
         | bottom  |
          -----------
      Xs xs        xe Xe */ count = (ys-IYs)*x + y*(IXe-IXs) + (IYe-ye)*x; ierr = PetscMalloc(count*sizeof(PetscInt),&idx);CHKERRQ(ierr); left = xs - Xs; right = left + x; down = ys - Ys; up = down + y; count = 0; /* bottom */ for (i=(IYs-Ys); i<down; i++) { for (j=left; j<right; j++) { idx[count++] = j + i*(Xe-Xs); } } /* middle */ for (i=down; i<up; i++) { for (j=(IXs-Xs); j<(IXe-Xs); j++) { idx[count++] = j + i*(Xe-Xs); } } /* top */ for (i=up; i<up+IYe-ye; i++) { for (j=left; j<right; j++) { idx[count++] = j + i*(Xe-Xs); } } ierr = ISCreateBlock(comm,dof,count,idx,PETSC_OWN_POINTER,&to);CHKERRQ(ierr); } /* determine who lies on each side of us stored in n6 n7 n8 n3 n5 n0 n1 n2 */ /* Assume the Non-Periodic Case */ n1 = rank - m; if (rank % m) { n0 = n1 - 1; } else { n0 = -1; } if ((rank+1) % m) { n2 = n1 + 1; n5 = rank + 1; n8 = rank + m + 1; if (n8 >= m*n) n8 = -1; } else { n2 = -1; n5 = -1; n8 = -1; } if (rank % m) { n3 = rank - 1; n6 = n3 + m; if (n6 >= m*n) n6 = -1; } else { n3 = -1; n6 = -1; } n7 = rank + m; if (n7 >= m*n) n7 = -1; if (bx == DMDA_BOUNDARY_PERIODIC && by == DMDA_BOUNDARY_PERIODIC) { /* Modify for Periodic Cases */ /* Handle all four corners */ if ((n6 < 0) && (n7 < 0) && (n3 < 0)) n6 = m-1; if ((n8 < 0) && (n7 < 0) && (n5 < 0)) n8 = 0; if ((n2 < 0) && (n5 < 0) && (n1 < 0)) n2 = size-m; if ((n0 < 0) && (n3 < 0) && (n1 < 0)) n0 = size-1; /* Handle Top and Bottom Sides */ if (n1 < 0) n1 = rank + m * (n-1); if (n7 < 0) n7 = rank - m * (n-1); if ((n3 >= 0) && (n0 < 0)) n0 = size - m + rank - 1; if ((n3 >= 0) && (n6 < 0)) n6 = (rank%m)-1; if ((n5 >= 0) && (n2 < 0)) n2 = size - m + rank + 1; if ((n5 >= 0) && (n8 < 0)) n8 = (rank%m)+1; /* Handle Left and Right Sides */ if (n3 < 0) n3 = rank + (m-1); if (n5 < 0) n5 = rank - (m-1); if ((n1 >= 0) && (n0 < 0)) n0 = rank-1; if ((n1 >= 0) && (n2 < 0)) n2 = rank-2*m+1; if ((n7 >= 0) && (n6 < 0)) n6 = rank+2*m-1; if ((n7 >= 0) && (n8 < 0)) n8 = rank+1; } else if (by == DMDA_BOUNDARY_PERIODIC) { /* Handle Top and Bottom Sides */ if (n1 < 0) n1 = rank + m * (n-1); if (n7 < 0) n7 = rank - m * (n-1); if ((n3 >= 0) && (n0 < 0)) n0 = size - m + rank - 1; if ((n3 >= 0) && (n6 < 0)) n6 = (rank%m)-1; if ((n5 >= 0) && (n2 < 0)) n2 = size - m + rank + 1; if ((n5 >= 0) && (n8 < 0)) n8 = (rank%m)+1; } else if (bx == DMDA_BOUNDARY_PERIODIC) { /* Handle Left and Right Sides */ if (n3 < 0) n3 = rank + (m-1); if (n5 < 0) n5 = rank - (m-1); if ((n1 >= 0) && (n0 < 0)) n0 = rank-1; if ((n1 >= 0) && (n2 < 0)) n2 = rank-2*m+1; if ((n7 >= 0) && (n6 < 0)) n6 = rank+2*m-1; if ((n7 >= 0) && (n8 < 0)) n8 = rank+1; } ierr = 
PetscMalloc(9*sizeof(PetscInt),&dd->neighbors);CHKERRQ(ierr); dd->neighbors[0] = n0; dd->neighbors[1] = n1; dd->neighbors[2] = n2; dd->neighbors[3] = n3; dd->neighbors[4] = rank; dd->neighbors[5] = n5; dd->neighbors[6] = n6; dd->neighbors[7] = n7; dd->neighbors[8] = n8; if (stencil_type == DMDA_STENCIL_STAR) { /* save corner processor numbers */ sn0 = n0; sn2 = n2; sn6 = n6; sn8 = n8; n0 = n2 = n6 = n8 = -1; } ierr = PetscMalloc((Xe-Xs)*(Ye-Ys)*sizeof(PetscInt),&idx);CHKERRQ(ierr); ierr = PetscLogObjectMemory(da,(Xe-Xs)*(Ye-Ys)*sizeof(PetscInt));CHKERRQ(ierr); nn = 0; xbase = bases[rank]; for (i=1; i<=s_y; i++) { if (n0 >= 0) { /* left below */ x_t = lx[n0 % m]; y_t = ly[(n0/m)]; s_t = bases[n0] + x_t*y_t - (s_y-i)*x_t - s_x; for (j=0; j<s_x; j++) idx[nn++] = s_t++; } if (n1 >= 0) { /* directly below */ x_t = x; y_t = ly[(n1/m)]; s_t = bases[n1] + x_t*y_t - (s_y+1-i)*x_t; for (j=0; j<x_t; j++) idx[nn++] = s_t++; } else if (by == DMDA_BOUNDARY_MIRROR) { for (j=0; j<x; j++) idx[nn++] = bases[rank] + x*(s_y - i + 1) + j; } if (n2 >= 0) { /* right below */ x_t = lx[n2 % m]; y_t = ly[(n2/m)]; s_t = bases[n2] + x_t*y_t - (s_y+1-i)*x_t; for (j=0; j<s_x; j++) idx[nn++] = s_t++; } } for (i=0; i<y; i++) { if (n3 >= 0) { /* directly left */ x_t = lx[n3 % m]; /* y_t = y; */ s_t = bases[n3] + (i+1)*x_t - s_x; for (j=0; j<s_x; j++) idx[nn++] = s_t++; } else if (bx == DMDA_BOUNDARY_MIRROR) { for (j=0; j<s_x; j++) idx[nn++] = bases[rank] + x*i + s_x - j; } for (j=0; j<x; j++) idx[nn++] = xbase++; /* interior */ if (n5 >= 0) { /* directly right */ x_t = lx[n5 % m]; /* y_t = y; */ s_t = bases[n5] + (i)*x_t; for (j=0; j<s_x; j++) idx[nn++] = s_t++; } else if (bx == DMDA_BOUNDARY_MIRROR) { for (j=0; j<s_x; j++) idx[nn++] = bases[rank] + x*(i + 1) - 2 - j; } } for (i=1; i<=s_y; i++) { if (n6 >= 0) { /* left above */ x_t = lx[n6 % m]; /* y_t = ly[(n6/m)]; */ s_t = bases[n6] + (i)*x_t - s_x; for (j=0; j<s_x; j++) idx[nn++] = s_t++; } if (n7 >= 0) { /* directly above */ x_t = x; /* y_t = ly[(n7/m)]; */ s_t = bases[n7] + (i-1)*x_t; for (j=0; j<x_t; j++) idx[nn++] = s_t++; } else if (by == DMDA_BOUNDARY_MIRROR) { for (j=0; j<x; j++) idx[nn++] = bases[rank] + x*(y - i - 1) + j; } if (n8 >= 0) { /* right above */ x_t = lx[n8 % m]; /* y_t = ly[(n8/m)]; */ s_t = bases[n8] + (i-1)*x_t; for (j=0; j<s_x; j++) idx[nn++] = s_t++; } } ierr = ISCreateBlock(comm,dof,nn,idx,PETSC_COPY_VALUES,&from);CHKERRQ(ierr); ierr = VecScatterCreate(global,from,local,to,&gtol);CHKERRQ(ierr); ierr = PetscLogObjectParent(da,gtol);CHKERRQ(ierr); ierr = ISDestroy(&to);CHKERRQ(ierr); ierr = ISDestroy(&from);CHKERRQ(ierr); if (stencil_type == DMDA_STENCIL_STAR) { n0 = sn0; n2 = sn2; n6 = sn6; n8 = sn8; } if (((stencil_type == DMDA_STENCIL_STAR) || (bx && bx != DMDA_BOUNDARY_PERIODIC) || (by && by != DMDA_BOUNDARY_PERIODIC))) { /* Recompute the local to global mappings, this time keeping the information about the cross corner processor numbers and any ghosted but not periodic indices. 
*/ nn = 0; xbase = bases[rank]; for (i=1; i<=s_y; i++) { if (n0 >= 0) { /* left below */ x_t = lx[n0 % m]; y_t = ly[(n0/m)]; s_t = bases[n0] + x_t*y_t - (s_y-i)*x_t - s_x; for (j=0; j<s_x; j++) idx[nn++] = s_t++; } else if (xs-Xs > 0 && ys-Ys > 0) { for (j=0; j<s_x; j++) idx[nn++] = -1; } if (n1 >= 0) { /* directly below */ x_t = x; y_t = ly[(n1/m)]; s_t = bases[n1] + x_t*y_t - (s_y+1-i)*x_t; for (j=0; j<x_t; j++) idx[nn++] = s_t++; } else if (ys-Ys > 0) { if (by == DMDA_BOUNDARY_MIRROR) { for (j=0; j<x; j++) idx[nn++] = bases[rank] + x*(s_y - i + 1) + j; } else { for (j=0; j<x; j++) idx[nn++] = -1; } } if (n2 >= 0) { /* right below */ x_t = lx[n2 % m]; y_t = ly[(n2/m)]; s_t = bases[n2] + x_t*y_t - (s_y+1-i)*x_t; for (j=0; j<s_x; j++) idx[nn++] = s_t++; } else if (Xe-xe> 0 && ys-Ys > 0) { for (j=0; j<s_x; j++) idx[nn++] = -1; } } for (i=0; i<y; i++) { if (n3 >= 0) { /* directly left */ x_t = lx[n3 % m]; /* y_t = y; */ s_t = bases[n3] + (i+1)*x_t - s_x; for (j=0; j<s_x; j++) idx[nn++] = s_t++; } else if (xs-Xs > 0) { if (bx == DMDA_BOUNDARY_MIRROR) { for (j=0; j<s_x; j++) idx[nn++] = bases[rank] + x*i + s_x - j; } else { for (j=0; j<s_x; j++) idx[nn++] = -1; } } for (j=0; j<x; j++) idx[nn++] = xbase++; /* interior */ if (n5 >= 0) { /* directly right */ x_t = lx[n5 % m]; /* y_t = y; */ s_t = bases[n5] + (i)*x_t; for (j=0; j<s_x; j++) idx[nn++] = s_t++; } else if (Xe-xe > 0) { if (bx == DMDA_BOUNDARY_MIRROR) { for (j=0; j<s_x; j++) idx[nn++] = bases[rank] + x*(i + 1) - 2 - j; } else { for (j=0; j<s_x; j++) idx[nn++] = -1; } } } for (i=1; i<=s_y; i++) { if (n6 >= 0) { /* left above */ x_t = lx[n6 % m]; /* y_t = ly[(n6/m)]; */ s_t = bases[n6] + (i)*x_t - s_x; for (j=0; j<s_x; j++) idx[nn++] = s_t++; } else if (xs-Xs > 0 && Ye-ye > 0) { for (j=0; j<s_x; j++) idx[nn++] = -1; } if (n7 >= 0) { /* directly above */ x_t = x; /* y_t = ly[(n7/m)]; */ s_t = bases[n7] + (i-1)*x_t; for (j=0; j<x_t; j++) idx[nn++] = s_t++; } else if (Ye-ye > 0) { if (by == DMDA_BOUNDARY_MIRROR) { for (j=0; j<x; j++) idx[nn++] = bases[rank] + x*(y - i - 1) + j; } else { for (j=0; j<x; j++) idx[nn++] = -1; } } if (n8 >= 0) { /* right above */ x_t = lx[n8 % m]; /* y_t = ly[(n8/m)]; */ s_t = bases[n8] + (i-1)*x_t; for (j=0; j<s_x; j++) idx[nn++] = s_t++; } else if (Xe-xe > 0 && Ye-ye > 0) { for (j=0; j<s_x; j++) idx[nn++] = -1; } } } /* Set the local to global ordering in the global vector, this allows use of VecSetValuesLocal(). 
*/ ierr = ISCreateBlock(comm,dof,nn,idx,PETSC_OWN_POINTER,&ltogis);CHKERRQ(ierr); ierr = PetscMalloc(nn*dof*sizeof(PetscInt),&idx_cpy);CHKERRQ(ierr); ierr = PetscLogObjectMemory(da,nn*dof*sizeof(PetscInt));CHKERRQ(ierr); ierr = ISGetIndices(ltogis, &idx_full);CHKERRQ(ierr); ierr = PetscMemcpy(idx_cpy,idx_full,nn*dof*sizeof(PetscInt));CHKERRQ(ierr); ierr = ISRestoreIndices(ltogis, &idx_full);CHKERRQ(ierr); ierr = ISLocalToGlobalMappingCreateIS(ltogis,&da->ltogmap);CHKERRQ(ierr); ierr = PetscLogObjectParent(da,da->ltogmap);CHKERRQ(ierr); ierr = ISDestroy(&ltogis);CHKERRQ(ierr); ierr = ISLocalToGlobalMappingBlock(da->ltogmap,dd->w,&da->ltogmapb);CHKERRQ(ierr); ierr = PetscLogObjectParent(da,da->ltogmap);CHKERRQ(ierr); ierr = PetscFree2(bases,ldims);CHKERRQ(ierr); dd->m = m; dd->n = n; /* note petsc expects xs/xe/Xs/Xe to be multiplied by #dofs in many places */ dd->xs = xs*dof; dd->xe = xe*dof; dd->ys = ys; dd->ye = ye; dd->zs = 0; dd->ze = 1; dd->Xs = Xs*dof; dd->Xe = Xe*dof; dd->Ys = Ys; dd->Ye = Ye; dd->Zs = 0; dd->Ze = 1; ierr = VecDestroy(&local);CHKERRQ(ierr); ierr = VecDestroy(&global);CHKERRQ(ierr); dd->gtol = gtol; dd->ltog = ltog; dd->idx = idx_cpy; dd->Nl = nn*dof; dd->base = base; da->ops->view = DMView_DA_2d; dd->ltol = NULL; dd->ao = NULL; PetscFunctionReturn(0); }
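/* [editor's sketch, not from the original sources] DMSetUp_DA_2D computes the
   3x3 neighbor stencil (n0..n8) by hand, including the periodic wraparounds.
   Equivalent bookkeeping is available from MPI's Cartesian topology routines;
   a minimal sketch for an m x n process grid: */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
  int rank, size, dims[2] = {0, 0}, periods[2] = {1, 1}, left, right, down, up;
  MPI_Comm cart;

  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  MPI_Dims_create(size, 2, dims);              /* pick an m x n factorization */
  MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 0, &cart);
  MPI_Comm_rank(cart, &rank);
  MPI_Cart_shift(cart, 0, 1, &down, &up);      /* neighbors along dimension 0 */
  MPI_Cart_shift(cart, 1, 1, &left, &right);   /* neighbors along dimension 1 */
  /* with periods[] = {0,0}, off-grid neighbors come back as MPI_PROC_NULL,
     playing the role of the -1 sentinels used above */
  printf("rank %d: left=%d right=%d down=%d up=%d\n", rank, left, right, down, up);
  MPI_Comm_free(&cart);
  MPI_Finalize();
  return 0;
}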
//---------------------------------------------------------------------------// int all_gatherv(Node &send_node, Node &recv_node, MPI_Comm mpi_comm) { Node n_snd_compact; send_node.compact_to(n_snd_compact); int m_size = mpi::size(mpi_comm); std::string schema_str = n_snd_compact.schema().to_json(); int schema_len = schema_str.length() + 1; int data_len = n_snd_compact.total_bytes_compact(); // to do the conduit gatherv, first need a gather to get the // schema and data buffer sizes int snd_sizes[] = {schema_len, data_len}; Node n_rcv_sizes; Schema s; s["schema_len"].set(DataType::c_int()); s["data_len"].set(DataType::c_int()); n_rcv_sizes.list_of(s,m_size); int mpi_error = MPI_Allgather( snd_sizes, // local data 2, // two ints per rank MPI_INT, // send ints n_rcv_sizes.data_ptr(), // rcv buffer 2, // two ints per rank MPI_INT, // rcv ints mpi_comm); // mpi comm CONDUIT_CHECK_MPI_ERROR(mpi_error); Node n_rcv_tmp; int *schema_rcv_counts = NULL; int *schema_rcv_displs = NULL; char *schema_rcv_buff = NULL; int *data_rcv_counts = NULL; int *data_rcv_displs = NULL; char *data_rcv_buff = NULL; // alloc data for the mpi gather counts and displ arrays n_rcv_tmp["schemas/counts"].set(DataType::c_int(m_size)); n_rcv_tmp["schemas/displs"].set(DataType::c_int(m_size)); n_rcv_tmp["data/counts"].set(DataType::c_int(m_size)); n_rcv_tmp["data/displs"].set(DataType::c_int(m_size)); // get pointers to counts and displs schema_rcv_counts = n_rcv_tmp["schemas/counts"].value(); schema_rcv_displs = n_rcv_tmp["schemas/displs"].value(); data_rcv_counts = n_rcv_tmp["data/counts"].value(); data_rcv_displs = n_rcv_tmp["data/displs"].value(); int schema_curr_displ = 0; int data_curr_displ = 0; int i=0; NodeIterator itr = n_rcv_sizes.children(); while(itr.has_next()) { Node &curr = itr.next(); int schema_curr_count = curr["schema_len"].value(); int data_curr_count = curr["data_len"].value(); schema_rcv_counts[i] = schema_curr_count; schema_rcv_displs[i] = schema_curr_displ; schema_curr_displ += schema_curr_count; data_rcv_counts[i] = data_curr_count; data_rcv_displs[i] = data_curr_displ; data_curr_displ += data_curr_count; i++; } n_rcv_tmp["schemas/data"].set(DataType::c_char(schema_curr_displ)); schema_rcv_buff = n_rcv_tmp["schemas/data"].value(); mpi_error = MPI_Allgatherv( const_cast <char*>(schema_str.c_str()), schema_len, MPI_CHAR, schema_rcv_buff, schema_rcv_counts, schema_rcv_displs, MPI_CHAR, mpi_comm); CONDUIT_CHECK_MPI_ERROR(mpi_error); // build all schemas from JSON, compact them. Schema rcv_schema; //TODO: should we make it easier to create a compact schema? Schema s_tmp; for(int i=0;i < m_size; i++) { Schema &s = s_tmp.append(); s.set(&schema_rcv_buff[schema_rcv_displs[i]]); } s_tmp.compact_to(rcv_schema); // allocate data to hold the gather result recv_node.set(rcv_schema); data_rcv_buff = (char*)recv_node.data_ptr(); mpi_error = MPI_Allgatherv( n_snd_compact.data_ptr(), data_len, MPI_CHAR, data_rcv_buff, data_rcv_counts, data_rcv_displs, MPI_CHAR, mpi_comm); CONDUIT_CHECK_MPI_ERROR(mpi_error); return mpi_error; }
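/* [editor's note, illustrative buffer names only] all_gatherv ships each schema
   as length()+1 bytes so the terminating NUL travels with the string; the
   receiver can then parse each schema in place at its displacement. The
   addressing that makes that work, isolated: */
#include <stdio.h>

int main(void)
{
  /* as if two ranks sent "abc" and "de", each with its trailing NUL */
  const char buff[] = "abc\0de";
  int displs[2] = {0, 4};            /* strlen+1 displacements */
  for (int i = 0; i < 2; ++i)
    printf("schema %d: %s\n", i, &buff[displs[i]]);   /* parse in place */
  return 0;
}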
int main(int argc, char* argv[]) { int status = MPI_SUCCESS; pami_result_t result = PAMI_ERROR; int provided = MPI_THREAD_SINGLE; MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided); /* IBM: --ranks-per-node 64 fails to init threads but this */ /* IBM: testcase doesn't really care so don't exit */ TEST_ASSERT((provided>=MPI_THREAD_MULTIPLE),"MPI_Init_thread"); /* initialize the second client */ char * clientname = "test"; /* IBM: PE PAMI requires a client name */ pami_client_t client; result = PAMI_Client_create(clientname, &client, NULL, 0); TEST_ASSERT(result == PAMI_SUCCESS,"PAMI_Client_create"); /* query properties of the client */ pami_configuration_t config[3]; size_t num_contexts; config[0].name = PAMI_CLIENT_NUM_TASKS; config[1].name = PAMI_CLIENT_TASK_ID; config[2].name = PAMI_CLIENT_NUM_CONTEXTS; result = PAMI_Client_query(client, config, 3); TEST_ASSERT(result == PAMI_SUCCESS,"PAMI_Client_query"); world_size = config[0].value.intval; world_rank = config[1].value.intval; num_contexts = config[2].value.intval; if (world_rank==0) printf("hello world from rank %ld of %ld, number of contexts %zu \n", world_rank, world_size, num_contexts );/*IBM: debug num_contexts */ fflush(stdout); /* initialize the contexts */ pami_context_t * contexts; contexts = (pami_context_t *) safemalloc( num_contexts * sizeof(pami_context_t) ); result = PAMI_Context_createv( client, NULL, 0, contexts, num_contexts ); TEST_ASSERT(result == PAMI_SUCCESS,"PAMI_Context_createv"); /************************************************************************/ /* IBM: Updating the test with the assumption that we will Rput the */ /* IBM: local byte array to our neighbor's shared byte array. */ int n = (argc>1 ? atoi(argv[1]) : 1000); size_t bytes = n * sizeof(int), bytes_out;/* IBM: debug - scale up testing */ int * shared = (int *) safemalloc(bytes); for (int i=0; i<n; i++) shared[i] = -1; /*IBM: initialize with -1, replaced with neighbor's rank */ pami_memregion_t shared_mr; result = PAMI_Memregion_create(contexts[0], shared, bytes, &bytes_out, &shared_mr); TEST_ASSERT(result == PAMI_SUCCESS && bytes==bytes_out,"PAMI_Memregion_create"); int * local = (int *) safemalloc(bytes); for (int i=0; i<n; i++) local[i] = world_rank; /*IBM: initialize with our rank */ pami_memregion_t local_mr; result = PAMI_Memregion_create(contexts[0], local, bytes, &bytes_out, &local_mr); /* IBM: local */ TEST_ASSERT(result == PAMI_SUCCESS && bytes==bytes_out,"PAMI_Memregion_create"); status = MPI_Barrier(MPI_COMM_WORLD); TEST_ASSERT(status == MPI_SUCCESS,"MPI_Barrier"); pami_memregion_t * shmrs = (pami_memregion_t *) safemalloc( world_size * sizeof(pami_memregion_t) ); status = MPI_Allgather(&shared_mr, sizeof(pami_memregion_t), MPI_BYTE, shmrs, sizeof(pami_memregion_t), MPI_BYTE, MPI_COMM_WORLD); TEST_ASSERT(status == MPI_SUCCESS,"MPI_Allgather"); int target = (world_rank>0 ?
world_rank-1 : world_size-1); pami_endpoint_t target_ep; result = PAMI_Endpoint_create(client, (pami_task_t) target, 0, &target_ep); TEST_ASSERT(result == PAMI_SUCCESS,"PAMI_Endpoint_create"); int active = 2; pami_rput_simple_t parameters; parameters.rma.dest = target_ep; parameters.rma.bytes = bytes; parameters.rma.cookie = &active; parameters.rma.done_fn = cb_done; parameters.rdma.local.mr = &local_mr; parameters.rdma.local.offset = 0; parameters.rdma.remote.mr = &shmrs[target]; /*IBM: target's mem region */ parameters.rdma.remote.offset = 0; parameters.put.rdone_fn = cb_done; result = PAMI_Rput(contexts[0], &parameters); TEST_ASSERT(result == PAMI_SUCCESS,"PAMI_Rput"); while (active) { //result = PAMI_Context_advance( contexts[0], 100); //TEST_ASSERT(result == PAMI_SUCCESS,"PAMI_Context_advance"); result = PAMI_Context_trylock_advancev(contexts, 1, 1000); TEST_ASSERT(result == PAMI_SUCCESS,"PAMI_Context_trylock_advancev"); } /* IBM: I'm done with Rput but my world_rank + 1 neighbor might not be so need to advance */ /* IBM: Could do a barrier or send/recv a completion message instead ....*/ active = 10; /* IBM: Arbitrary - advance some more - 10*10000 good enough? */ while (--active) /* IBM*/ { /* IBM*/ result = PAMI_Context_trylock_advancev(contexts, 1, 10000); /* IBM*/ /*TEST_ASSERT(result == PAMI_SUCCESS,"PAMI_Context_trylock_advancev");*/ /* IBM*/ } /* IBM*/ int errors = 0; target = (world_rank<(world_size-1) ? world_rank+1 : 0); for (int i=0; i<n; i++) if ((shared[i] != target) || (local[i] != world_rank)) /*IBM: also verify didn't change local */ errors++; if (errors>0) { printf("%ld: %d errors :-( \n", (long)world_rank, errors); /*IBM: grep "errors" in scaled up output */ for (int i=0; i<n; i++) printf("%ld: local[%d] = %d , shared[%d] = %d (%d) \n", (long)world_rank, i, local[i], i, shared[i], target); /*IBM: print both arrays */ } else printf("%ld: no errors :-) \n", (long)world_rank); MPI_Barrier(MPI_COMM_WORLD); result = PAMI_Memregion_destroy(contexts[0], &shared_mr); TEST_ASSERT(result == PAMI_SUCCESS,"PAMI_Memregion_destroy"); result = PAMI_Memregion_destroy(contexts[0], &local_mr); TEST_ASSERT(result == PAMI_SUCCESS,"PAMI_Memregion_destroy"); free(shmrs); free(local); free(shared); /************************************************************************/ /* finalize the contexts */ result = PAMI_Context_destroyv( contexts, num_contexts ); TEST_ASSERT(result == PAMI_SUCCESS,"PAMI_Context_destroyv"); free(contexts); /* finalize the client */ result = PAMI_Client_destroy( &client ); TEST_ASSERT(result == PAMI_SUCCESS,"PAMI_Client_destroy"); status = MPI_Barrier(MPI_COMM_WORLD); TEST_ASSERT(status == MPI_SUCCESS,"MPI_Barrier"); MPI_Finalize(); if (world_rank==0) printf("%ld: end of test \n", world_rank ); fflush(stdout); return 0; }
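/* [editor's sketch, not from the original sources] The Rput above sets
   active = 2 because it registers two callbacks (local done_fn plus remote
   completion rdone_fn) and then advances the context until both have fired.
   The callback itself just decrements the cookie; cb_done is assumed to look
   roughly like this (stand-in types, since pami.h defines the real ones): */
#include <stdio.h>

typedef void *context_t;     /* stand-in for pami_context_t */
typedef int   result_t;      /* stand-in for pami_result_t  */

static void cb_done(context_t ctx, void *cookie, result_t result)
{
  int *active = (int *)cookie;
  --(*active);               /* one completion event observed */
  (void)ctx; (void)result;   /* unused in this sketch */
}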
/************************************************************************* * This function compacts a graph by removing the vertex separator **************************************************************************/ void CompactGraph(CtrlType *ctrl, GraphType *graph, idxtype *perm, WorkSpaceType *wspace) { int i, j, l, nvtxs, cnvtxs, cfirstvtx, nparts, npes; idxtype *xadj, *ladjncy, *adjwgt, *vtxdist, *where; idxtype *cmap, *cvtxdist, *newwhere; nparts = ctrl->nparts; npes = ctrl->npes; nvtxs = graph->nvtxs; xadj = graph->xadj; ladjncy = graph->adjncy; adjwgt = graph->adjwgt; where = graph->where; if (graph->cmap == NULL) graph->cmap = idxmalloc(nvtxs+graph->nrecv, "CompactGraph: cmap"); cmap = graph->cmap; vtxdist = graph->vtxdist; /************************************************************* * Construct the cvtxdist of the contracted graph. Uses the fact * that lpwgts stores the local non separator vertices. **************************************************************/ cvtxdist = wspace->pv1; cnvtxs = cvtxdist[npes] = idxsum(nparts, graph->lpwgts); MPI_Allgather((void *)(cvtxdist+npes), 1, IDX_DATATYPE, (void *)cvtxdist, 1, IDX_DATATYPE, ctrl->comm); MAKECSR(i, npes, cvtxdist); #ifdef DEBUG_ORDER PrintVector(ctrl, npes+1, 0, cvtxdist, "cvtxdist"); #endif /************************************************************* * Construct the cmap vector **************************************************************/ cfirstvtx = cvtxdist[ctrl->mype]; /* Create the cmap of what you know so far locally */ for (cnvtxs=0, i=0; i<nvtxs; i++) { if (where[i] < nparts) { perm[cnvtxs] = perm[i]; cmap[i] = cfirstvtx + cnvtxs++; } } CommInterfaceData(ctrl, graph, cmap, wspace->indices, cmap+nvtxs); /************************************************************* * Finally, compact the graph **************************************************************/ newwhere = idxmalloc(cnvtxs, "CompactGraph: newwhere"); cnvtxs = l = 0; for (i=0; i<nvtxs; i++) { if (where[i] < nparts) { for (j=xadj[i]; j<xadj[i+1]; j++) { if (where[i] == where[ladjncy[j]]) { ladjncy[l] = cmap[ladjncy[j]]; adjwgt[l++] = adjwgt[j]; } #ifdef DEBUG_ORDER else if (where[ladjncy[j]] < nparts) printf("It seems that the separation has failed: %d %d\n", where[i], where[ladjncy[j]]); #endif } xadj[cnvtxs] = l; graph->vwgt[cnvtxs] = graph->vwgt[i]; newwhere[cnvtxs] = where[i]; cnvtxs++; } } for (i=cnvtxs; i>0; i--) xadj[i] = xadj[i-1]; xadj[0] = 0; GKfree((void **)&graph->match, (void **)&graph->cmap, (void **)&graph->lperm, (void **)&graph->where, (void **)&graph->label, (void **)&graph->rinfo, (void **)&graph->nrinfo, (void **)&graph->lpwgts, (void **)&graph->gpwgts, (void **)&graph->sepind, (void **)&graph->peind, (void **)&graph->sendptr, (void **)&graph->sendind, (void **)&graph->recvptr, (void **)&graph->recvind, (void **)&graph->imap, (void **)&graph->rlens, (void **)&graph->slens, (void **)&graph->rcand, (void **)&graph->pexadj, (void **)&graph->peadjncy, (void **)&graph->peadjloc, LTERM); graph->nvtxs = cnvtxs; graph->nedges = l; graph->gnvtxs = cvtxdist[npes]; idxcopy(npes+1, cvtxdist, graph->vtxdist); graph->where = newwhere; }
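/* [editor's sketch, not from the original sources] CompactGraph above gathers
   each process's local count into cvtxdist and then calls MAKECSR, which
   presumably performs the in-place exclusive prefix sum that turns counts into
   a vertex distribution. The same end-to-end construction in plain MPI C: */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
  int rank, npes;
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &npes);

  int *vtxdist = malloc((npes + 1) * sizeof(int));
  int mycount = 5 + rank;                  /* stand-in for the local vertex count */

  /* gather every rank's count into slots 0..npes-1 */
  MPI_Allgather(&mycount, 1, MPI_INT, vtxdist, 1, MPI_INT, MPI_COMM_WORLD);

  /* exclusive prefix sum: vtxdist[p] = first global vertex owned by rank p */
  int sum = 0;
  for (int p = 0; p < npes; ++p) { int t = vtxdist[p]; vtxdist[p] = sum; sum += t; }
  vtxdist[npes] = sum;

  printf("rank %d owns global vertices [%d, %d)\n", rank, vtxdist[rank], vtxdist[rank + 1]);
  free(vtxdist);
  MPI_Finalize();
  return 0;
}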
PetscErrorCode MatGetMultiProcBlock_MPIAIJ(Mat mat, MPI_Comm subComm, MatReuse scall,Mat *subMat)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *aij  = (Mat_MPIAIJ*)mat->data;
  Mat_SeqAIJ     *aijB = (Mat_SeqAIJ*)aij->B->data;
  PetscMPIInt    commRank,subCommSize,subCommRank;
  PetscMPIInt    *commRankMap,subRank,rank,commsize;
  PetscInt       *garrayCMap,col,i,j,*nnz,newRow,newCol;

  PetscFunctionBegin;
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)mat),&commsize);CHKERRQ(ierr);
  ierr = MPI_Comm_size(subComm,&subCommSize);CHKERRQ(ierr);

  /* create subMat object with the relevant layout */
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = MatCreate(subComm,subMat);CHKERRQ(ierr);
    ierr = MatSetType(*subMat,MATMPIAIJ);CHKERRQ(ierr);
    ierr = MatSetSizes(*subMat,mat->rmap->n,mat->cmap->n,PETSC_DECIDE,PETSC_DECIDE);CHKERRQ(ierr);
    ierr = MatSetBlockSizesFromMats(*subMat,mat,mat);CHKERRQ(ierr);
    /* need to setup rmap and cmap before Preallocation */
    ierr = PetscLayoutSetUp((*subMat)->rmap);CHKERRQ(ierr);
    ierr = PetscLayoutSetUp((*subMat)->cmap);CHKERRQ(ierr);
  }

  /* create a map of comm_rank from subComm to comm - should commRankMap and
     garrayCMap be kept for reuse? */
  ierr = MPI_Comm_rank(PetscObjectComm((PetscObject)mat),&commRank);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(subComm,&subCommRank);CHKERRQ(ierr);
  ierr = PetscMalloc1(subCommSize,&commRankMap);CHKERRQ(ierr);
  ierr = MPI_Allgather(&commRank,1,MPI_INT,commRankMap,1,MPI_INT,subComm);CHKERRQ(ierr);

  /* Traverse garray and identify column indices [of offdiag mat] that
     should be discarded. For the ones not discarded, store the newCol+1
     value in garrayCMap */
  ierr = PetscCalloc1(aij->B->cmap->n,&garrayCMap);CHKERRQ(ierr);
  for (i=0; i<aij->B->cmap->n; i++) {
    col = aij->garray[i];
    for (subRank=0; subRank<subCommSize; subRank++) {
      rank = commRankMap[subRank];
      if ((col >= mat->cmap->range[rank]) && (col < mat->cmap->range[rank+1])) {
        garrayCMap[i] = (*subMat)->cmap->range[subRank] + col - mat->cmap->range[rank]+1;
        break;
      }
    }
  }

  if (scall == MAT_INITIAL_MATRIX) {
    /* Now compute preallocation for the offdiag mat */
    ierr = PetscCalloc1(aij->B->rmap->n,&nnz);CHKERRQ(ierr);
    for (i=0; i<aij->B->rmap->n; i++) {
      for (j=aijB->i[i]; j<aijB->i[i+1]; j++) {
        if (garrayCMap[aijB->j[j]]) nnz[i]++;
      }
    }
    ierr = MatMPIAIJSetPreallocation(*(subMat),0,NULL,0,nnz);CHKERRQ(ierr);

    /* reuse diag block with the new submat */
    ierr = MatDestroy(&((Mat_MPIAIJ*)((*subMat)->data))->A);CHKERRQ(ierr);

    ((Mat_MPIAIJ*)((*subMat)->data))->A = aij->A;

    ierr = PetscObjectReference((PetscObject)aij->A);CHKERRQ(ierr);
  } else if (((Mat_MPIAIJ*)(*subMat)->data)->A != aij->A) {
    PetscObject obj = (PetscObject)((Mat_MPIAIJ*)((*subMat)->data))->A;
    ierr = PetscObjectReference((PetscObject)obj);CHKERRQ(ierr);

    ((Mat_MPIAIJ*)((*subMat)->data))->A = aij->A;

    ierr = PetscObjectReference((PetscObject)aij->A);CHKERRQ(ierr);
  }

  /* Now traverse aij->B and insert values into subMat */
  for (i=0; i<aij->B->rmap->n; i++) {
    newRow = (*subMat)->rmap->range[subCommRank] + i;
    for (j=aijB->i[i]; j<aijB->i[i+1]; j++) {
      newCol = garrayCMap[aijB->j[j]];
      if (newCol) {
        newCol--; /* remove the increment */
        ierr = MatSetValues(*subMat,1,&newRow,1,&newCol,(aijB->a+j),INSERT_VALUES);CHKERRQ(ierr);
      }
    }
  }

  /* assemble the submat */
  ierr = MatAssemblyBegin(*subMat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
  ierr = MatAssemblyEnd(*subMat,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

  /* deallocate temporary data */
  ierr = PetscFree(commRankMap);CHKERRQ(ierr);
  ierr = PetscFree(garrayCMap);CHKERRQ(ierr);
  if (scall == MAT_INITIAL_MATRIX) {
    ierr = PetscFree(nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
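/* The +1 written into garrayCMap above is a sentinel scheme: PetscCalloc1
 * zero-fills the map, so 0 means "column not owned by the sub-communicator"
 * while any positive entry stores newCol+1. A minimal decode sketch with a
 * hypothetical helper name, mirroring the newCol-- step in the insert loop: */
static PetscInt DecodeGarrayCMap(const PetscInt *garrayCMap, PetscInt j, PetscBool *kept)
{
  *kept = (PetscBool)(garrayCMap[j] != 0); /* 0 marks a discarded column */
  return garrayCMap[j] - 1;                /* valid only when *kept is true */
}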
/*-------------------------------------------------------------------------*/
int FTI_InitMpiICP(FTIT_configuration* FTI_Conf, FTIT_execution* FTI_Exec,
                   FTIT_topology* FTI_Topo, FTIT_checkpoint* FTI_Ckpt,
                   FTIT_dataset* FTI_Data)
{
    int res;
    FTI_Print("I/O mode: MPI-IO.", FTI_DBUG);
    char str[FTI_BUFS], mpi_err[FTI_BUFS];

    // enable collective buffer optimization
    MPI_Info info;
    MPI_Info_create(&info);
    MPI_Info_set(info, "romio_cb_write", "enable");

    /*
     * update ckpt file name (necessary for the restart!)
     * not very nice TODO we should think about another mechanism
     */
    snprintf(FTI_Exec->meta[0].ckptFile, FTI_BUFS,
             "Ckpt%d-Rank%d.fti", FTI_Exec->ckptID, FTI_Topo->myRank);

    // TODO enable setting the striping unit in the config file (maybe also other hints)
    // set striping unit to 4MB
    MPI_Info_set(info, "striping_unit", "4194304");

    char gfn[FTI_BUFS], ckptFile[FTI_BUFS];
    snprintf(ckptFile, FTI_BUFS, "Ckpt%d-mpiio.fti", FTI_Exec->ckptID);
    snprintf(gfn, FTI_BUFS, "%s/%s", FTI_Conf->gTmpDir, ckptFile);

    // open parallel file (collective call)
    MPI_File pfh;

#ifdef LUSTRE
    if (FTI_Topo->splitRank == 0) {
        res = llapi_file_create(gfn, FTI_Conf->stripeUnit, FTI_Conf->stripeOffset,
                                FTI_Conf->stripeFactor, 0);
        if (res) {
            char error_msg[FTI_BUFS];
            error_msg[0] = 0;
            strerror_r(-res, error_msg, FTI_BUFS);
            snprintf(str, FTI_BUFS, "[Lustre] %s.", error_msg);
            FTI_Print(str, FTI_WARN);
        } else {
            snprintf(str, FTI_BUFS, "[LUSTRE] file:%s striping_unit:%i striping_factor:%i striping_offset:%i",
                     ckptFile, FTI_Conf->stripeUnit, FTI_Conf->stripeFactor,
                     FTI_Conf->stripeOffset);
            FTI_Print(str, FTI_DBUG);
        }
    }
#endif
    res = MPI_File_open(FTI_COMM_WORLD, gfn, MPI_MODE_WRONLY|MPI_MODE_CREATE, info, &pfh);

    // check if successful
    if (res != 0) {
        errno = 0;
        int reslen;
        MPI_Error_string(res, mpi_err, &reslen);
        snprintf(str, FTI_BUFS, "unable to create file %s [MPI ERROR - %i] %s", gfn, res, mpi_err);
        FTI_Print(str, FTI_EROR);
        return FTI_NSCS;
    }

    MPI_Offset chunkSize = FTI_Exec->ckptSize;

    // collect chunksizes of other ranks
    MPI_Offset* chunkSizes = talloc(MPI_Offset, FTI_Topo->nbApprocs * FTI_Topo->nbNodes);
    MPI_Allgather(&chunkSize, 1, MPI_OFFSET, chunkSizes, 1, MPI_OFFSET, FTI_COMM_WORLD);

    // set file offset
    MPI_Offset offset = 0;
    int i;
    for (i = 0; i < FTI_Topo->splitRank; i++) {
        offset += chunkSizes[i];
    }
    free(chunkSizes);

    FTI_Exec->iCPInfo.offset = offset;
    memcpy( FTI_Exec->iCPInfo.fh, &pfh, sizeof(FTI_MI_FH) );
    MPI_Info_free(&info);
    return FTI_SCES;
}
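/* The Allgather-plus-partial-sum above derives each rank's write offset from
 * every rank's chunk size. A minimal alternative sketch using MPI_Exscan,
 * which computes the same exclusive prefix sum without the O(nranks) buffer
 * (hypothetical helper name; same communicator assumed): */
static MPI_Offset icp_offset_by_exscan(MPI_Offset chunkSize, MPI_Comm comm)
{
    MPI_Offset offset = 0; /* MPI_Exscan leaves rank 0's recvbuf undefined, so pre-zero it */
    MPI_Exscan(&chunkSize, &offset, 1, MPI_OFFSET, MPI_SUM, comm);
    return offset;
}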
int Zoltan_Preprocess_Graph( ZZ *zz, /* Zoltan structure */ ZOLTAN_ID_PTR *global_ids, ZOLTAN_ID_PTR *local_ids, ZOLTAN_Third_Graph *gr, /* Graph for third party libs */ ZOLTAN_Third_Geom *geo, ZOLTAN_Third_Part *prt, ZOLTAN_Third_Vsize *vsp ) { static char *yo = "Zoltan_Preprocess_Graph"; int ierr; float *float_vwgt=NULL, *float_ewgts=NULL; char msg[256]; ZG *graph = &(gr->graph); int local; char add_obj_weight[MAX_PARAM_STRING_LEN+1]; ZOLTAN_TRACE_ENTER(zz, yo); /* Initialize all local pointers to NULL. This is necessary * because we free all non-NULL pointers upon errors. */ gr->vtxdist = gr->xadj = gr->adjncy = NULL; gr->vwgt = gr->ewgts = NULL; float_vwgt = float_ewgts = NULL; if (gr->obj_wgt_dim >= 0) { /* Check weight dimensions */ if (zz->Obj_Weight_Dim<0){ sprintf(msg, "Object weight dimension is %d, " "but should be >= 0. Using Obj_Weight_Dim = 0.", zz->Obj_Weight_Dim); ZOLTAN_PRINT_WARN(zz->Proc, yo, msg); gr->obj_wgt_dim = 0; } else { gr->obj_wgt_dim = zz->Obj_Weight_Dim; } } else gr->obj_wgt_dim = 0; if (gr->edge_wgt_dim >= 0) { if (zz->Edge_Weight_Dim<0){ sprintf(msg, "Edge weight dimension is %d, " "but should be >= 0. Using Edge_Weight_Dim = 0.", zz->Edge_Weight_Dim); ZOLTAN_PRINT_WARN(zz->Proc, yo, msg); gr->edge_wgt_dim = 0; } else if (zz->Edge_Weight_Dim>1){ ZOLTAN_PRINT_WARN(zz->Proc, yo, "This method does not support " "multidimensional edge weights. Using Edge_Weight_Dim = 1."); gr->edge_wgt_dim = 1; } else { gr->edge_wgt_dim = zz->Edge_Weight_Dim; } } else gr->edge_wgt_dim = 0; /* Default graph type is GLOBAL. */ /* Get parameter options shared by ParMetis and Jostle */ gr->check_graph = 1; /* default */ gr->scatter = 1; /* default */ gr->final_output = 0; strcpy(add_obj_weight, "NONE"); /* default */ Zoltan_Bind_Param(Graph_params, "CHECK_GRAPH", (void *) &gr->check_graph); Zoltan_Bind_Param(Graph_params, "SCATTER_GRAPH", (void *) &gr->scatter); Zoltan_Bind_Param(Graph_params, "FINAL_OUTPUT", (void *) &gr->final_output); Zoltan_Bind_Param(Graph_params, "ADD_OBJ_WEIGHT", (void *) add_obj_weight); Zoltan_Assign_Param_Vals(zz->Params, Graph_params, zz->Debug_Level, zz->Proc, zz->Debug_Proc); /* If reorder is true, we already have the id lists. Ignore weights. */ if ((*global_ids == NULL) || (!gr->id_known)){ int *input_part = NULL; ierr = Zoltan_Get_Obj_List(zz, &gr->num_obj, global_ids, local_ids, gr->obj_wgt_dim, &float_vwgt, &input_part); CHECK_IERR; if (prt) { prt->input_part = input_part; } else if (input_part) { /* Ordering, don't need part */ ZOLTAN_FREE(&input_part); } if (ierr){ /* Return error */ ZOLTAN_PARMETIS_ERROR(ierr, "Get_Obj_List returned error."); } } /* Build Graph for third party library data structures, or just get vtxdist.
*/ if (gr->get_data) { local = IS_LOCAL_GRAPH(gr->graph_type); ZOLTAN_FREE(&float_vwgt); ierr = Zoltan_ZG_Build (zz, graph, local); /* Normal graph */ CHECK_IERR; ierr = Zoltan_ZG_Export (zz, graph, &gr->num_obj, &gr->num_obj, &gr->obj_wgt_dim, &gr->edge_wgt_dim, &gr->vtxdist, &gr->xadj, &gr->adjncy, &gr->adjproc, &float_vwgt, &float_ewgts, NULL); /* TODO: support graph redistribution */ /* if (prt) */ /* ierr = Zoltan_ZG_Vertex_Info(zz, &graph, global_ids, &prt->input_part); */ /* else */ /* ierr = Zoltan_ZG_Vertex_Info(zz, &graph, global_ids, NULL); */ /* /\* Just to try *\/ */ /* if (prt) { */ /* prt->input_part = (int *)ZOLTAN_CALLOC(gr->num_obj, sizeof(int)); */ /* } */ if (ierr != ZOLTAN_OK && ierr != ZOLTAN_WARN){ ZOLTAN_PARMETIS_ERROR(ierr, "Zoltan_Build_Graph returned error."); } } else{ /* Only geometry */ int i; /* No graph but still needs vtxdist*/ gr->vtxdist = (int*) ZOLTAN_MALLOC ((zz->Num_Proc+1)*sizeof(int)); if (gr->vtxdist == NULL) ZOLTAN_PARMETIS_ERROR(ZOLTAN_MEMERR, "Out of memory."); gr->vtxdist[0] = 0; MPI_Allgather(&gr->num_obj, 1, MPI_INT, gr->vtxdist+1, 1, MPI_INT, zz->Communicator); for (i=1 ; i <= zz->Num_Proc ; ++i) { gr->vtxdist[i] += gr->vtxdist[i-1]; } } if (prt) { prt->part_sizes = prt->input_part_sizes; /* ParMETIS needs prt->part to be allocated, even when num_obj=0. */ /* See Zoltan bug 4299. */ prt->part = (indextype *)ZOLTAN_MALLOC((gr->num_obj+1) * sizeof(indextype)); if (!prt->part) { /* Not enough memory */ ZOLTAN_PARMETIS_ERROR(ZOLTAN_MEMERR, "Out of memory."); } if (gr->num_obj >0) { memcpy (prt->part, prt->input_part, (gr->num_obj) * sizeof(indextype)); } else { prt->input_part = NULL; } } /* Convert from float. */ /* Get vertex weights if needed */ if (gr->obj_wgt_dim){ ierr = Zoltan_Preprocess_Scale_Weights (gr, float_vwgt, &gr->vwgt, gr->num_obj, gr->obj_wgt_dim, 1, zz, "vertex", gr->vtxdist[zz->Proc]); if (ierr != ZOLTAN_OK && ierr != ZOLTAN_WARN){ /* Return error code */ ZOLTAN_PARMETIS_ERROR(ierr, "Error in scaling of weights."); } ZOLTAN_FREE(&float_vwgt); } if (strcasecmp(add_obj_weight, "NONE")){ if (Zoltan_Preprocess_Add_Weight(zz, gr, prt, add_obj_weight) != ZOLTAN_OK) ZOLTAN_PARMETIS_ERROR(ierr, "Error in adding vertex weights."); } /* Get edge weights if needed */ if (gr->get_data) gr->num_edges = gr->xadj[gr->num_obj]; else { gr->num_edges = 0; gr->edge_wgt_dim = 0; } if (gr->edge_wgt_dim){ ierr = Zoltan_Preprocess_Scale_Weights (gr, float_ewgts, &gr->ewgts, gr->num_edges, gr->edge_wgt_dim, 1, zz, "edge", 0); if (ierr != ZOLTAN_OK && ierr != ZOLTAN_WARN){ /* Return error code */ ZOLTAN_PARMETIS_ERROR(ierr, "Error in scaling of weights."); } if (!gr->final_output) { ZOLTAN_FREE(&float_ewgts); } else gr->float_ewgts = float_ewgts; } else { ZOLTAN_FREE(&float_ewgts); } if (geo){ ierr = Zoltan_Preprocess_Extract_Geom (zz, global_ids, local_ids, gr, geo); if (ierr) { ZOLTAN_PARMETIS_ERROR(ZOLTAN_FATAL, "Error returned from Zoltan_Preprocess_Extract_Geom"); } } if (vsp) { ierr = Zoltan_Preprocess_Extract_Vsize (zz, global_ids, local_ids, gr, vsp); if (ierr) { ZOLTAN_PARMETIS_ERROR(ZOLTAN_FATAL, "Error returned from Zoltan_Preprocess_Extract_Vsize"); } } /* Scatter graph? * If data distribution is highly imbalanced, it is better to * redistribute the graph data structure before calling ParMetis. * After partitioning, the results must be mapped back. 
*/ if (gr->scatter < gr->scatter_min) gr->scatter = gr->scatter_min; if (gr->scatter>0) { ierr = Zoltan_Preprocess_Scatter_Graph (zz, gr, prt, geo, vsp); if (ierr != ZOLTAN_OK && ierr != ZOLTAN_WARN) { ZOLTAN_PARMETIS_ERROR(ZOLTAN_FATAL, "Error returned from Zoltan_Preprocess_Scatter_Graph"); } } /* Verify that graph is correct */ if (gr->get_data){ int flag; if (zz->Debug_Level >= ZOLTAN_DEBUG_ALL) flag = 2; /* Maximum output level */ else flag = 1; /* Medium output level */ ierr = Zoltan_Verify_Graph(zz->Communicator, gr->vtxdist, gr->xadj, gr->adjncy, gr->vwgt, gr->ewgts, gr->obj_wgt_dim, gr->edge_wgt_dim, gr->graph_type, gr->check_graph, flag); } End: ZOLTAN_TRACE_EXIT(zz, yo); return (ierr); }
static int gather_and_build_remap( ZZ *zz, int *new_map, /* Upon return, flag indicating whether parts assignments were changed due to remap. */ int HEcnt, /* # of HEs allocated. */ int *HEinfo /* Array of HE info; for each HE, two pins and one edge weight. Stored as a single vector to minimize communication calls. */ ) { char *yo = "gather_and_remap"; int ierr = ZOLTAN_OK; int i, uidx, tmp; int *each_size = NULL; /* sizes (# HEs * HEINFO_ENTRIES) for each proc */ int *recvbuf = NULL; /* Receive buffer for gatherv */ int *displs = NULL; /* Displacement buffer for gatherv */ int send_size; /* Local # HEs * HEINFO_ENTRIES */ int total_size; /* Total # ints in gatherv */ int total_HEcnt; /* Total (across all procs) number of HEs. */ int max0, max1; /* Max values of pin 0 and pin 1 for each HE. */ int *match = NULL; /* Vector describing the matching. match[i] = j ==> match[j] = i ==> vertices i and j are matched. */ int *used = NULL; /* Vector indicating which partitions are used in the matching. */ int limit; /* Maximum number of matches that are allowed */ HGraph hg; /* Hypergraph for matching */ float before_remap = 0, /* Amount of data that overlaps between old and */ after_remap = 0; /* new decomposition before and after remapping, respectively. */ float with_oldremap = 0; /* Amount of data that overlaps between old and new decomposition using the OldRemap vector (remapping from the previous decomposition). */ /* Gather HEs from each processor into a local complete HG. */ each_size = (int *) ZOLTAN_MALLOC(zz->Num_Proc * sizeof(int)); if (!each_size) { ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Memory error."); ierr = ZOLTAN_MEMERR; goto End; } send_size = HEcnt * HEINFO_ENTRIES; MPI_Allgather(&send_size, 1, MPI_INT, each_size, 1, MPI_INT, zz->Communicator); for (total_size = 0, i = 0; i < zz->Num_Proc; i++) { total_size += each_size[i]; } recvbuf = (int *) ZOLTAN_MALLOC((zz->Num_Proc + total_size) * sizeof(int)); displs = recvbuf + total_size; if (!recvbuf) { ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Memory error."); ierr = ZOLTAN_MEMERR; goto End; } displs[0] = 0; for (i = 1; i < zz->Num_Proc; i++) displs[i] = displs[i-1] + each_size[i-1]; MPI_Allgatherv(HEinfo, send_size, MPI_INT, recvbuf, each_size, displs, MPI_INT, zz->Communicator); total_HEcnt = total_size / HEINFO_ENTRIES; for (max0 = -1, max1 = -1, i = 0; i < total_HEcnt; i++) { tmp = i * HEINFO_ENTRIES; if (recvbuf[tmp] > max0) max0 = recvbuf[tmp]; if (recvbuf[tmp+1] > max1) max1 = recvbuf[tmp+1]; } /* Increment max0 and max1 so that they are the maximum number of unique pin values for pin0 and pin1 respectively; i.e., allow pin value == 0. */ max0++; max1++; /* Sanity check */ /* Ideally, max1 should equal LB.Num_Global_Parts, but ParMETIS3 sometimes * does not return the correct number of non-empty partitions, allowing * max1 to be less than LB.Num_Global_Parts. * (e.g., ewgt.adaptive-partlocal1-v3.4.?). 
*/ if (max1 > zz->LB.Num_Global_Parts) ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Unexpected value for max1."); /* Set up global HG */ Zoltan_HG_HGraph_Init(&hg); if (total_HEcnt) { hg.nVtx = max0 + zz->LB.Num_Global_Parts; hg.nEdge = total_HEcnt; hg.nPins = total_HEcnt * 2; /* two pins per HE */ hg.EdgeWeightDim = 1; hg.ewgt = (float *) ZOLTAN_MALLOC(total_HEcnt * sizeof(float)); hg.hindex = (int *) ZOLTAN_MALLOC((total_HEcnt + 1) * sizeof(int)); hg.hvertex = (int *) ZOLTAN_MALLOC((hg.nPins) * sizeof(int)); if (!hg.ewgt || !hg.hindex || !hg.hvertex) { ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Memory error."); ierr = ZOLTAN_MEMERR; goto End; } for (i = 0; i < total_HEcnt; i++) { tmp = i * HEINFO_ENTRIES; hg.hindex[i] = i+i; hg.hvertex[i+i] = recvbuf[tmp]; hg.hvertex[i+i+1] = recvbuf[tmp+1]+max0; hg.ewgt[i] = recvbuf[tmp+2]; } hg.hindex[total_HEcnt] = total_HEcnt + total_HEcnt; ierr = Zoltan_HG_Create_Mirror(zz, &hg); if (ierr < 0) goto End; } before_remap = measure_stays(zz, &hg, max0, NULL, "BEFORE"); /* Compute the amount of overlap when using the old remap vector. */ with_oldremap = measure_stays(zz, &hg, max0, zz->LB.OldRemap, "WITHOLD"); /* Do matching */ match = (int *) ZOLTAN_CALLOC(hg.nVtx + zz->LB.Num_Global_Parts, sizeof(int)); used = match + hg.nVtx; if (hg.nVtx && !match) { ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Memory error."); ierr = ZOLTAN_MEMERR; goto End; } /* Max # matches allowed */ limit = (max0 < zz->LB.Num_Global_Parts ? max0 : zz->LB.Num_Global_Parts); do_match(zz, &hg, match, limit); /* Build remapping vector, if non-trivial matching was returned. */ *new_map = 0; for (i = 0; i < zz->LB.Num_Global_Parts; i++) if (match[i+max0] != i+max0) { *new_map = 1; break; } if (*new_map) { zz->LB.Remap = (int *) ZOLTAN_MALLOC(zz->LB.Num_Global_Parts * sizeof(int)); if (!(zz->LB.Remap)) { ZOLTAN_PRINT_ERROR(zz->Proc, yo, "Memory error."); ierr = ZOLTAN_MEMERR; goto End; } /* First, process all parts that were matched. Mark matched parts as used.*/ for (i = 0; i < zz->LB.Num_Global_Parts; i++) { zz->LB.Remap[i] = -1; tmp = match[i+max0]; if (tmp != i+max0) { zz->LB.Remap[i] = tmp; used[tmp] = 1; } } /* Second, process unmatched parts; if possible, keep same part number. */ for (i = 0; i < zz->LB.Num_Global_Parts; i++) { if (zz->LB.Remap[i] > -1) continue; /* Already processed part i */ /* match[i+max0] == i+max0 */ if (!used[i]) { /* Keep the same part number if it is not used */ zz->LB.Remap[i] = i; used[i] = 1; } } /* Third, process remaining unmatched parts; assign them to unused partitions.*/ for (uidx = 0, i = 0; i < zz->LB.Num_Global_Parts; i++) { if (zz->LB.Remap[i] > -1) continue; /* Already processed part i */ /* match[i+max0] == i+max0 */ while (used[uidx]) uidx++; /* Find next unused partition */ zz->LB.Remap[i] = uidx; used[uidx] = 1; } } if (*new_map) after_remap = measure_stays(zz, &hg, max0, zz->LB.Remap, "AFTER "); if ((before_remap >= after_remap) && (before_remap >= with_oldremap)) { /* No benefit from remapping; don't keep it! */ ZOLTAN_FREE(&zz->LB.Remap); ZOLTAN_FREE(&zz->LB.OldRemap); *new_map = 0; } else if (with_oldremap >= after_remap) { /* The old remap vector is better than the new one; keep the old one. */ ZOLTAN_FREE(&zz->LB.Remap); zz->LB.Remap = zz->LB.OldRemap; zz->LB.OldRemap = NULL; *new_map = 1; } else { /* Going to use the new remap vector; free the old one. 
*/ ZOLTAN_FREE(&zz->LB.OldRemap); } if (zz->Debug_Level >= ZOLTAN_DEBUG_ALL && zz->Proc == zz->Debug_Proc && zz->LB.Remap) for (i = 0; i < zz->LB.Num_Global_Parts; i++) printf("%d REMAP Part %d to Part %d\n", zz->Proc, i, zz->LB.Remap[i]); End: ZOLTAN_FREE(&match); ZOLTAN_FREE(&each_size); ZOLTAN_FREE(&recvbuf); Zoltan_HG_HGraph_Free(&hg); return ierr; }
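/* The gather stage in gather_and_build_remap above is the standard
 * variable-size Allgather idiom: Allgather each rank's element count,
 * prefix-sum the counts into displacements, then Allgatherv the payload.
 * A minimal self-contained sketch of the same idiom (hypothetical helper
 * name; int payloads; caller frees the returned buffer): */
static int *allgatherv_ints(const int *mine, int mycount, int nprocs,
                            MPI_Comm comm, int *total_out)
{
  int *counts = (int *) malloc(nprocs * sizeof(int));
  int *displs = (int *) malloc(nprocs * sizeof(int));
  MPI_Allgather(&mycount, 1, MPI_INT, counts, 1, MPI_INT, comm);

  int total = 0;
  for (int i = 0; i < nprocs; i++) { /* exclusive prefix sum of the counts */
    displs[i] = total;
    total += counts[i];
  }

  int *all = (int *) malloc(total * sizeof(int));
  MPI_Allgatherv((void *)mine, mycount, MPI_INT,
                 all, counts, displs, MPI_INT, comm);

  free(counts);
  free(displs);
  *total_out = total;
  return all;
}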
/*************************************************** **** K-SPACE CONTRIBUTION ***************************************************/ void ewald_compute_kspace(ewald_data_struct* d, fcs_int num_particles, fcs_float *positions, fcs_float *charges, fcs_float *fields, fcs_float *potentials) { FCS_INFO(fprintf(stderr, "ewald_compute_kspace started...\n")); /* DISTRIBUTE ALL PARTICLE DATA TO ALL NODES */ /* Gather all particle numbers */ int node_num_particles = num_particles; int node_particles[d->comm_size]; int node_particles3[d->comm_size]; int displs[d->comm_size]; int displs3[d->comm_size]; fcs_int total_particles; /* printf("%d: num_particles=%d\n", d->comm_rank, num_particles); */ MPI_Allgather(&node_num_particles, 1, MPI_INT, node_particles, 1, MPI_INT, d->comm); /* compute displacements for MPI_Gatherv */ total_particles = node_particles[0]; node_particles3[0] = node_particles[0]*3; displs[0] = 0; displs3[0] = 0; for (fcs_int i=1; i < d->comm_size; i++) { total_particles += node_particles[i]; node_particles3[i] = node_particles[i]*3; displs[i] = displs[i-1] + node_particles[i-1]; displs3[i] = displs3[i-1] + node_particles3[i-1]; } fcs_float *all_positions, *all_charges, *node_fields, *node_potentials; all_positions = malloc(sizeof(fcs_float) * 3 * total_particles); all_charges = malloc(sizeof(fcs_float) * total_particles); node_fields = malloc(sizeof(fcs_float) * 3 * total_particles); node_potentials = malloc(sizeof(fcs_float) * total_particles); /* gather all particle data at all nodes */ MPI_Allgatherv(positions, num_particles*3, FCS_MPI_FLOAT, all_positions, node_particles3, displs3, FCS_MPI_FLOAT, d->comm); MPI_Allgatherv(charges, num_particles, FCS_MPI_FLOAT, all_charges, node_particles, displs, FCS_MPI_FLOAT, d->comm); /* for (fcs_int i=0; i < total_particles; i++) { */ /* printf("%d: all_positions[%d]={%lf, %lf, %lf}\n", d->comm_rank, i, all_positions[i*3], all_positions[3*i+1], all_positions[3*i+2]); */ /* printf("%d: all_charges[%d]=%lf\n", d->comm_rank, i, all_charges[i]); */ /* } */ /* INIT ALGORITHM */ /* init fields and potentials */ if (fields != NULL) { for (fcs_int i=0; i < total_particles; i++) { node_fields[3*i] = 0.0; node_fields[3*i+1] = 0.0; node_fields[3*i+2] = 0.0; } } if (potentials != NULL) for (fcs_int i=0; i < total_particles; i++) node_potentials[i] = 0.0; /* COMPUTE FAR FIELDS */ /* evenly distribute the k-vectors onto all tasks */ fcs_int num_k_per_dir = 2*d->kmax+1; fcs_int num_k = num_k_per_dir * num_k_per_dir * num_k_per_dir; for (int k_ind = d->comm_rank; k_ind < num_k; k_ind += d->comm_size) { /* compute fields and potentials */ fcs_int nx = k_ind % num_k_per_dir - d->kmax; fcs_int ny = k_ind % (num_k_per_dir*num_k_per_dir) / num_k_per_dir - d->kmax; fcs_int nz = k_ind / (num_k_per_dir*num_k_per_dir) - d->kmax; if (nx*nx + ny*ny + nz*nz <= d->kmax*d->kmax) { /* system length vector L_vec */ const fcs_float Lx = d->box_l[0]; const fcs_float Ly = d->box_l[1]; const fcs_float Lz = d->box_l[2]; /* reciprocal vector k_vec */ const fcs_float kx = 2.0*M_PI*nx / Lx; const fcs_float ky = 2.0*M_PI*ny / Ly; const fcs_float kz = 2.0*M_PI*nz / Lz; /* reciprocal charge density rhohat */ fcs_float rhohat_re = 0.0; fcs_float rhohat_im = 0.0; /* compute Deserno, Holm (1998) eq. 
(8) */ for (fcs_int i=0; i < total_particles; i++) { /* charge q */ const fcs_float q = all_charges[i]; if (!fcs_float_is_zero(q)) { /* particle position r_vec */ const fcs_float rx = all_positions[3*i]; const fcs_float ry = all_positions[3*i+1]; const fcs_float rz = all_positions[3*i+2]; /* compute k_vec*r_vec */ fcs_float kr = kx*rx + ky*ry + kz*rz; /* rhohat = qi * exp(-i*k_vec*r_vec) */ rhohat_re += q * cos(kr); rhohat_im += q * -sin(kr); } } /* FCS_DEBUG(fprintf(stderr, " n_vec= (%d, %d, %d) rhohat_re=%e rhohat_im=%e\n", nx, ny, nz, rhohat_re, rhohat_im));*/ /* fetch influence function */ fcs_float g = d->G[linindex(abs(nx), abs(ny), abs(nz), d->kmax)]; for (fcs_int i=0; i < total_particles; i++) { /* particle position r_vec */ const fcs_float rx = all_positions[3*i]; const fcs_float ry = all_positions[3*i+1]; const fcs_float rz = all_positions[3*i+2]; /* compute k_vec*r_vec */ fcs_float kr = kx*rx + ky*ry + kz*rz; if (fields != NULL) { /* compute field at position of particle i compare to Deserno, Holm (1998) eq. (15) */ fcs_float fak1 = g * (rhohat_re*sin(kr) + rhohat_im*cos(kr)); node_fields[3*i] += kx * fak1; node_fields[3*i+1] += ky * fak1; node_fields[3*i+2] += kz * fak1; } if (potentials != NULL) { /* compute potential at position of particle i compare to Deserno, Holm (1998) eq. (9) */ node_potentials[i] += g * (rhohat_re*cos(kr) - rhohat_im*sin(kr)); } } } } /* printf("%d: node_fields[0]=%lf\n", d->comm_rank, node_fields[0]); */ /* REDISTRIBUTE COMPUTED FAR FIELDS AND POTENTIALS */ if (fields != NULL) { fcs_float all_fields[total_particles*3]; for (fcs_int pid=0; pid < total_particles; pid++) { all_fields[3*pid] = 0.0; all_fields[3*pid+1] = 0.0; all_fields[3*pid+2] = 0.0; } /* Combine all fields on master */ MPI_Reduce(node_fields, all_fields, total_particles*3, FCS_MPI_FLOAT, MPI_SUM, 0, d->comm); /* if (d->comm_rank == 0) */ /* printf("%d: all_fields[0]=%lf\n", d->comm_rank, all_fields[0]); */ /* Scatter the fields to the task that holds the particle */ MPI_Scatterv(all_fields, node_particles3, displs3, FCS_MPI_FLOAT, fields, num_particles*3, FCS_MPI_FLOAT, 0, d->comm); } if (potentials != NULL) { fcs_float all_potentials[total_particles]; for (fcs_int pid=0; pid < total_particles; pid++) all_potentials[pid] = 0.0; /* Combine all potentials on master */ MPI_Reduce(node_potentials, all_potentials, total_particles, FCS_MPI_FLOAT, MPI_SUM, 0, d->comm); /* Scatter the potentials to the task that holds the particle */ MPI_Scatterv(all_potentials, node_particles, displs, FCS_MPI_FLOAT, potentials, num_particles, FCS_MPI_FLOAT, 0, d->comm); /* subtract self potential */ FCS_INFO(fprintf(stderr, " subtracting self potential...\n")); for (fcs_int i=0; i < num_particles; i++) { potentials[i] -= charges[i] * M_2_SQRTPI * d->alpha; } } free(all_positions); free(all_charges); free(node_fields); free(node_potentials); /* now each task should have its far field components */ FCS_INFO(fprintf(stderr, "ewald_compute_kspace finished.\n")); }
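/*
 * For reference, the k-space formulas implemented by ewald_compute_kspace
 * above, transcribed from the code's cos/sin expansions (equation numbers
 * follow the Deserno & Holm (1998) citations already in the comments):
 *
 *   eq. (8)  structure factor:  rhohat(k) = sum_j q_j * exp(-i k.r_j),
 *            accumulated as rhohat_re =  sum_j q_j cos(k.r_j),
 *                           rhohat_im = -sum_j q_j sin(k.r_j)
 *   eq. (9)  potential: phi(r_i) += G(k) * Re[ rhohat(k) * exp(+i k.r_i) ]
 *                                 = G(k) * (rhohat_re cos(k.r_i) - rhohat_im sin(k.r_i))
 *   eq. (15) field:     E(r_i)   += k_vec * G(k) * Im[ rhohat(k) * exp(+i k.r_i) ]
 *                                 = k_vec * G(k) * (rhohat_re sin(k.r_i) + rhohat_im cos(k.r_i))
 *
 * with G(k) the tabulated influence function d->G.
 */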
/**
 * If A[n*n], B[n*1], C[n*1] are matrices and AC = B, then C = A^-1*B.
 * Get as input one augmented matrix [A|B] of size [n*(n+1)],
 * scatter the matrix between processes,
 * collect B in each process,
 * Allgather B in each process,
 * do the multiplication.
 **/
int main(int argc, char *argv[])
{
    int my_rank;
    int size ;
    int tag;
    int source;
    int dest ;
    MPI_Status status;

    MPI_Init(&argc,&argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD , &my_rank);

    int input = 0 ;
    int A_r = 0, A_c = 0, B_r = 0, B_c = 0;
    int *A,*B,*C;
    int N ;

    if(my_rank==0){
        printf("Welcome to vector Matrix multiplication program! Allgather\n");
        FILE *file= fopen("test.txt","r+"); //read from file
        if(file!=NULL){
            fscanf(file,"%d",&N);
            A_r = N ;A_c = N ;
            B_r = N ;B_c = 1 ;
            // check that the matrix dimension is divisible by the # of processes
            if (A_c%size != 0) {
                printf("# of MPI tasks must divide the matrix dimension. Quitting...\n");
                MPI_Abort(MPI_COMM_WORLD, 0);
                exit(1);
            }
            // check that A is [n*n] and B is [n*1]
            if(A_c==B_r && A_c==A_r && B_c == 1){
                arr_alloc_file_input(file,&A,N,(N+1));
                arr_alloc(&C,A_r,B_c);
            }else{
                printf("not valid matrix dimensions");
                MPI_Abort(MPI_COMM_WORLD, 0);
                exit(1);
            }
            fclose(file);
        }else{printf("unable to open file");}
        printf("------ A ---------\n");
        printMatrix(A , N ,N+1);
        //scanf("%d",&N);
    }

    /* Bcast dimensions */
    MPI_Bcast (&A_c, 1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast (&A_r, 1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast (&B_c, 1, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Bcast (&B_r, 1, MPI_INT, 0, MPI_COMM_WORLD);

    // allocations
    if (my_rank!=0){arr_alloc(&A,A_r,(A_r+1));}
    // all processes allocate B to make Allgather on it
    arr_alloc(&B,B_r,B_c);

    // n is the dimension
    int n = A_r ;

    /**
     * Scatter A into smallA and collect smallB from smallA
     *
     * smallA.size = nRows*(n+1)
     * nRows = n/size ;
     *
     * smallB.size = n/size ;
     **/
    //allocate
    int *smallA , *smallB;
    int nRows = n/size ;
    int smallA_size = nRows*(n+1);
    int smallB_size = n/size ;
    arr_alloc(&smallA, smallA_size, 1);
    arr_alloc(&smallB, smallB_size, 1);

    //Scatter A
    MPI_Scatter(A,smallA_size,MPI_INT,smallA,smallA_size,MPI_INT,0,MPI_COMM_WORLD);

    // collect smallB in each process
    // index : smallB index in smallA
    int j , index=n ;
    for ( j = 0; j < nRows; ++j)
    {
        smallB[j] = smallA[index];
        index+= (n+1) ;
    }
    // printf("\n---------P%d-----smallA-------\n",my_rank);
    // printMatrix(smallA, 1, smallA_size);
    // printf("\n---------P%d-----smallB-------\n",my_rank);
    // printMatrix(smallB, 1, smallB_size);

    /**
     * make each process Allgather its smallB, then all processes have B
     * prepare matrix B to Allgather the smallB
     *
     * prepare matrix smallC to store the result
     * smallC.size = smallB.size = n/size ;
     **/
    // allocation
    int *smallC ,smallC_size=n/size;
    arr_alloc(&smallC, smallC_size, 1);
    arr_init(smallC, smallC_size, 1, 0);

    MPI_Allgather( smallB , smallB_size,MPI_INT,B,smallB_size,MPI_INT,MPI_COMM_WORLD);

    if (my_rank==0)
    {
        printf("\n---------B after Allgather-------\n");
        printMatrix(B, 1, n);
    }

    for ( j = 0; j < nRows; ++j) // for each row in smallA
    {
        int sum = 0 ;
        int j2 = 0 ;
        for ( j2 = 0; j2 < n; ++j2)
        {
            sum+=(smallA[j2+(j*(n+1))] * B[j2]);
            /*
                smallA[j2+(j*(n+1))]
                j2 -> go in column
                (j*(n+1)) -> go in row and skip merged B
            */
            //printf("P%d: [%d - %d]\n",my_rank,smallA[j2+(j*(n+1))], B[j2] );
        }
        smallC[j] = sum ;
    }

    // collect smallC
    MPI_Gather(smallC , smallC_size , MPI_INT , C , smallC_size , MPI_INT , 0 , MPI_COMM_WORLD);

    if (my_rank==0)
    {
        printf("------ Final C -------\n");
        printMatrix( C, A_r,B_c);
    }

    MPI_Finalize();
    return 0;
}
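/* arr_alloc, arr_init, arr_alloc_file_input and printMatrix are helpers
 * defined outside this excerpt. A minimal sketch of two of them, consistent
 * with how they are called above (hypothetical implementations; flat
 * row-major int storage is assumed): */
#include <stdio.h>
#include <stdlib.h>

void arr_alloc(int **arr, int rows, int cols)
{
    *arr = (int *) malloc(rows * cols * sizeof(int)); /* one flat block */
}

void printMatrix(int *arr, int rows, int cols)
{
    for (int i = 0; i < rows; i++) {
        for (int j = 0; j < cols; j++)
            printf("%d ", arr[i * cols + j]);
        printf("\n");
    }
}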
PetscErrorCode ConvertMatToMatrix(MPI_Comm comm, Mat A,Mat AT,matrix **B) { matrix *M; int i,j,col; int row_indx; int len,pe,local_indx,start_indx; int *mapping; PetscErrorCode ierr; const int *cols; const double *vals; int n,mnl,nnl,nz,rstart,rend; PetscMPIInt size,rank; struct compressed_lines *rows; PetscFunctionBegin; ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); ierr = MatGetSize(A,&n,&n);CHKERRQ(ierr); ierr = MatGetLocalSize(A,&mnl,&nnl);CHKERRQ(ierr); /* not sure why a barrier is required. commenting out ierr = MPI_Barrier(comm);CHKERRQ(ierr); */ M = new_matrix((SPAI_Comm)comm); M->n = n; M->bs = 1; M->max_block_size = 1; M->mnls = (int*)malloc(sizeof(int)*size); M->start_indices = (int*)malloc(sizeof(int)*size); M->pe = (int*)malloc(sizeof(int)*n); M->block_sizes = (int*)malloc(sizeof(int)*n); for (i=0; i<n; i++) M->block_sizes[i] = 1; ierr = MPI_Allgather(&mnl,1,MPI_INT,M->mnls,1,MPI_INT,comm);CHKERRQ(ierr); M->start_indices[0] = 0; for (i=1; i<size; i++) M->start_indices[i] = M->start_indices[i-1] + M->mnls[i-1]; M->mnl = M->mnls[M->myid]; M->my_start_index = M->start_indices[M->myid]; for (i=0; i<size; i++) { start_indx = M->start_indices[i]; for (j=0; j<M->mnls[i]; j++) M->pe[start_indx+j] = i; } if (AT) { M->lines = new_compressed_lines(M->mnls[rank],1);CHKERRQ(ierr); } else { M->lines = new_compressed_lines(M->mnls[rank],0);CHKERRQ(ierr); } rows = M->lines; /* Determine the mapping from global indices to pointers */ ierr = PetscMalloc1(M->n,&mapping);CHKERRQ(ierr); pe = 0; local_indx = 0; for (i=0; i<M->n; i++) { if (local_indx >= M->mnls[pe]) { pe++; local_indx = 0; } mapping[i] = local_indx + M->start_indices[pe]; local_indx++; } /*********************************************************/ /************** Set up the row structure *****************/ /*********************************************************/ ierr = MatGetOwnershipRange(A,&rstart,&rend);CHKERRQ(ierr); for (i=rstart; i<rend; i++) { row_indx = i - rstart; ierr = MatGetRow(A,i,&nz,&cols,&vals);CHKERRQ(ierr); /* allocate buffers */ rows->ptrs[row_indx] = (int*)malloc(nz*sizeof(int)); rows->A[row_indx] = (double*)malloc(nz*sizeof(double)); /* copy the matrix */ for (j=0; j<nz; j++) { col = cols[j]; len = rows->len[row_indx]++; rows->ptrs[row_indx][len] = mapping[col]; rows->A[row_indx][len] = vals[j]; } rows->slen[row_indx] = rows->len[row_indx]; ierr = MatRestoreRow(A,i,&nz,&cols,&vals);CHKERRQ(ierr); } /************************************************************/ /************** Set up the column structure *****************/ /*********************************************************/ if (AT) { for (i=rstart; i<rend; i++) { row_indx = i - rstart; ierr = MatGetRow(AT,i,&nz,&cols,&vals);CHKERRQ(ierr); /* allocate buffers */ rows->rptrs[row_indx] = (int*)malloc(nz*sizeof(int)); /* copy the matrix (i.e., the structure) */ for (j=0; j<nz; j++) { col = cols[j]; len = rows->rlen[row_indx]++; rows->rptrs[row_indx][len] = mapping[col]; } ierr = MatRestoreRow(AT,i,&nz,&cols,&vals);CHKERRQ(ierr); } } ierr = PetscFree(mapping);CHKERRQ(ierr); order_pointers(M); M->maxnz = calc_maxnz(M); *B = M; PetscFunctionReturn(0); }
int main(int argc, char **argv) {
  const int MAX_ITER = 20;
  const double TOL = 1e-12;

  int rank;
  int size;
  int P = 8; // number of blocks to update P <= size
  /* -----------------------------------
     mode controls the selection schemes,
       mode =0, fixed P
       mode =1, dynamic update P
     ----------------------------------*/
  int mode=1;
  // number of processors used to update each time
  double lambda = 0.1;

  srand (time(NULL));
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank); // Determine current running process
  MPI_Comm_size(MPI_COMM_WORLD, &size); // Total number of processes

  // data directory (you need to change the path to your own data directory)
  char* dataCenterDir = "../Data/Gaussian";
  char* big_dir;
  if(argc==2)
    big_dir = argv[1];
  else
    big_dir = "big1";

  /* Read in local data */
  FILE *f, *test;
  int m, n, j;
  int row, col;
  double entry, startTime, endTime;
  double total_start_time, total_end_time;
  /*
   * Subsystem n will look for files called An.dat and bn.dat
   * in the current directory; these are its local data and do not need to be
   * visible to any other processes. Note that
   * m and n here refer to the dimensions of the *local* coefficient matrix.
   */

  /* ------------ Read in A ------------*/
  if(rank ==0){
    printf("=============================\n");
    printf("| Start to load data! |\n");
    printf("=============================\n");
  }
  char s[100];
  sprintf(s, "%s/%s/A%d.dat",dataCenterDir,big_dir, rank + 1);
  printf("[%d] reading %s\n", rank, s);

  f = fopen(s, "r");
  if (f == NULL) {
    printf("[%d] ERROR: %s does not exist, exiting.\n", rank, s);
    exit(EXIT_FAILURE);
  }
  mm_read_mtx_array_size(f, &m, &n);
  gsl_matrix *A = gsl_matrix_calloc(m, n);
  for (int i = 0; i < m*n; i++) {
    row = i % m;
    col = floor(i/m);
    fscanf(f, "%lf", &entry);
    gsl_matrix_set(A, row, col, entry);
  }
  fclose(f);

  /* ------------ Read in b -------------*/
  sprintf(s, "%s/%s/b.dat", dataCenterDir, big_dir);
  printf("[%d] reading %s\n", rank, s);

  f = fopen(s, "r");
  if (f == NULL) {
    printf("[%d] ERROR: %s does not exist, exiting.\n", rank, s);
    exit(EXIT_FAILURE);
  }
  mm_read_mtx_array_size(f, &m, &n);
  gsl_vector *b = gsl_vector_calloc(m);
  for (int i = 0; i < m; i++) {
    fscanf(f, "%lf", &entry);
    gsl_vector_set(b, i, entry);
  }
  fclose(f);

  /* ------------ Read in xs ------------*/
  sprintf(s, "%s/%s/xs%d.dat", dataCenterDir, big_dir, rank + 1);
  printf("[%d] reading %s\n", rank, s);

  f = fopen(s, "r");
  if (f == NULL) {
    printf("[%d] ERROR: %s does not exist, exiting.\n", rank, s);
    exit(EXIT_FAILURE);
  }
  mm_read_mtx_array_size(f, &m, &n);
  gsl_vector *xs = gsl_vector_calloc(m);
  for (int i = 0; i < m; i++) {
    fscanf(f, "%lf", &entry);
    gsl_vector_set(xs, i, entry);
  }
  fclose(f);

  m = A->size1;
  n = A->size2;
  MPI_Barrier(MPI_COMM_WORLD);

  /*----------------------------------------
   * These are all variables related to GRock
   *----------------------------------------*/
  struct value table[size];
  gsl_vector *x       = gsl_vector_calloc(n);
  gsl_vector *As      = gsl_vector_calloc(n);
  gsl_vector *invAs   = gsl_vector_calloc(n);
  gsl_vector *local_b = gsl_vector_calloc(m);
  gsl_vector *beta    = gsl_vector_calloc(n);
  gsl_vector *tmp     = gsl_vector_calloc(n);
  gsl_vector *d       = gsl_vector_calloc(n);
  gsl_vector *absd    = gsl_vector_calloc(n);
  gsl_vector *oldx    = gsl_vector_calloc(n);
  gsl_vector *tmpx    = gsl_vector_calloc(n);
  gsl_vector *z       = gsl_vector_calloc(m);
  gsl_vector *tmpz    = gsl_vector_calloc(m);
  gsl_vector *Ax      = gsl_vector_calloc(m);
  gsl_vector *Atmpx   = gsl_vector_calloc(m);
  gsl_vector *xdiff   = gsl_vector_calloc(n);
  gsl_permutation *idx = gsl_permutation_calloc(n);
  double send[1];
  double recv[1];
  double err;
  int num_upd = (int)(n*0.08);
  double sigma = 0.01;

  double xs_local_nrm[1], xs_nrm[1];
  double local_old_obj, global_old_obj, local_new_obj, global_new_obj;

  //calculate the 2 norm of xs
  xs_local_nrm[0] = gsl_blas_dnrm2(xs);
  xs_local_nrm[0] *=xs_local_nrm[0];
  MPI_Allreduce(xs_local_nrm, xs_nrm, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  xs_nrm[0] = sqrt(xs_nrm[0]);

  // evaluate the two norm of the columns of A
  for(j=0;j<n;j++){
    gsl_vector_view column = gsl_matrix_column(A, j);
    double d;
    d = gsl_blas_dnrm2(&column.vector);
    gsl_vector_set(As, j, d*d);
    gsl_vector_set(invAs, j, 1./(d*d));
  }

  if (rank == 0) {
    printf("=============================\n");
    printf("|GRock start to solve Lasso!|\n");
    printf("|---------------------------|\n");
    printf("|lambda=%1.2f, m=%d, n=%d |\n", lambda, m, n*size);
    if(mode==1) printf("| Mode: dynamic update P. |\n");
    else printf("| Mode: fixed update P |\n");
    printf("=============================\n");
    printf("%3s %8s %8s %5s\n", "iter", "rel_err", "obj", "P");
    startTime = MPI_Wtime();
    sprintf(s, "results/test%d.m", size);
    test = fopen(s, "w");
    fprintf(test,"res = [ \n");
  }

  /* Main BCD loop */
  total_start_time = MPI_Wtime();
  int iter = 0;
  while (iter < MAX_ITER) {
    startTime = MPI_Wtime();

    /*---------- restore the old x ------------*/
    gsl_vector_memcpy(oldx, x);

    /*------- calculate local_b = b - sum_{j \neq i} Aj*xj --------- */
    gsl_blas_dgemv(CblasNoTrans, 1, A, x, 0, Ax); // Ax = A * x
    MPI_Allreduce(Ax->data, z->data, m, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    gsl_vector_sub(z, b); // z = Ax - b
    gsl_vector_memcpy(local_b, Ax);
    gsl_vector_sub(local_b, z);

    /* -------calculate beta ------------------*/
    gsl_blas_dgemv(CblasTrans, -1, A, z, 0, beta); // beta = A'(b - Ax) + ||A.s||^2 * xs
    gsl_vector_memcpy(tmp, As);
    pointwise(tmp, x, n);
    gsl_vector_add(beta, tmp);
    shrink(beta, lambda);

    // x = 1/||A.s||^2 * shrink(beta, lambda)
    gsl_vector_memcpy(x, beta);
    pointwise(x, invAs, n);

    /* ------calculate proposed decrease -------- */
    gsl_vector_memcpy(d,x);
    gsl_vector_sub(d, oldx);

    if(mode ==1){
      gsl_vector_memcpy(absd, d);
      abs_vector(absd, n);
      // sort the local array d
      gsl_vector_scale(absd, -1.0);
      gsl_sort_vector_index(idx, absd);
      // printf("|d(0)| = %lf, |d(1)| = %lf \n", gsl_vector_get(absd,0), gsl_vector_get(absd, 3));

      // calculate current objective value;
      local_old_obj = objective(oldx, lambda, z, size);
      MPI_Allreduce(&local_old_obj, &global_old_obj, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

      num_upd = fmin(num_upd+1, (int)(0.1*n));
      gsl_vector_memcpy(tmpx, oldx);
      int upd_idx;
      double local_delta = 0, delta=0.0;
      for(int i=0; i<num_upd; i++){
        upd_idx = gsl_permutation_get(idx, i);
        // printf("%d\n", upd_idx);
        gsl_vector_set(tmpx, upd_idx, gsl_vector_get(x, upd_idx));
        local_delta += gsl_vector_get(d, upd_idx) * gsl_vector_get(d, upd_idx);
      }
      MPI_Allreduce(&local_delta, &delta, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

      gsl_blas_dgemv(CblasNoTrans, 1, A, tmpx, 0, Atmpx); // Atmpx = A * tmpx
      MPI_Allreduce(Atmpx->data, tmpz->data, m, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
      gsl_vector_sub(tmpz, b); // tmpz = A*tmpx - b

      local_new_obj = objective(tmpx, lambda, tmpz, size);
      MPI_Allreduce(&local_new_obj, &global_new_obj, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

      while(global_new_obj - global_old_obj> -sigma * delta){
        num_upd = fmax(num_upd-1, 1);
        local_delta = 0; /* re-accumulate the proposed decrease for the smaller update set */
        for(int i=0; i<num_upd; i++){
          upd_idx = gsl_permutation_get(idx, i);
          gsl_vector_set(tmpx, upd_idx, gsl_vector_get(x, upd_idx));
          local_delta += gsl_vector_get(d, upd_idx) * gsl_vector_get(d, upd_idx);
        }
        MPI_Allreduce(&local_delta, &delta, 1, MPI_DOUBLE, MPI_SUM,
                      MPI_COMM_WORLD);
        gsl_blas_dgemv(CblasNoTrans, 1, A, tmpx, 0, Atmpx); // Atmpx = A * tmpx
        MPI_Allreduce(Atmpx->data, tmpz->data, m, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
        gsl_vector_sub(tmpz, b); // tmpz = A*tmpx - b
        local_new_obj = objective(tmpx, lambda, tmpz, size);
        MPI_Allreduce(&local_new_obj, &global_new_obj, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
        if(num_upd==1)
          break;
      }

      gsl_vector_memcpy(x, tmpx);
    }

    if(mode==0){
      CBLAS_INDEX_t id = gsl_blas_idamax(d);
      double *store = (double*)calloc(size, sizeof(double));
      double foo[1];
      foo[0] = gsl_vector_get(d,id);
      MPI_Allgather(foo, 1, MPI_DOUBLE, store, 1, MPI_DOUBLE, MPI_COMM_WORLD);
      for(int i=0;i<size;i++){
        table[i].ID = i;
        table[i].data = fabs(store[i]);
      }
      free(store); /* allocated fresh on every iteration */

      // quick sort to decide which block to update
      qsort((void *) table, size, sizeof(struct value), (compfn)compare );
      gsl_vector_memcpy(x, oldx);

      if(size>P){
        for(int i=0;i<P;i++){
          if(rank == table[i].ID)
            gsl_vector_set(x, id, gsl_vector_get(oldx, id) + gsl_vector_get(d, id));
        }
      }else
        gsl_vector_set(x, id, gsl_vector_get(oldx, id) + gsl_vector_get(d, id));

      local_new_obj = objective(x, lambda, z, size);
      MPI_Allreduce(&local_new_obj, &global_new_obj, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    }

    /*------------------------------
      calculate the relative error
      ------------------------------*/
    gsl_vector_memcpy(xdiff,xs);
    gsl_vector_sub(xdiff, x);
    err = gsl_blas_dnrm2(xdiff);
    send[0] = err*err;
    MPI_Allreduce(send, recv, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    recv[0] = sqrt(recv[0])/xs_nrm[0];

    endTime = MPI_Wtime();
    if(mode==1) P = num_upd*size;
    if (rank == 0) {
      if(iter%5 == 0)
        printf("%3d %10.2e %10.4f %3d\n", iter, recv[0], global_new_obj, P);
      fprintf(test, "%e \n",recv[0]);
    }

    /* termination check */
    if(recv[0] < TOL){
      break;
    }
    iter++;
  }
  total_end_time = MPI_Wtime();

  /* Have the master write out the results to disk */
  if (rank == 0) {
    printf("=============================\n");
    printf("| GRock solved Lasso! |\n");
    printf("|---------------------------|\n");
    printf("|Summary: |\n");
    printf("| # of iteration: %d |\n", iter);
    printf("| relative error: %4.2e|\n", recv[0]);
    printf("| objective value: %4.2f |\n", global_new_obj);
    printf("| time: %4.1es|\n", total_end_time - total_start_time);
    printf("=============================\n");

    fprintf(test,"] \n");
    fprintf(test,"semilogy(1:length(res),res); \n");
    fprintf(test,"xlabel('# of iteration'); ylabel('||x - xs||');\n");
    fclose(test);

    f = fopen("results/solution.dat", "w");
    fprintf(f,"x = [ \n");
    gsl_vector_fprintf(f, x, "%lf");
    fprintf(f,"] \n");
    fclose(f);
    endTime = MPI_Wtime();
  }

  MPI_Finalize(); /* Shut down the MPI execution environment */

  /* Clear memory */
  gsl_matrix_free(A);
  gsl_vector_free(b);
  gsl_vector_free(xs);
  gsl_vector_free(x);
  gsl_vector_free(z);
  gsl_vector_free(xdiff);
  gsl_vector_free(Ax);
  gsl_vector_free(As);
  gsl_vector_free(invAs);
  gsl_vector_free(tmp);
  gsl_vector_free(d);
  gsl_vector_free(tmpx);
  gsl_vector_free(oldx);
  gsl_vector_free(local_b);
  gsl_vector_free(beta);
  gsl_vector_free(tmpz);
  gsl_vector_free(absd);
  gsl_vector_free(Atmpx);
  gsl_permutation_free(idx);

  return 0;
}
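/* shrink(), objective(), pointwise() and abs_vector() are helpers defined
 * elsewhere in the GRock source and are not part of this excerpt. As a
 * reference point, a minimal sketch of the soft-thresholding operator that a
 * Lasso shrink step computes, shrink(v, lambda)_i = sign(v_i) * max(|v_i| -
 * lambda, 0), written against the in-place gsl_vector convention used above
 * (named shrink_sketch to avoid clashing with the real helper): */
#include <math.h>
#include <gsl/gsl_vector.h>

static void shrink_sketch(gsl_vector *v, double lambda)
{
  for (size_t i = 0; i < v->size; i++) {
    double vi  = gsl_vector_get(v, i);
    double mag = fabs(vi) - lambda; /* distance past the threshold */
    gsl_vector_set(v, i, mag > 0.0 ? (vi > 0.0 ? mag : -mag) : 0.0);
  }
}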