/* Stream a file to rank_send while simultaneously receiving a file from
 * rank_recv, chunk by chunk over MPI point-to-point messages on comm.
 * have_outgoing / have_incoming select which direction(s) are active.
 * When scr_crc_on_copy is set, a CRC32 is accumulated into *crc32_send /
 * *crc32_recv as the data flows through.  Returns SCR_SUCCESS on the
 * normal path; allocation failures return SCR_FAILURE (after scr_abort),
 * while open failures only call scr_abort.
 * NOTE(review): meta_recv is accepted but never used in this code path.
 * NOTE(review): if the recv-buffer allocation fails, buf_send is not freed
 * before the early return — harmless only if scr_abort terminates the job.
 */
static int scr_swap_files_copy(
  int have_outgoing, const char* file_send, scr_meta* meta_send, int rank_send, uLong* crc32_send,
  int have_incoming, const char* file_recv, scr_meta* meta_recv, int rank_recv, uLong* crc32_recv,
  MPI_Comm comm)
{
  int rc = SCR_SUCCESS;
  MPI_Request request[2];  /* [0] = incoming chunk, [1] = outgoing chunk */
  MPI_Status status[2];

  /* allocate MPI send buffer (page-aligned, one chunk of scr_mpi_buf_size) */
  char *buf_send = NULL;
  if (have_outgoing) {
    buf_send = (char*) scr_align_malloc(scr_mpi_buf_size, scr_page_size);
    if (buf_send == NULL) {
      scr_abort(-1, "Allocating memory: malloc(%ld) errno=%d %s @ %s:%d",
        scr_mpi_buf_size, errno, strerror(errno), __FILE__, __LINE__
      );
      return SCR_FAILURE;
    }
  }

  /* allocate MPI recv buffer */
  char *buf_recv = NULL;
  if (have_incoming) {
    buf_recv = (char*) scr_align_malloc(scr_mpi_buf_size, scr_page_size);
    if (buf_recv == NULL) {
      scr_abort(-1, "Allocating memory: malloc(%ld) errno=%d %s @ %s:%d",
        scr_mpi_buf_size, errno, strerror(errno), __FILE__, __LINE__
      );
      return SCR_FAILURE;
    }
  }

  /* open the file to send: read-only mode */
  int fd_send = -1;
  if (have_outgoing) {
    fd_send = scr_open(file_send, O_RDONLY);
    if (fd_send < 0) {
      scr_abort(-1, "Opening file for send: scr_open(%s, O_RDONLY) errno=%d %s @ %s:%d",
        file_send, errno, strerror(errno), __FILE__, __LINE__
      );
    }
  }

  /* open the file to recv: truncate, write-only mode */
  int fd_recv = -1;
  if (have_incoming) {
    mode_t mode_file = scr_getmode(1, 1, 0);
    fd_recv = scr_open(file_recv, O_WRONLY | O_CREAT | O_TRUNC, mode_file);
    if (fd_recv < 0) {
      scr_abort(-1, "Opening file for recv: scr_open(%s, O_WRONLY | O_CREAT | O_TRUNC, ...) errno=%d %s @ %s:%d",
        file_recv, errno, strerror(errno), __FILE__, __LINE__
      );
    }
  }

  /* exchange file chunks; a short chunk (count < scr_mpi_buf_size) marks
   * end-of-file for the corresponding direction */
  int nread, nwrite;
  int sending = 0;
  if (have_outgoing) {
    sending = 1;
  }
  int receiving = 0;
  if (have_incoming) {
    receiving = 1;
  }
  while (sending || receiving) {
    /* if we are still receiving a file, post a receive */
    if (receiving) {
      MPI_Irecv(buf_recv, scr_mpi_buf_size, MPI_BYTE, rank_recv, 0, comm, &request[0]);
    }

    /* if we are still sending a file, read a chunk, send it, and wait */
    if (sending) {
      nread = scr_read(file_send, fd_send, buf_send, scr_mpi_buf_size);
      if (scr_crc_on_copy && nread > 0) {
        *crc32_send = crc32(*crc32_send, (const Bytef*) buf_send, (uInt) nread);
      }
      /* a read error is turned into a zero-length send so the peer still
       * receives a (short, terminating) message */
      if (nread < 0) {
        nread = 0;
      }
      MPI_Isend(buf_send, nread, MPI_BYTE, rank_send, 0, comm, &request[1]);
      MPI_Wait(&request[1], &status[1]);
      if (nread < scr_mpi_buf_size) {
        sending = 0;
      }
    }

    /* if we are still receiving a file,
     * wait on our receive to complete and write the data */
    if (receiving) {
      MPI_Wait(&request[0], &status[0]);
      /* actual received byte count determines how much to write */
      MPI_Get_count(&status[0], MPI_BYTE, &nwrite);
      if (scr_crc_on_copy && nwrite > 0) {
        *crc32_recv = crc32(*crc32_recv, (const Bytef*) buf_recv, (uInt) nwrite);
      }
      scr_write(file_recv, fd_recv, buf_recv, nwrite);
      if (nwrite < scr_mpi_buf_size) {
        receiving = 0;
      }
    }
  }

  /* close the files */
  if (have_outgoing) {
    scr_close(file_send, fd_send);
  }
  if (have_incoming) {
    scr_close(file_recv, fd_recv);
  }

  /* set crc field on our file if it hasn't been set already */
  if (scr_crc_on_copy && have_outgoing) {
    uLong meta_send_crc;
    if (scr_meta_get_crc32(meta_send, &meta_send_crc) != SCR_SUCCESS) {
      scr_meta_set_crc32(meta_send, *crc32_send);
    } else {
      /* TODO: we could check that the crc on the sent file matches and take some action if not */
    }
  }

  /* free the MPI buffers */
  scr_align_free(&buf_recv);
  scr_align_free(&buf_send);

  return rc;
}
/* Work-distribution thread for the "parallel for" Tcl command.
 * Runs on node zero: answers index requests from worker nodes, handing out
 * one loop index per request, then sends -1 to every other node to signal
 * completion.  Declared extern "C" so it can be used as a thread entry
 * point.  Always returns NULL.
 */
extern "C" void *vmd_mpi_parallel_for_scheduler(void *voidparms) {
  parallel_for_parms *parfor = (parallel_for_parms *) voidparms;

  // Run the for loop management code on node zero.
  // Do the work on all the other nodes...
#if defined(VMDTHREADS)
  // threaded build: indices come from the shared iterator (node zero may be
  // consuming tiles concurrently from the main thread)
  int i;
  wkf_tasktile_t curtile;
  while (wkf_shared_iterator_next_tile(&parfor->iter, 1, &curtile) != WKF_SCHED_DONE) {
    i = curtile.start;
#else
  // non-threaded build: walk the index range directly
  int i;
  for (i=parfor->loop.start; i<parfor->loop.end; i++) {
#endif
    int reqnode;
    MPI_Status rcvstat;
    // wait for any worker to ask for work, then send it the next index
    MPI_Recv(&reqnode, 1, MPI_INT, MPI_ANY_SOURCE, VMD_MPI_TAG_FOR_REQUEST,
             MPI_COMM_WORLD, &rcvstat);
    MPI_Send(&i, 1, MPI_INT, reqnode, VMD_MPI_TAG_FOR_REQUEST,
             MPI_COMM_WORLD);
  }

  // tell all nodes we're done with all of the work
  int node;
  for (node=1; node<parfor->numnodes; node++) {
    int reqnode;
    MPI_Status rcvstat;
    MPI_Recv(&reqnode, 1, MPI_INT, MPI_ANY_SOURCE, VMD_MPI_TAG_FOR_REQUEST,
             MPI_COMM_WORLD, &rcvstat);
    i=-1; // indicate that the for loop is completed
    MPI_Send(&i, 1, MPI_INT, reqnode, VMD_MPI_TAG_FOR_REQUEST,
             MPI_COMM_WORLD);
  }

  return NULL;
}
#endif

/* Tcl command handler for "parallel ...": node queries (nodename/noderank/
 * nodecount), barrier, a dynamically-scheduled parallel for, allgather, and
 * allreduce implemented over MPI.  All collective subcommands must be
 * entered by every node.
 * NOTE(review): this definition is truncated in this chunk — its closing
 * brace lies beyond the visible source. */
int text_cmd_parallel(ClientData cd, Tcl_Interp *interp, int argc, const char *argv[]) {
  VMDApp *app = (VMDApp *)cd;

  // no subcommand: print usage and fail
  if(argc<2) {
    Tcl_SetResult(interp,
      (char *)
      "Parallel job query commands:\n"
      " parallel nodename\n"
      " parallel noderank\n"
      " parallel nodecount\n"
      "Parallel collective operations (all nodes MUST participate):\n"
      " parallel allgather <object>\n"
      " parallel allreduce <tcl reduction proc> <object>\n"
      " parallel barrier\n"
      " parallel for <startcount> <endcount> <tcl callback proc> <user data>",
      TCL_STATIC);
    return TCL_ERROR;
  }

  // XXX hack to make Swift/T cooperate with VMD when using VMD's MPI
  // communicator
  if (!strcmp(argv[1], "swift_clone_communicator")) {
    swift_mpi_init(interp);
    return TCL_OK;
  }

  // return the MPI node name
  if (!strcmp(argv[1], "nodename")) {
    Tcl_Obj *tcl_result = Tcl_NewListObj(0, NULL);
    Tcl_ListObjAppendElement(interp, tcl_result, Tcl_NewStringObj(app->par_name(), strlen(app->par_name())));
    Tcl_SetObjResult(interp, tcl_result);
    return TCL_OK;
  }

  // return the MPI node rank
  if (!strcmp(argv[1], "noderank")) {
    Tcl_Obj *tcl_result = Tcl_NewListObj(0, NULL);
    Tcl_ListObjAppendElement(interp, tcl_result, Tcl_NewIntObj(app->par_rank()));
    Tcl_SetObjResult(interp, tcl_result);
    return TCL_OK;
  }

  // return the MPI node count
  if (!strcmp(argv[1], "nodecount")) {
    Tcl_Obj *tcl_result = Tcl_NewListObj(0, NULL);
    Tcl_ListObjAppendElement(interp, tcl_result, Tcl_NewIntObj(app->par_size()));
    Tcl_SetObjResult(interp, tcl_result);
    return TCL_OK;
  }

  // execute an MPI barrier
  if(!strupncmp(argv[1], "barrier", CMDLEN) && argc==2) {
    app->par_barrier();
    return TCL_OK;
  }

  // Execute a parallel for loop across all nodes
  //
  //  parallel for <startcount> <endcount> <callback proc> <user data>",
  //
  if(!strupncmp(argv[1], "for", CMDLEN)) {
    int isok = (argc == 6);
    int N = app->par_size();
    int start, end;
    if (Tcl_GetInt(interp, argv[2], &start) != TCL_OK ||
        Tcl_GetInt(interp, argv[3], &end) != TCL_OK) {
      isok = 0;
    }

    //
    // If there's only one node, short-circuit the parallel for
    //
    if (N == 1) {
      if (!isok) {
        Tcl_SetResult(interp, (char *) "invalid parallel for, missing parameter", TCL_STATIC);
        return TCL_ERROR;
      }

      // run for loop on one node... (inclusive of 'end')
      int i;
      for (i=start; i<=end; i++) {
        char istr[128];
        sprintf(istr, "%d", i);
        if (Tcl_VarEval(interp, argv[4], " ", istr, " {",
                        argv[5], "} ", NULL) != TCL_OK) {
          Tcl_SetResult(interp, (char *) "error occured during parallel for", TCL_STATIC);
        }
      }

      return TCL_OK;
    }

#if defined(VMDMPI)
    int allok = 0;

    // Check all node result codes before we continue with the reduction
    MPI_Allreduce(&isok, &allok, 1, MPI_INT, MPI_LAND, MPI_COMM_WORLD);

    // XXX we may want to verify that all nodes are going to call the same
    // reduction proc here before continuing further.

    if (!allok) {
      Tcl_SetResult(interp, (char *) "invalid parallel for, missing parameter on one or more nodes", TCL_STATIC);
      return TCL_ERROR;
    }

    // Run the for loop management code on node zero.
    // Do the work on all the other nodes...
    int i;
    if (app->par_rank() == 0) {
      // use multithreaded code path
      parallel_for_parms parfor;
      memset(&parfor, 0, sizeof(parfor));
      parfor.numnodes = N;
      parfor.loop.start=start;
      parfor.loop.end=end+1;  // iterator range is exclusive of 'end'
      wkf_shared_iterator_init(&parfor.iter);
      wkf_shared_iterator_set(&parfor.iter, &parfor.loop);

#if defined(VMDTHREADS)
      // run the MPI scheduler in a new child thread
      wkf_thread_t pft;
      wkf_thread_create(&pft, vmd_mpi_parallel_for_scheduler, &parfor);

      // run the Tcl in the main thread (node zero does work too)
      wkf_tasktile_t curtile;
      while (wkf_shared_iterator_next_tile(&parfor.iter, 1, &curtile) != WKF_SCHED_DONE) {
        i = curtile.start;
        char istr[128];
        sprintf(istr, "%d", i);
        if (Tcl_VarEval(interp, argv[4], " ", istr, " {",
                        argv[5], "} ", NULL) != TCL_OK) {
          Tcl_SetResult(interp, (char *) "error occured during parallel for", TCL_STATIC);
        }
      }

      // join up with the MPI scheduler thread
      wkf_thread_join(pft, NULL);
#else
      // if no threads, node zero only runs the scheduler and doesn't do work
      vmd_mpi_parallel_for_scheduler(&parfor);
#endif
      wkf_shared_iterator_destroy(&parfor.iter);
    } else {
      // worker nodes: request indices from node zero until told to stop (-1)
      char istr[128];
      int done=0;
      int mynode=app->par_rank();
      while (!done) {
        MPI_Send(&mynode, 1, MPI_INT, 0, VMD_MPI_TAG_FOR_REQUEST,
                 MPI_COMM_WORLD);
        MPI_Status rcvstat;
        MPI_Recv(&i, 1, MPI_INT, MPI_ANY_SOURCE, VMD_MPI_TAG_FOR_REQUEST,
                 MPI_COMM_WORLD, &rcvstat);
        if (i == -1) {
          done = 1;
        } else {
          sprintf(istr, "%d", i);
          if (Tcl_VarEval(interp, argv[4], " ", istr, " {",
                          argv[5], "} ", NULL) != TCL_OK) {
            Tcl_SetResult(interp, (char *) "error occured during parallel for", TCL_STATIC);
          }
        }
      }
    }
#endif

    return TCL_OK;
  }

  // Execute an allgather producing a Tcl list of the per-node contributions
  //
  //  parallel allgather <object>
  //
  if(!strupncmp(argv[1], "allgather", CMDLEN)) {
    int isok = (argc == 3);
#if defined(VMDMPI)
    int allok = 0;
    int i;

    // Check all node result codes before we continue with the gather
    MPI_Allreduce(&isok, &allok, 1, MPI_INT, MPI_LAND, MPI_COMM_WORLD);

    if (!allok) {
      Tcl_SetResult(interp, (char *) "invalid parallel gather, missing parameter on one or more nodes", TCL_STATIC);
      return TCL_ERROR;
    }

    // Collect parameter size data so we can allocate result buffers
    // before executing the gather; sizes include the NUL terminator
    int *szlist = new int[app->par_size()];
    szlist[app->par_rank()] = strlen(argv[2])+1;

#if defined(USE_MPI_IN_PLACE)
    // MPI >= 2.x implementations (e.g. NCSA/Cray Blue Waters)
    MPI_Allgather(MPI_IN_PLACE, 1, MPI_INT,
                  &szlist[0], 1, MPI_INT, MPI_COMM_WORLD);
#else
    // MPI 1.x
    MPI_Allgather(&szlist[app->par_rank()], 1, MPI_INT,
                  &szlist[0], 1, MPI_INT, MPI_COMM_WORLD);
#endif

    // compute per-node displacements and the total receive buffer size
    int totalsz = 0;
    int *displist = new int[app->par_size()];
    for (i=0; i<app->par_size(); i++) {
      displist[i]=totalsz;
      totalsz+=szlist[i];
    }

    char *recvbuf = new char[totalsz];
    memset(recvbuf, 0, totalsz);

    // Copy this node's data into the correct array position
    strcpy(&recvbuf[displist[app->par_rank()]], argv[2]);

    // Perform the parallel gather
#if defined(USE_MPI_IN_PLACE)
    // MPI >= 2.x implementations (e.g. NCSA/Cray Blue Waters)
    MPI_Allgatherv(MPI_IN_PLACE, szlist[app->par_rank()], MPI_BYTE,
                   &recvbuf[0], szlist, displist, MPI_BYTE, MPI_COMM_WORLD);
#else
    // MPI 1.x
    MPI_Allgatherv(&recvbuf[displist[app->par_rank()]], szlist[app->par_rank()], MPI_BYTE,
                   &recvbuf[0], szlist, displist, MPI_BYTE, MPI_COMM_WORLD);
#endif

    // Build Tcl result from the array of results
    // (length szlist[i]-1 strips the gathered NUL terminator)
    Tcl_Obj *tcl_result = Tcl_NewListObj(0, NULL);
    for (i=0; i<app->par_size(); i++) {
      Tcl_ListObjAppendElement(interp, tcl_result, Tcl_NewStringObj(&recvbuf[displist[i]], szlist[i]-1));
    }
    Tcl_SetObjResult(interp, tcl_result);

    delete [] recvbuf;
    delete [] displist;
    delete [] szlist;
    return TCL_OK;
#else
    // non-MPI build: the "gather" is just this node's own value
    if (!isok) {
      Tcl_SetResult(interp, (char *) "invalid parallel gather, missing parameter on one or more nodes", TCL_STATIC);
      return TCL_ERROR;
    }

    Tcl_Obj *tcl_result = Tcl_NewListObj(0, NULL);
    Tcl_ListObjAppendElement(interp, tcl_result, Tcl_NewStringObj(argv[2], strlen(argv[2])));
    Tcl_SetObjResult(interp, tcl_result);
    return TCL_OK;
#endif
  }

  //
  // Execute an All-Reduce across all of the nodes.
  // The user must provide a Tcl proc that performs the appropriate reduction
  // operation for a pair of data items, resulting in a single item.
  // Since the user may pass floating point data or perform reductions
  // that give very slightly different answers depending on the order of
  // operations, the architecture or the host, or whether reductions on
  // a given host are occuring on the CPU or on a heterogeneous accelerator
  // or GPU of some kind, we must ensure that all nodes get a bit-identical
  // result.  When heterogeneous accelerators are involved, we can really
  // only guarantee this by implementing the All-Reduce with a
  // Reduce-then-Broadcast approach, where the reduction collapses the
  // result down to node zero, which then does a broadcast to all peers.
  //
  //  parallel allreduce <tcl reduction proc> <object>
  //
  if(!strupncmp(argv[1], "allreduce", CMDLEN)) {
    int isok = (argc == 4);
    int N = app->par_size();

    //
    // If there's only one node, short-circuit the full parallel reduction
    //
    if (N == 1) {
      if (!isok) {
        Tcl_SetResult(interp, (char *) "invalid parallel reduction, missing parameter", TCL_STATIC);
        return TCL_ERROR;
      }

      // return our result, no other reduction is necessary
      Tcl_SetObjResult(interp, Tcl_NewStringObj(argv[3], strlen(argv[3])));
      return TCL_OK;
    }

#if 1 && defined(VMDMPI)
    //
    // All-Reduce implementation based on a ring reduction followed by a
    // broadcast from node zero.  This implementation gaurantees strict
    // ordering and will properly handle the case where one or more nodes
    // perform their reduction with slightly differing floating point
    // rounding than others (e.g. using GPUs, heterogeneous nodes, etc),
    // and it works with any number of nodes.  While NOT latency-optimal,
    // this implementation is close to bandwidth-optimal which is helpful
    // for workstation clusters on non-switched networks or networks with
    // switches that cannot operate in a fully non-blocking manner.
    //
    int allok = 0;

    // Check all node result codes before we continue with the reduction
    MPI_Allreduce(&isok, &allok, 1, MPI_INT, MPI_LAND, MPI_COMM_WORLD);

    // XXX we may want to verify that all nodes are going to call the same
    // reduction proc here before continuing further.

    if (!allok) {
      Tcl_SetResult(interp, (char *) "invalid parallel reduction, missing parameter on one or more nodes", TCL_STATIC);
      return TCL_ERROR;
    }

    // copy incoming data into initial "result" object
    // NOTE(review): length strlen(argv[3])+1 embeds the NUL in the Tcl
    // object here, unlike the single-node path above — confirm intended
    Tcl_Obj *resultobj = Tcl_NewStringObj((const char *) argv[3], strlen(argv[3])+1);

    // A ring-based all-reduce implementation which should be
    // close to bandwidth-optimal, at the cost of additional latency.
    int src=app->par_rank(); // src node is this node
    int Ldest = (N + src + 1) % N; // compute left peer
    int Rdest = (N + src - 1) % N; // compute right peer
    MPI_Status status;

    if (src != 0) {
      int recvsz = 0;

      // Post blocking receive for data size
      MPI_Recv(&recvsz, 1, MPI_INT, Ldest,
               VMD_MPI_TAG_ALLREDUCE_ARGLENGTH, MPI_COMM_WORLD, &status);

      // Allocate or resize receive buffer
      char * recvbuf = (char *) malloc(recvsz);

      // Post non-blocking receive for data
      MPI_Recv(recvbuf, recvsz, MPI_BYTE, Ldest,
               VMD_MPI_TAG_ALLREDUCE_PAYLOAD, MPI_COMM_WORLD, &status);

      // Perform reduction
      // Perform the reduction operation on our existing and incoming data.
      // We build a Tcl command string with the user-defined proc, this
      // node's previous resultand, and the incoming data, and evaluate it.
      if (Tcl_VarEval(interp, argv[2], " ",
                      Tcl_GetString(resultobj), " ",
                      recvbuf, NULL) != TCL_OK) {
        printf("Error occured during reduction!\n");
      }

      // Prep for next reduction step.  Set result object to result of
      // the latest communication/reduction phase.
      resultobj = Tcl_GetObjResult(interp);

      // Free the receive buffer
      free(recvbuf);
    }

    //
    // All nodes
    //
    char *sendbuf = Tcl_GetString(resultobj);
    int sendsz = strlen(sendbuf)+1;

    // Post blocking send for data size
    MPI_Send(&sendsz, 1, MPI_INT, Rdest,
             VMD_MPI_TAG_ALLREDUCE_ARGLENGTH, MPI_COMM_WORLD);

    // Post blocking send for data
    MPI_Send(sendbuf, sendsz, MPI_BYTE, Rdest,
             VMD_MPI_TAG_ALLREDUCE_PAYLOAD, MPI_COMM_WORLD);

    if (src == 0) {
      int recvsz = 0;

      // Post blocking receive for data size
      MPI_Recv(&recvsz, 1, MPI_INT, Ldest,
               VMD_MPI_TAG_ALLREDUCE_ARGLENGTH, MPI_COMM_WORLD, &status);

      // Allocate or resize receive buffer
      char * recvbuf = (char *) malloc(recvsz);

      // Post non-blocking receive for data
      MPI_Recv(recvbuf, recvsz, MPI_BYTE, Ldest,
               VMD_MPI_TAG_ALLREDUCE_PAYLOAD, MPI_COMM_WORLD, &status);

      // Perform reduction
      // Perform the reduction operation on our existing and incoming data.
      // We build a Tcl command string with the user-defined proc, this
      // node's previous result and the incoming data, and evaluate it.
      if (Tcl_VarEval(interp, argv[2], " ",
                      Tcl_GetString(resultobj), " ",
                      recvbuf, NULL) != TCL_OK) {
        printf("Error occured during reduction!\n");
      }

      // Prep for next reduction step.  Set result object to result of
      // the latest communication/reduction phase.
      resultobj = Tcl_GetObjResult(interp);

      // Free the receive buffer
      free(recvbuf);
    }

    //
    // Broadcast final result from root to peers
    //
    if (src == 0) {
      // update send buffer for root node before broadcast
      sendbuf = Tcl_GetString(resultobj);
      sendsz = strlen(sendbuf)+1;
      MPI_Bcast(&sendsz, 1, MPI_INT, 0, MPI_COMM_WORLD);
      MPI_Bcast(sendbuf, sendsz, MPI_BYTE, 0, MPI_COMM_WORLD);
    } else {
      int recvsz = 0;
      MPI_Bcast(&recvsz, 1, MPI_INT, 0, MPI_COMM_WORLD);

      // Allocate or resize receive buffer
      char * recvbuf = (char *) malloc(recvsz);

      MPI_Bcast(recvbuf, recvsz, MPI_BYTE, 0, MPI_COMM_WORLD);

      // Set the final Tcl result if necessary
      Tcl_SetObjResult(interp, Tcl_NewStringObj(recvbuf, recvsz-1));

      // Free the receive buffer
      free(recvbuf);
    }

    return TCL_OK;
#elif defined(VMDMPI)
    //
    // Power-of-two-only hypercube/butterfly/recursive doubling
    // All-Reduce implementation.  This implementation can't be used
    // in the case that we have either a non-power-of-two node count or
    // in the case where we have heterogeneous processing units that may
    // yield different floating point rounding.  For now we leave this
    // implementation in the code for performance comparisons until we work
    // out the changes necessary to make it closer to bandwidth-optimal,
    // heterogeneous-safe, and non-power-of-two capable.
    //
    int allok = 0;
    int i;

    // Check all node result codes before we continue with the reduction
    MPI_Allreduce(&isok, &allok, 1, MPI_INT, MPI_LAND, MPI_COMM_WORLD);

    // XXX we may want to verify that all nodes are going to call the same
    // reduction proc here before continuing further.

    if (!allok) {
      Tcl_SetResult(interp, (char *) "invalid parallel reduction, missing parameter on one or more nodes", TCL_STATIC);
      return TCL_ERROR;
    }

    // Calculate number of reduction phases required
    int log2N;
    for (log2N=0; N>1; N>>=1) {
      log2N++;

      // XXX bail out of we don't have a power-of-two node count,
      //     at least until we implement 3-2 reduction phases
      if ((N & 1) && (N > 1)) {
        Tcl_SetResult(interp, (char *) "parallel allreduce only allowed for even power-of-two node count", TCL_STATIC);
        return TCL_ERROR;
      }
    }
    N = app->par_size();

    // copy incoming data into initial "result" object
    Tcl_Obj *resultobj = Tcl_NewStringObj((const char *) argv[3], strlen(argv[3])+1);

    // An all-reduce tree with hypercube connectivity with
    // log2(N) communication/reduction phases.  At each phase, we compute
    // the peer/destination node we will communicate with using an XOR of
    // our node ID with the current hypercube dimension.  If we have an
    // incomplete hypercube topology (e.g. non-power-of-two node count),
    // we have to do special 3-2 communication rounds (not implemented yet).
    // The current implementation requires that all existing nodes
    // participate, and that they contribute a valid data item.
    // If we wish to support reductions where a node may not contribute,
    // we would need to handle that similarly to a peer node that doesn't
    // exist, but we would likely determine this during the parameter length
    // exchange step.
    int src=app->par_rank(); // src node is this node
    for (i=0; i<log2N; i++) {
      int mask = 1 << i;   // generate bitmask to use in the XOR
      int dest = src ^ mask; // XOR src node with bitmask to find dest node
      Tcl_Obj *oldresultobj = resultobj; // track old result

      // Check to make sure dest node exists for non-power-of-two
      // node counts (an incomplete hypercube).  If not, skip to the next
      // communication/reduction phase.
      if (dest < N) {
        char *sendbuf = Tcl_GetString(oldresultobj);
        int sendsz = strlen(sendbuf)+1;
        int recvsz = 0;
        MPI_Request handle;
        MPI_Status status;

        //
        // Exchange required receive buffer size for data exchange with peer
        //

        // Post non-blocking receive for data size
        MPI_Irecv(&recvsz, 1, MPI_INT, dest,
                  VMD_MPI_TAG_ALLREDUCE_ARGLENGTH, MPI_COMM_WORLD, &handle);

        // Post blocking send for data size
        MPI_Send(&sendsz, 1, MPI_INT, dest,
                 VMD_MPI_TAG_ALLREDUCE_ARGLENGTH, MPI_COMM_WORLD);

        // Wait for non-blocking receive of data size to complete
        MPI_Wait(&handle, &status);

// printf("src[%d], dest[%d], value '%s', recvsz: %d\n", src, dest, sendbuf, recvsz);

        // Allocate or resize receive buffer
        char * recvbuf = (char *) malloc(recvsz);

        //
        // Exchange the data payload
        //

        // Post non-blocking receive for data
        MPI_Irecv(recvbuf, recvsz, MPI_BYTE, dest,
                  VMD_MPI_TAG_ALLREDUCE_PAYLOAD, MPI_COMM_WORLD, &handle);

        // Post blocking send for data
        MPI_Send(sendbuf, sendsz, MPI_BYTE, dest,
                 VMD_MPI_TAG_ALLREDUCE_PAYLOAD, MPI_COMM_WORLD);

        // Wait for receive of data
        MPI_Wait(&handle, &status);

        // Perform the reduction operation on our existing and incoming data.
        // We build a Tcl command string with the user-defined proc, this
        // node's previous result and the incoming data, and evaluate it.
        if (Tcl_VarEval(interp, argv[2], " ",
                        sendbuf, " ",
                        recvbuf, NULL) != TCL_OK) {
          printf("Error occured during reduction!\n");
        }

        // Free the receive buffer
        free(recvbuf);

        // Prep for next reduction step.  Set result object to result of
        // the latest communication/reduction phase.
        resultobj = Tcl_GetObjResult(interp);
      }
    }

    // Set the final Tcl result if necessary
    Tcl_SetObjResult(interp, resultobj);

    return TCL_OK;
#endif
  }
void IMB_ialltoall_pure(struct comm_info* c_info,
                        int size,
                        struct iter_schedule* ITERATIONS,
                        MODES RUN_MODE,
                        double* time)
/*

                      MPI-NBC benchmark kernel
                      Benchmarks MPI_Ialltoall.

Input variables:

-c_info               (type struct comm_info*)
                      Collection of all base data for MPI;
                      see [1] for more information

-size                 (type int)
                      Basic message size in bytes

-ITERATIONS           (type struct iter_schedule *)
                      Repetition scheduling

-RUN_MODE             (type MODES)
                      (only MPI-2 case: see [1])

Output variables:

-time                 (type double*)
                      Timing result per sample

*/
{
    int i = 0;
    Type_Size s_size, r_size;
    /* initialize both counts: they stay 0 (and are still passed to
     * MPI_Ialltoall) when either datatype reports size 0 */
    int s_num = 0, r_num = 0;
    MPI_Request request;
    MPI_Status status;
    double t_pure = 0.;

#ifdef CHECK
    defect = 0.;
#endif
    ierr = 0;

    /* GET SIZE OF DATA TYPE */
    MPI_Type_size(c_info->s_data_type, &s_size);
    /* bug fix: the receive element size must come from the receive
     * datatype, not the send datatype */
    MPI_Type_size(c_info->r_data_type, &r_size);
    if ((s_size != 0) && (r_size != 0)) {
        s_num = size / s_size;
        r_num = size / r_size;
    }

    if (c_info->rank != -1) {
        /* synchronize all ranks before timing */
        for (i = 0; i < N_BARR; i++) {
            MPI_Barrier(c_info->communicator);
        }

        t_pure = MPI_Wtime();
        for (i = 0; i < ITERATIONS->n_sample; i++) {
            /* rotate through the cached buffer slots per iteration */
            ierr = MPI_Ialltoall((char*)c_info->s_buffer + i % ITERATIONS->s_cache_iter * ITERATIONS->s_offs,
                                 s_num,
                                 c_info->s_data_type,
                                 (char*)c_info->r_buffer + i % ITERATIONS->r_cache_iter * ITERATIONS->r_offs,
                                 r_num,
                                 c_info->r_data_type,
                                 c_info->communicator,
                                 &request);
            MPI_ERRHAND(ierr);
            /* "pure" variant: wait immediately, no overlapped CPU work */
            MPI_Wait(&request, &status);
            CHK_DIFF("Ialltoall_pure", c_info,
                     (char*)c_info->r_buffer + i % ITERATIONS->r_cache_iter * ITERATIONS->r_offs,
                     ((size_t)c_info->rank * (size_t)size), 0,
                     ((size_t)c_info->num_procs * (size_t)size), 1,
                     put, 0, ITERATIONS->n_sample, i, -2, &defect);
        }
        t_pure = (MPI_Wtime() - t_pure) / ITERATIONS->n_sample;
    }

    time[0] = t_pure;
}
/* MPI_Cancel conformance test: rank 1 posts an MPI_Isend with no matching
 * receive, cancels it, and verifies via MPI_Test_cancelled that the cancel
 * succeeded.  Three message sizes exercise the short, eager, and rendezvous
 * protocol paths.  The #ifdef FOO sections (normally compiled out) add a
 * matching receive on rank 0 and check status.MPI_ERROR instead. */
int main( int argc, char *argv[] )
{
    double sbuf[20000];
#ifdef FOO
    double rbuf[20000];
#endif
    int rank;
    int n, flag, size;
    int err = 0;
    int verbose = 0;   /* set non-zero for success messages */
    MPI_Status status;
    MPI_Request req;

    MPI_Init( &argc, &argv );
    MPI_Comm_rank( MPI_COMM_WORLD, &rank );
    MPI_Comm_size( MPI_COMM_WORLD, &size );

    if (size < 2) {
        printf( "Cancel test requires at least 2 processes\n" );
        MPI_Abort( MPI_COMM_WORLD, 1 );
    }

    /* Short Message Test */
    n = 200;

    if (rank == 1) { /* begin if rank = 1 */
        /* no matching receive exists, so the cancel should succeed */
        MPI_Isend( sbuf, n, MPI_DOUBLE, 0, 1, MPI_COMM_WORLD, &req );
        MPI_Cancel(&req);
        MPI_Wait(&req, &status);
        MPI_Test_cancelled(&status, &flag);
        if (!flag) {
            err++;
            printf( "Cancelling a short message failed where it should succeed.\n" );
        } else if (verbose) {
            printf("Cancelling a short message succeeded.\n");
        }
    }  /* end if rank == 1 */

#ifdef FOO
    /* Note that MPI-2 specifies that status.MPI_ERROR is only set by
       multiple completion (e.g., MPI_Waitsome) and not by test_cancelled.
    */
    MPI_Barrier(MPI_COMM_WORLD);

    if (rank == 0) {  /* begin if rank == 0 */
        MPI_Recv( rbuf, n, MPI_DOUBLE, 1, 1, MPI_COMM_WORLD, &status);
    }  /* end if rank = 0 */
    else if (rank == 1) { /* begin if rank = 1 */
        MPI_Isend( sbuf, n, MPI_DOUBLE, 0, 1, MPI_COMM_WORLD, &req );
        MPI_Cancel(&req);
        MPI_Wait(&req, &status);
        MPI_Test_cancelled(&status, &flag);
        if (!flag && status.MPI_ERROR != MPI_SUCCESS) {
            err++;
            printf( "Cancel of a send returned an error in the status field.\n" );
        }
        /* end if status.MPI_ERROR */
    }  /* end if rank == 1 */
#endif

    MPI_Barrier(MPI_COMM_WORLD);

    /* Eager Message Test */
    n = 3000;

    if (rank == 1) { /* begin if rank = 1 */
        MPI_Isend( sbuf, n, MPI_DOUBLE, 0, 1, MPI_COMM_WORLD, &req );
        MPI_Cancel(&req);
        MPI_Wait(&req, &status);
        MPI_Test_cancelled(&status, &flag);
        if (!flag) {
            err++;
            printf( "Cancelling an eager message (3000 doubles) failed where it should succeed.\n" );
        } else if (verbose) {
            printf("Cancelling an eager message (3000 doubles) succeeded.\n");
        }
    }  /* end if rank == 1 */

#ifdef FOO
    MPI_Barrier(MPI_COMM_WORLD);

    if (rank == 0) {  /* begin if rank == 0 */
        MPI_Irecv(rbuf, n, MPI_DOUBLE, 1, 1, MPI_COMM_WORLD, &req );
        MPI_Wait( &req, &status);
    }  /* end if rank = 0 */
    else if (rank == 1) { /* begin if rank = 1 */
        MPI_Isend( sbuf, n, MPI_DOUBLE, 0, 1, MPI_COMM_WORLD, &req );
        MPI_Cancel(&req);
        MPI_Wait(&req, &status);
        MPI_Test_cancelled(&status, &flag);
        if (!flag && status.MPI_ERROR != MPI_SUCCESS) {
            err++;
            printf( "Cancel of a send returned an error in the status field.\n" );
        }
        /* end if status.MPI_ERROR */
    }  /* end if rank == 1 */
#endif

    MPI_Barrier(MPI_COMM_WORLD);

    /* Rndv Message Test */
    n = 20000;

    if (rank == 1) { /* begin if rank = 1 */
        MPI_Isend( sbuf, n, MPI_DOUBLE, 0, 1, MPI_COMM_WORLD, &req );
        MPI_Cancel(&req);
        MPI_Wait(&req, &status);
        MPI_Test_cancelled(&status, &flag);
        if (!flag) {
            err++;
            printf( "Cancelling a rendezvous message failed (20000 doubles) where it should succeed.\n" );
        } else if (verbose) {
            printf("Cancelling an rendezvous message (20000 doubles) succeeded.\n");
        }
    }  /* end if rank == 1 */

#ifdef FOO
    MPI_Barrier(MPI_COMM_WORLD);

    if (rank == 0) {  /* begin if rank == 0 */
        MPI_Irecv(rbuf, n, MPI_DOUBLE, 1, 1, MPI_COMM_WORLD, &req );
        MPI_Wait( &req, &status);
    }  /* end if rank = 0 */
    else if (rank == 1) { /* begin if rank = 1 */
        MPI_Isend( sbuf, n, MPI_DOUBLE, 0, 1, MPI_COMM_WORLD, &req );
        MPI_Cancel(&req);
        MPI_Wait(&req, &status);
        MPI_Test_cancelled(&status, &flag);
        if (!flag && status.MPI_ERROR != MPI_SUCCESS) {
            err++;
            printf( "Cancel of a send returned an error in the status field.\n" );
        }
        /* end if status.MPI_ERROR */
    }  /* end if rank == 1 */
#endif

    MPI_Barrier(MPI_COMM_WORLD);

    /* only rank 1 performed checks, so only it reports */
    if (rank == 1) { /* begin if rank = 1 */
        if (err) {
            printf( "Test failed with %d errors.\n", err );
        } else {
            printf( " No Errors\n" );
        }
    }

    MPI_Finalize( );

    return 0;
}
/* Multi-node extended-attribute / reflink stress test.
 * Rank 0 prepares an original file with xattr_nums EAs and reflinks it;
 * the xattr names are broadcast (point-to-point) so all ranks share the
 * same name list; then each rank exercises a different operation mix
 * (CoW, read, list) selected by rank % 6, and rank 0 finally removes the
 * EAs and unlinks everything.  Returns 0; helper failures exit via
 * should_exit()/abort_printf().
 * NOTE(review): relies on file-scope globals (rank, size, workplace,
 * orig_path, xattr_name*, ref_counts, ...) declared outside this chunk. */
static int xattr_basic_test(int ea_name_size, int ea_value_size)
{
	int ret, fd = -1;
	int sub_testno = 1;
	char dest[PATH_MAX];
	MPI_Request request;
	MPI_Status status;
	unsigned long i, j;

	xattr_name_sz = ea_name_size;
	xattr_value_sz = ea_value_size;

	snprintf(orig_path, PATH_MAX, "%s/multi_original_xattr_refile", workplace);
	snprintf(dest, PATH_MAX, "%s_target", orig_path);

	root_printf(" *SubTest %d: Prep original inode.\n", sub_testno++);
	if (!rank) {
		/* only rank 0 creates the original file */
		ret = prep_orig_file(orig_path, file_size, 1);
		should_exit(ret);
	}

	MPI_Barrier_Sync();

	root_printf(" *SubTest %d: Prep %ld xattr name list among nodes.\n", sub_testno++, xattr_nums);
	for (i = 0; i < xattr_nums; i++) {
		memset(xattr_name, 0, xattr_name_sz + 1);
		memset(xattr_value, 0, xattr_value_sz);
		memset(xattr_value_get, 0, xattr_value_sz);

		if (!rank) {
			/* rank 0 generates each name and sends it to every
			 * other rank; Isend+Wait is effectively a blocking
			 * send per peer */
			xattr_name_generator(i, USER, xattr_name_sz, xattr_name_sz);
			strcpy(xattr_name_list_set[i], xattr_name);
			for (j = 1; j < size; j++) {
				ret = MPI_Isend(xattr_name, xattr_name_sz + 1, MPI_BYTE, j, 1, MPI_COMM_WORLD, &request);
				if (ret != MPI_SUCCESS)
					abort_printf("MPI_Isend failed: %d\n", ret);
				MPI_Wait(&request, &status);
			}
		} else {
			ret = MPI_Irecv(xattr_name, xattr_name_sz + 1, MPI_BYTE, 0, 1, MPI_COMM_WORLD, &request);
			if (ret != MPI_SUCCESS)
				abort_printf("MPI_Irecv failed: %d\n", ret);
			MPI_Wait(&request, &status);
			strcpy(xattr_name_list_set[i], xattr_name);
		}
	}

	MPI_Barrier_Sync();

	root_printf(" *SubTest %d: Prep original inode with %ld EAs.\n", sub_testno++, xattr_nums);
	if (!rank) {
		/* attach each EA, then read it back and validate round-trip */
		fd = open64(orig_path, open_rw_flags);
		for (i = 0; i < xattr_nums; i++) {
			strcpy(xattr_name, xattr_name_list_set[i]);
			xattr_value_constructor(i);
			ret = add_or_update_ea(NORMAL, fd, XATTR_CREATE, "add");
			should_exit(ret);
			ret = read_ea(NORMAL, fd);
			should_exit(ret);
			ret = xattr_value_validator(i);
			should_exit(ret);
		}

		ret = do_reflinks(orig_path, orig_path, ref_counts, 0);
		should_exit(ret);
		ret = reflink(orig_path, dest, 1);
		should_exit(ret);
	}

	MPI_Barrier_Sync();

	/* per-rank operation mix, selected by rank % 6 */
	if (rank % 6 == 1) {
		/* also doing reflinks and unlinks */
		printf(" *SubTest Rank %d: Do reflinks and cows on %ld EAs.\n", rank, xattr_nums);
		ret = do_reflinks(dest, dest, ref_counts, 0);
		should_exit(ret);
		ret = do_xattr_cows(dest, ref_counts, xattr_nums);
		should_exit(ret);
		ret = do_unlinks(dest, ref_counts);
		should_exit(ret);
	}

	if (rank % 6 == 2) {
		printf(" *SubTest Rank %d: Do cows on %ld EAs.\n", rank, xattr_nums);
		ret = do_xattr_cows(orig_path, ref_counts, xattr_nums);
		should_exit(ret);
	}

	if (rank % 6 == 3) {
		printf(" *SubTest Rank %d: Do data&ea cows on %ld EAs.\n", rank, xattr_nums);
		ret = do_xattr_data_cows(orig_path, ref_counts, xattr_nums);
		should_exit(ret);
	}

	if (rank % 6 == 4) {
		printf(" *SubTest Rank %d: Do reads on %ld EAs.\n", rank, xattr_nums);
		/* widen the value buffer so any value size can be read back */
		xattr_value_sz = XATTR_VALUE_MAX_SZ;
		ret = do_xattr_reads(orig_path, ref_counts, xattr_nums);
		should_exit(ret);
	}

	if (rank % 6 == 5) {
		printf(" *SubTest Rank %d: Do lists on %ld EAs.\n", rank, xattr_nums);
		if (list_sz < XATTR_LIST_MAX_SZ) {
			ret = do_xattr_lists(orig_path, ref_counts);
			should_exit(ret);
		}
	}

	MPI_Barrier_Sync();

	if (!rank) {
		printf(" *SubTest Rank %d: Do EA removal.\n", rank);
		ret = do_xattr_removes(orig_path, ref_counts, xattr_nums);
		should_exit(ret);
	}

	MPI_Barrier_Sync();

	/* rank 0 tears everything down */
	if (!rank) {
		close(fd);
		ret = do_unlinks(orig_path, ref_counts);
		should_exit(ret);
		ret = do_unlink(dest);
		should_exit(ret);
		ret = do_unlink(orig_path);
		should_exit(ret);
	}

	return 0;
}
int main(int argc, char *argv[]) { int rank, nprocs; int i, token1, token2, tag1 = 11, tag2 = 22, rounds, flag1, flag2; int left, right; MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Status status1, status2; MPI_Request reqs1[2], reqs2[2]; if ((rounds = atoi(argv[1])) <= 0) { if (rank == 0) fprintf(stderr, "invalid rounds: %s\n", argv[1]); MPI_Finalize(); return 0; } token1 = 0; token2 = nprocs * rounds; right = (rank + 1) % nprocs; left = (nprocs + rank - 1) % nprocs; /* Process with rank 0 sends the first token */ if (rank == 0) { MPI_Isend(&token1, 1, MPI_INT, right, tag1, MPI_COMM_WORLD, &reqs1[1]);MPI_Wait(&reqs1[1],&status1); MPI_Isend(&token2, 1, MPI_INT, left, tag2, MPI_COMM_WORLD, &reqs1[0]);MPI_Wait(&reqs1[0],&status2); } for (i= 0; i < rounds; i++) { MPI_Irecv(&token2, 1, MPI_INT, right, tag2, MPI_COMM_WORLD, &reqs2[0]); MPI_Test(&reqs2[0], &flag2, &status2); while (flag2 == 0) { MPI_Irecv(&token1, 1, MPI_INT, left, tag1, MPI_COMM_WORLD, &reqs2[1]); MPI_Test(&reqs2[0], &flag2, &status2); } printf("Rank %3d: received token1 %2d from rank %3d. (counter clock-wise)\n", rank, token2, right); token2 = token2 - 1; //sleep(1); /* To see the output printed one by one simulating message passing */ if (((rank != 0) || (i < (rounds - 1))) ) { /* Only send it on if we are not rank 0, or therefsdf are more rounds to go */ MPI_Isend(&token2, 1, MPI_INT, left, tag2, MPI_COMM_WORLD, &reqs1[0]);MPI_Wait(&reqs1[0],&status2); } MPI_Test(&reqs2[1], &flag1, &status1); while (flag1 == 0) { MPI_Test(&reqs2[1], &flag1, &status1); } printf("Rank %3d: received token1 %2d from rank %3d. 
(clock-wise)\n", rank, token1, left); token1 = token1 + 1; //sleep(1); /* To see the output printed one by one simulating message passing */ if (((rank != 0) || (i < (rounds - 1))) ) { /* Only send it on if we are not rank 0, or there are more rounds to go */ MPI_Isend(&token1, 1, MPI_INT, right, tag1, MPI_COMM_WORLD, &reqs1[1]);MPI_Wait(&reqs1[1],&status1); } } MPI_Finalize(); return 0; }
/*
 * Output_Charge_Density -- collect each spin component of the charge
 * density onto the rank that owns the corresponding slab of grid plane n1,
 * then write one restart file per (spin component, plane), rotating older
 * restart generations (…_0 -> …_1 -> …) beforehand.
 *
 * MD_iter is accepted for interface compatibility; it is not read here.
 *
 * Communication: for each spin component the ranks perform a shifted
 * ring exchange (partner offset ID = 0..numprocs-1) of the grid planes
 * they hold on behalf of other ranks.
 */
void Output_Charge_Density(int MD_iter)
{
  int i,j,k,i1,i2,i3,c;
  int GN,mul,n1,n2,n3,nn1,nn0;
  int cmd,MN,MN0,MN1,MN2,MN3;
  double ***V;               /* V[spin][local plane][Ngrid2*Ngrid3] gather buffer */
  double *tmp_array0;        /* per-partner send staging buffer */
  double *tmp_array1;        /* per-partner recv staging buffer */
  int numprocs,myid,ID,IDS,IDR,tag=999;
  char operate[300];
  char fname1[300];
  char fname2[300];
  FILE *fp;
  char fileCD0[YOUSO10];
  char fileCD1[YOUSO10];
  char fileCD2[YOUSO10];
  char buf[fp_bsize];          /* setvbuf */
  MPI_Status stat;
  MPI_Request request;

  /* MPI */
  MPI_Comm_size(mpi_comm_level1,&numprocs);
  MPI_Comm_rank(mpi_comm_level1,&myid);

  /* Number of stored spin components: 1 (unpolarized), 2 (collinear),
     4 (non-collinear).
     NOTE(review): SpinP_switch==2 leaves mul uninitialized and the code
     below would read it — confirm that value cannot reach this routine. */
  if (SpinP_switch==0) mul = 1;
  else if (SpinP_switch==1) mul = 2;
  else if (SpinP_switch==3) mul = 4;

  /****************************************************
    allocation of arrays:
    double V[mul][My_NGrid1_Poisson][Ngrid2*Ngrid3];
    NOTE(review): malloc results are not checked here.
  ****************************************************/

  V = (double***)malloc(sizeof(double**)*mul);
  for (k=0; k<mul; k++){
    V[k] = (double**)malloc(sizeof(double*)*My_NGrid1_Poisson);
    for (i=0; i<My_NGrid1_Poisson; i++){
      V[k][i] = (double*)malloc(sizeof(double)*Ngrid2*Ngrid3);
    }
  }

  /****************************************************
                    set V0 and V1
  ****************************************************/

  /* initialize */
  for (k=0; k<mul; k++){
    for (n1=0; n1<My_NGrid1_Poisson; n1++){
      for (n2=0; n2<Ngrid2*Ngrid3; n2++){
        V[k][n1][n2] = 0.0;
      }
    }
  }

  /* use their densities using MPI */
  for (k=0; k<mul; k++){
    for (ID=0; ID<numprocs; ID++){

      /* Shifted ring: send to IDS, receive from IDR, so every pair of
         ranks exchanges exactly once per offset ID. */
      IDS = (myid + ID) % numprocs;
      IDR = (myid - ID + numprocs) % numprocs;

      /* Isend */
      if (Num_Snd_Grid1[IDS]!=0){
        tmp_array0 = (double*)malloc(sizeof(double)*Num_Snd_Grid1[IDS]*Ngrid2*Ngrid3);

        /* Pack the planes owned here that rank IDS needs.  For the first
           two spin components (k<=1) the atomic reference density is
           subtracted so only the difference density is written. */
        for (i=0; i<Num_Snd_Grid1[IDS]; i++){
          n1 = Snd_Grid1[IDS][i];
          nn1 = My_Cell0[n1];
          MN1 = nn1*Ngrid2*Ngrid3;
          MN0 = i*Ngrid2*Ngrid3;
          for (n2=0; n2<Ngrid2; n2++){
            MN2 = n2*Ngrid3;
            if (k<=1){
              for (n3=0; n3<Ngrid3; n3++){
                MN = MN1 + MN2 + n3;
                tmp_array0[MN0+MN2+n3] = Density_Grid[k][MN] - 0.5*ADensity_Grid[MN];
              }
            }
            else {
              for (n3=0; n3<Ngrid3; n3++){
                MN = MN1 + MN2 + n3;
                tmp_array0[MN0+MN2+n3] = Density_Grid[k][MN];
              }
            }
          }
        }
        MPI_Isend(&tmp_array0[0], Num_Snd_Grid1[IDS]*Ngrid2*Ngrid3, MPI_DOUBLE, IDS, tag, mpi_comm_level1, &request);
      }

      /* Recv (blocking; the matching Isend above is waited on afterwards) */
      if (Num_Rcv_Grid1[IDR]!=0){
        tmp_array1 = (double*)malloc(sizeof(double)*Num_Rcv_Grid1[IDR]*Ngrid2*Ngrid3);
        MPI_Recv(&tmp_array1[0], Num_Rcv_Grid1[IDR]*Ngrid2*Ngrid3, MPI_DOUBLE, IDR, tag, mpi_comm_level1, &stat);

        /* Unpack into the local slab: nn0 is the plane index relative to
           this rank's first owned plane. */
        for (i=0; i<Num_Rcv_Grid1[IDR]; i++){
          n1 = Rcv_Grid1[IDR][i];
          nn1 = My_Cell0[n1];
          nn0 = n1 - Start_Grid1[myid];
          MN0 = i*Ngrid2*Ngrid3;
          for (n2=0; n2<Ngrid2; n2++){
            MN2 = n2*Ngrid3;
            for (n3=0; n3<Ngrid3; n3++){
              MN = MN0 + MN2 + n3;
              V[k][nn0][MN2+n3] = tmp_array1[MN];
            }
          }
        }
        free(tmp_array1);
      }

      /* request is only waited on when the Isend above was actually
         posted, matching the Num_Snd_Grid1 guard. */
      if (Num_Snd_Grid1[IDS]!=0){
        MPI_Wait(&request,&stat);
        free(tmp_array0);
      }
    }

    /* use own densities (planes this rank both owns and outputs);
       nn1 == -1 marks planes not present in the local cell mapping. */
    for (n1=Start_Grid1[myid]; n1<=End_Grid1[myid]; n1++){
      nn1 = My_Cell0[n1];
      nn0 = n1 - Start_Grid1[myid];
      if (nn1!=-1){
        MN1 = nn1*Ngrid2*Ngrid3;
        for (n2=0; n2<Ngrid2; n2++){
          MN2 = n2*Ngrid3;
          if (k<=1){
            for (n3=0; n3<Ngrid3; n3++){
              MN = MN1 + MN2 + n3;
              V[k][nn0][MN2+n3] = Density_Grid[k][MN] - 0.5*ADensity_Grid[MN];
            }
          }
          else {
            for (n3=0; n3<Ngrid3; n3++){
              MN = MN1 + MN2 + n3;
              V[k][nn0][MN2+n3] = Density_Grid[k][MN];
            }
          }
        }
      }
    }
  } /* mul */

  /****************************************************
                      output data
  ****************************************************/

  for (k=0; k<mul; k++){
    for (n1=0; n1<My_NGrid1_Poisson; n1++){
      nn0 = n1 + Start_Grid1[myid];

      /* Rotate older restart generations: …_i -> …_(i+1), newest last,
         so slot 0 is free for the data written below.  Skipped when the
         contracted-basis switch is on. */
      if (Cnt_switch!=1){
        for (i=(Extrapolated_Charge_History-2); 0<=i; i--){
          sprintf(fileCD1,"%s%s_rst/%s.crst%i_%i_%i",filepath,filename,filename,k,nn0,i);
          sprintf(fileCD2,"%s%s_rst/%s.crst%i_%i_%i",filepath,filename,filename,k,nn0,i+1);
          if ((fp = fopen(fileCD1,"r")) != NULL){
            fclose(fp);
            rename(fileCD1,fileCD2);
          }
        }
      }

      /* Write the current density plane into generation 0. */
      sprintf(fileCD0,"%s%s_rst/%s.crst%i_%i_0",filepath,filename,filename,k,nn0);
      if ((fp = fopen(fileCD0,"w")) != NULL){
        fwrite(V[k][n1],sizeof(double),Ngrid2*Ngrid3,fp);
        fclose(fp);
      }
      else{
        printf("Could not open a file %s\n",fileCD0);
      }
    }
  }

  /****************************************************
    freeing of arrays:
    double V[mul][My_NGrid1_Poisson][Ngrid2*Ngrid3];
  ****************************************************/

  for (k=0; k<mul; k++){
    for (i=0; i<My_NGrid1_Poisson; i++){
      free(V[k][i]);
    }
    free(V[k]);
  }
  free(V);
}
/*
 * MatIncreaseOverlap_MPISBAIJ_Once -- perform one round of index-set
 * overlap growth for a distributed symmetric block AIJ matrix.
 *
 * C      - the MPISBAIJ matrix whose connectivity drives the overlap
 * is_max - number of index sets in is[]
 * is[]   - in: the index sets to enlarge; out: destroyed and re-created
 *          with the enlarged sets (replaced in place via ISCreateGeneral)
 *
 * Algorithm (phase numbers match the comments below):
 *  1. pack all local is[] into per-destination messages and Isend them,
 *  2. service incoming requests: enlarge other ranks' sets against the
 *     local matrix rows (whose=OTHER) and send the results back,
 *  3. enlarge this rank's own sets locally (whose=MINE),
 *  4. receive the remotely-computed contributions and merge them into the
 *     local result, de-duplicating with PetscBT bit tables,
 *  5. rebuild is[] from the merged data.
 *
 * Message layout (both directions): [is_max, len(is[0]) .. len(is[is_max-1]),
 * indices...] — hence the recurring "1 + is_max" header offset.
 */
static PetscErrorCode MatIncreaseOverlap_MPISBAIJ_Once(Mat C,PetscInt is_max,IS is[])
{
  Mat_MPISBAIJ  *c = (Mat_MPISBAIJ*)C->data;
  PetscErrorCode ierr;
  PetscMPIInt    size,rank,tag1,tag2,*len_s,nrqr,nrqs,*id_r1,*len_r1,flag,len;
  const PetscInt *idx_i;
  PetscInt       idx,isz,col,*n,*data1,**data1_start,*data2,*data2_i,*data,*data_i, Mbs,i,j,k,*odata1,*odata2, proc_id,**odata2_ptr,*ctable=0,*btable,len_max,len_est;
  PetscInt       proc_end=0,*iwork,len_unused,nodata2;
  PetscInt       ois_max; /* max no of is[] in each of processor */
  char           *t_p;
  MPI_Comm       comm;
  MPI_Request    *s_waits1,*s_waits2,r_req;
  MPI_Status     *s_status,r_status;
  PetscBT        *table;  /* mark indices of this processor's is[] */
  PetscBT        table_i;
  PetscBT        otable;  /* mark indices of other processors' is[] */
  PetscInt       bs=C->rmap->bs,Bn = c->B->cmap->n,Bnbs = Bn/bs,*Bowners;
  IS             garray_local,garray_gl;

  PetscFunctionBegin;
  comm = ((PetscObject)C)->comm;
  size = c->size;
  rank = c->rank;
  Mbs  = c->Mbs;

  ierr = PetscObjectGetNewTag((PetscObject)C,&tag1);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)C,&tag2);CHKERRQ(ierr);

  /* create tables used in
     step 1: table[i] - mark c->garray of proc [i]
     step 3: table[i] - mark indices of is[i] when whose=MINE
             table[0] - mark incideces of is[] when whose=OTHER */
  len = PetscMax(is_max, size);CHKERRQ(ierr);
  ierr = PetscMalloc2(len,PetscBT,&table,(Mbs/PETSC_BITS_PER_BYTE+1)*len,char,&t_p);CHKERRQ(ierr);
  /* carve the single t_p allocation into one bit table per slot */
  for (i=0; i<len; i++) {
    table[i] = t_p + (Mbs/PETSC_BITS_PER_BYTE+1)*i;
  }

  ierr = MPI_Allreduce(&is_max,&ois_max,1,MPIU_INT,MPI_MAX,comm);CHKERRQ(ierr);

  /* 1. Send this processor's is[] to other processors */
  /*---------------------------------------------------*/
  /* allocate spaces */
  ierr = PetscMalloc(is_max*sizeof(PetscInt),&n);CHKERRQ(ierr);
  len = 0;
  for (i=0; i<is_max; i++) {
    ierr = ISGetLocalSize(is[i],&n[i]);CHKERRQ(ierr);
    len += n[i];
  }
  if (!len) {
    is_max = 0;                 /* nothing local to send or enlarge */
  } else {
    len += 1 + is_max;          /* max length of data1 for one processor */
  }

  ierr = PetscMalloc((size*len+1)*sizeof(PetscInt),&data1);CHKERRQ(ierr);
  ierr = PetscMalloc(size*sizeof(PetscInt*),&data1_start);CHKERRQ(ierr);
  for (i=0; i<size; i++) data1_start[i] = data1 + i*len;

  ierr = PetscMalloc4(size,PetscInt,&len_s,size,PetscInt,&btable,size,PetscInt,&iwork,size+1,PetscInt,&Bowners);CHKERRQ(ierr);

  /* gather c->garray from all processors */
  ierr = ISCreateGeneral(comm,Bnbs,c->garray,&garray_local);CHKERRQ(ierr);
  ierr = ISAllGather(garray_local, &garray_gl);CHKERRQ(ierr);
  ierr = ISDestroy(garray_local);CHKERRQ(ierr);
  ierr = MPI_Allgather(&Bnbs,1,MPIU_INT,Bowners+1,1,MPIU_INT,comm);CHKERRQ(ierr);
  /* prefix-sum so Bowners[i]..Bowners[i+1] brackets proc i's B columns */
  Bowners[0] = 0;
  for (i=0; i<size; i++) Bowners[i+1] += Bowners[i];

  if (is_max){
    /* hash table ctable which maps c->row to proc_id) */
    ierr = PetscMalloc(Mbs*sizeof(PetscInt),&ctable);CHKERRQ(ierr);
    for (proc_id=0,j=0; proc_id<size; proc_id++) {
      for (; j<C->rmap->range[proc_id+1]/bs; j++) {
        ctable[j] = proc_id;
      }
    }

    /* hash tables marking c->garray */
    /* NOTE(review): return code of this ISGetIndices is assigned but not
       checked with CHKERRQ, unlike every surrounding call. */
    ierr = ISGetIndices(garray_gl,&idx_i);
    for (i=0; i<size; i++){
      table_i = table[i];
      ierr = PetscBTMemzero(Mbs,table_i);CHKERRQ(ierr);
      for (j = Bowners[i]; j<Bowners[i+1]; j++){ /* go through B cols of proc[i]*/
        ierr = PetscBTSet(table_i,idx_i[j]);CHKERRQ(ierr);
      }
    }
    ierr = ISRestoreIndices(garray_gl,&idx_i);CHKERRQ(ierr);
  } /* if (is_max) */
  ierr = ISDestroy(garray_gl);CHKERRQ(ierr);

  /* evaluate communication - mesg to who, length, and buffer space */
  for (i=0; i<size; i++) len_s[i] = 0;

  /* header of data1 */
  for (proc_id=0; proc_id<size; proc_id++){
    iwork[proc_id] = 0;
    *data1_start[proc_id] = is_max;
    data1_start[proc_id]++;
    for (j=0; j<is_max; j++) {
      if (proc_id == rank){
        *data1_start[proc_id] = n[j];
      } else {
        *data1_start[proc_id] = 0;
      }
      data1_start[proc_id]++;
    }
  }

  /* fill the per-destination index payloads and count message lengths */
  for (i=0; i<is_max; i++) {
    ierr = ISGetIndices(is[i],&idx_i);CHKERRQ(ierr);
    for (j=0; j<n[i]; j++){
      idx = idx_i[j];
      *data1_start[rank] = idx; data1_start[rank]++; /* for local proccessing */
      proc_end = ctable[idx];
      for (proc_id=0; proc_id<=proc_end; proc_id++){ /* for others to process */
        if (proc_id == rank ) continue; /* done before this loop */
        if (proc_id < proc_end && !PetscBTLookup(table[proc_id],idx)) continue; /* no need for sending idx to [proc_id] */
        *data1_start[proc_id] = idx; data1_start[proc_id]++;
        len_s[proc_id]++;
      }
    }
    /* update header data */
    for (proc_id=0; proc_id<size; proc_id++){
      if (proc_id== rank) continue;
      *(data1 + proc_id*len + 1 + i) = len_s[proc_id] - iwork[proc_id];
      iwork[proc_id] = len_s[proc_id] ;
    }
    ierr = ISRestoreIndices(is[i],&idx_i);CHKERRQ(ierr);
  }

  nrqs = 0; nrqr = 0;
  for (i=0; i<size; i++){
    data1_start[i] = data1 + i*len;   /* rewind to message start */
    if (len_s[i]){
      nrqs++;
      len_s[i] += 1 + is_max; /* add no. of header msg */
    }
  }

  /* the old sets are rebuilt in phase 5; destroy them now */
  for (i=0; i<is_max; i++) {
    ierr = ISDestroy(is[i]);CHKERRQ(ierr);
  }
  ierr = PetscFree(n);CHKERRQ(ierr);
  ierr = PetscFree(ctable);CHKERRQ(ierr);

  /* Determine the number of messages to expect, their lengths, from from-ids */
  ierr = PetscGatherNumberOfMessages(comm,PETSC_NULL,len_s,&nrqr);CHKERRQ(ierr);
  ierr = PetscGatherMessageLengths(comm,nrqs,nrqr,len_s,&id_r1,&len_r1);CHKERRQ(ierr);

  /* Now post the sends */
  ierr = PetscMalloc2(size,MPI_Request,&s_waits1,size,MPI_Request,&s_waits2);CHKERRQ(ierr);
  k = 0;
  for (proc_id=0; proc_id<size; proc_id++){ /* send data1 to processor [proc_id] */
    if (len_s[proc_id]){
      ierr = MPI_Isend(data1_start[proc_id],len_s[proc_id],MPIU_INT,proc_id,tag1,comm,s_waits1+k);CHKERRQ(ierr);
      k++;
    }
  }

  /* 2. Receive other's is[] and process. Then send back */
  /*-----------------------------------------------------*/
  /* odata1 must hold the largest incoming request message */
  len = 0;
  for (i=0; i<nrqr; i++){
    if (len_r1[i] > len)len = len_r1[i];
  }
  ierr = PetscFree(len_r1);CHKERRQ(ierr);
  ierr = PetscFree(id_r1);CHKERRQ(ierr);

  for (proc_id=0; proc_id<size; proc_id++) len_s[proc_id] = iwork[proc_id] = 0;

  ierr = PetscMalloc((len+1)*sizeof(PetscInt),&odata1);CHKERRQ(ierr);
  ierr = PetscMalloc(size*sizeof(PetscInt**),&odata2_ptr);CHKERRQ(ierr);
  ierr = PetscBTCreate(Mbs,otable);CHKERRQ(ierr);

  len_max = ois_max*(Mbs+1); /* max space storing all is[] for each receive */
  len_est = 2*len_max; /* estimated space of storing is[] for all receiving messages */
  ierr = PetscMalloc((len_est+1)*sizeof(PetscInt),&odata2);CHKERRQ(ierr);
  nodata2 = 0; /* nodata2+1: num of PetscMalloc(,&odata2_ptr[]) called */
  odata2_ptr[nodata2] = odata2;
  len_unused = len_est; /* unused space in the array odata2_ptr[nodata2]-- needs to be >= len_max */

  /* Poll with Iprobe so requests are serviced in arrival order; each reply
     buffer stays alive (carved out of odata2) until the Isend completes. */
  k = 0;
  while (k < nrqr){
    /* Receive messages */
    ierr = MPI_Iprobe(MPI_ANY_SOURCE,tag1,comm,&flag,&r_status);CHKERRQ(ierr);
    if (flag){
      ierr = MPI_Get_count(&r_status,MPIU_INT,&len);CHKERRQ(ierr);
      proc_id = r_status.MPI_SOURCE;
      ierr = MPI_Irecv(odata1,len,MPIU_INT,proc_id,r_status.MPI_TAG,comm,&r_req);CHKERRQ(ierr);
      ierr = MPI_Wait(&r_req,&r_status);CHKERRQ(ierr);

      /* Process messages */
      /* make sure there is enough unused space in odata2 array */
      if (len_unused < len_max){ /* allocate more space for odata2 */
        ierr = PetscMalloc((len_est+1)*sizeof(PetscInt),&odata2);CHKERRQ(ierr);
        odata2_ptr[++nodata2] = odata2;
        len_unused = len_est;
      }

      ierr = MatIncreaseOverlap_MPISBAIJ_Local(C,odata1,OTHER,odata2,&otable);CHKERRQ(ierr);
      /* reply length = header + sum of per-set lengths */
      len = 1 + odata2[0];
      for (i=0; i<odata2[0]; i++){
        len += odata2[1 + i];
      }

      /* Send messages back */
      ierr = MPI_Isend(odata2,len,MPIU_INT,proc_id,tag2,comm,s_waits2+k);CHKERRQ(ierr);
      k++;
      odata2 += len;
      len_unused -= len;
      len_s[proc_id] = len; /* num of messages sending back to [proc_id] by this proc */
    }
  }
  ierr = PetscFree(odata1);CHKERRQ(ierr);
  ierr = PetscBTDestroy(otable);CHKERRQ(ierr);

  /* 3. Do local work on this processor's is[] */
  /*-------------------------------------------*/
  /* make sure there is enough unused space in odata2(=data) array */
  len_max = is_max*(Mbs+1); /* max space storing all is[] for this processor */
  if (len_unused < len_max){ /* allocate more space for odata2 */
    ierr = PetscMalloc((len_est+1)*sizeof(PetscInt),&odata2);CHKERRQ(ierr);
    odata2_ptr[++nodata2] = odata2;
    len_unused = len_est;
  }

  data = odata2;
  ierr = MatIncreaseOverlap_MPISBAIJ_Local(C,data1_start[rank],MINE,data,table);CHKERRQ(ierr);
  ierr = PetscFree(data1_start);CHKERRQ(ierr);

  /* 4. Receive work done on other processors, then merge */
  /*------------------------------------------------------*/
  /* get max number of messages that this processor expects to recv */
  ierr = MPI_Allreduce(len_s,iwork,size,MPIU_INT,MPI_MAX,comm);CHKERRQ(ierr);
  ierr = PetscMalloc((iwork[rank]+1)*sizeof(PetscInt),&data2);CHKERRQ(ierr);
  ierr = PetscFree4(len_s,btable,iwork,Bowners);CHKERRQ(ierr);

  k = 0;
  while (k < nrqs){
    /* Receive messages */
    /* NOTE(review): this Iprobe's return code is not CHKERRQ-checked,
       unlike the matching loop in phase 2. */
    ierr = MPI_Iprobe(MPI_ANY_SOURCE,tag2,comm,&flag,&r_status);
    if (flag){
      ierr = MPI_Get_count(&r_status,MPIU_INT,&len);CHKERRQ(ierr);
      proc_id = r_status.MPI_SOURCE;
      ierr = MPI_Irecv(data2,len,MPIU_INT,proc_id,r_status.MPI_TAG,comm,&r_req);CHKERRQ(ierr);
      ierr = MPI_Wait(&r_req,&r_status);CHKERRQ(ierr);
      if (len > 1+is_max){ /* header-only replies carry no new indices */
        /* Add data2 into data */
        data2_i = data2 + 1 + is_max;
        for (i=0; i<is_max; i++){
          table_i = table[i];
          data_i = data + 1 + is_max + Mbs*i;
          isz = data[1+i];
          /* merge, skipping indices already present (bit table lookup) */
          for (j=0; j<data2[1+i]; j++){
            col = data2_i[j];
            if (!PetscBTLookupSet(table_i,col)) {data_i[isz++] = col;}
          }
          data[1+i] = isz;
          if (i < is_max - 1) data2_i += data2[1+i];
        }
      }
      k++;
    }
  }
  ierr = PetscFree(data2);CHKERRQ(ierr);
  ierr = PetscFree2(table,t_p);CHKERRQ(ierr);

  /* phase 1 sends are complete */
  ierr = PetscMalloc(size*sizeof(MPI_Status),&s_status);CHKERRQ(ierr);
  if (nrqs) {ierr = MPI_Waitall(nrqs,s_waits1,s_status);CHKERRQ(ierr);}
  ierr = PetscFree(data1);CHKERRQ(ierr);

  /* phase 2 sends are complete */
  if (nrqr){ierr = MPI_Waitall(nrqr,s_waits2,s_status);CHKERRQ(ierr);}
  ierr = PetscFree2(s_waits1,s_waits2);CHKERRQ(ierr);
  ierr = PetscFree(s_status);CHKERRQ(ierr);

  /* 5. Create new is[] */
  /*--------------------*/
  for (i=0; i<is_max; i++) {
    data_i = data + 1 + is_max + Mbs*i;
    ierr = ISCreateGeneral(PETSC_COMM_SELF,data[1+i],data_i,is+i);CHKERRQ(ierr);
  }
  for (k=0; k<=nodata2; k++){
    ierr = PetscFree(odata2_ptr[k]);CHKERRQ(ierr);
  }
  ierr = PetscFree(odata2_ptr);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
void pzgstrs(int_t n, LUstruct_t *LUstruct, ScalePermstruct_t *ScalePermstruct, gridinfo_t *grid, doublecomplex *B, int_t m_loc, int_t fst_row, int_t ldb, int nrhs, SOLVEstruct_t *SOLVEstruct, SuperLUStat_t *stat, int *info) { /* * Purpose * ======= * * PZGSTRS solves a system of distributed linear equations * A*X = B with a general N-by-N matrix A using the LU factorization * computed by PZGSTRF. * If the equilibration, and row and column permutations were performed, * the LU factorization was performed for A1 where * A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U * and the linear system solved is * A1 * Y = Pc*Pr*B1, where B was overwritten by B1 = diag(R)*B, and * the permutation to B1 by Pc*Pr is applied internally in this routine. * * Arguments * ========= * * n (input) int (global) * The order of the system of linear equations. * * LUstruct (input) LUstruct_t* * The distributed data structures storing L and U factors. * The L and U factors are obtained from PZGSTRF for * the possibly scaled and permuted matrix A. * See superlu_zdefs.h for the definition of 'LUstruct_t'. * A may be scaled and permuted into A1, so that * A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U * * grid (input) gridinfo_t* * The 2D process mesh. It contains the MPI communicator, the number * of process rows (NPROW), the number of process columns (NPCOL), * and my process rank. It is an input argument to all the * parallel routines. * Grid can be initialized by subroutine SUPERLU_GRIDINIT. * See superlu_defs.h for the definition of 'gridinfo_t'. * * B (input/output) doublecomplex* * On entry, the distributed right-hand side matrix of the possibly * equilibrated system. That is, B may be overwritten by diag(R)*B. * On exit, the distributed solution matrix Y of the possibly * equilibrated system if info = 0, where Y = Pc*diag(C)^(-1)*X, * and X is the solution of the original system. * * m_loc (input) int (local) * The local row dimension of matrix B. 
* * fst_row (input) int (global) * The row number of B's first row in the global matrix. * * ldb (input) int (local) * The leading dimension of matrix B. * * nrhs (input) int (global) * Number of right-hand sides. * * SOLVEstruct (output) SOLVEstruct_t* (global) * Contains the information for the communication during the * solution phase. * * stat (output) SuperLUStat_t* * Record the statistics about the triangular solves. * See util.h for the definition of 'SuperLUStat_t'. * * info (output) int* * = 0: successful exit * < 0: if info = -i, the i-th argument had an illegal value * */ Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; doublecomplex alpha = {1.0, 0.0}; doublecomplex zero = {0.0, 0.0}; doublecomplex *lsum; /* Local running sum of the updates to B-components */ doublecomplex *x; /* X component at step k. */ /* NOTE: x and lsum are of same size. */ doublecomplex *lusup, *dest; doublecomplex *recvbuf, *tempv; doublecomplex *rtemp; /* Result of full matrix-vector multiply. */ int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; int_t *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */ Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ int_t iam, kcol, krow, mycol, myrow; int_t i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr; int_t nb, nlb, nub, nsupers; int_t *xsup, *supno, *lsub, *usub; int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ int_t Pc, Pr; int knsupc, nsupr; int ldalsum; /* Number of lsum entries locally owned. */ int maxrecvsz, p, pi; int_t **Lrowind_bc_ptr; doublecomplex **Lnzval_bc_ptr; MPI_Status status; #ifdef ISEND_IRECV MPI_Request *send_req, recv_req; #endif pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; /*-- Counts used for L-solve --*/ int_t *fmod; /* Modification count for L-solve -- Count the number of local block products to be summed into lsum[lk]. 
*/ int_t **fsendx_plist = Llu->fsendx_plist; int_t nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */ int_t *frecv; /* Count of lsum[lk] contributions to be received from processes in this row. It is only valid on the diagonal processes. */ int_t nfrecvmod = 0; /* Count of total modifications to be recv'd. */ int_t nleaf = 0, nroot = 0; /*-- Counts used for U-solve --*/ int_t *bmod; /* Modification count for U-solve. */ int_t **bsendx_plist = Llu->bsendx_plist; int_t nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */ int_t *brecv; /* Count of modifications to be recv'd from processes in this row. */ int_t nbrecvmod = 0; /* Count of total modifications to be recv'd. */ double t; #if ( DEBUGlevel>=2 ) int_t Ublocks = 0; #endif t = SuperLU_timer_(); /* Test input parameters. */ *info = 0; if ( n < 0 ) *info = -1; else if ( nrhs < 0 ) *info = -9; if ( *info ) { pxerbla("PZGSTRS", grid, -*info); return; } /* * Initialization. */ iam = grid->iam; Pc = grid->npcol; Pr = grid->nprow; myrow = MYROW( iam, grid ); mycol = MYCOL( iam, grid ); xsup = Glu_persist->xsup; supno = Glu_persist->supno; nsupers = supno[n-1] + 1; Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pzgstrs()"); #endif stat->ops[SOLVE] = 0.0; Llu->SolveMsgSent = 0; /* Save the count to be altered so it can be used by subsequent call to PDGSTRS. 
*/ if ( !(fmod = intMalloc_dist(nlb)) ) ABORT("Calloc fails for fmod[]."); for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i]; if ( !(frecv = intMalloc_dist(nlb)) ) ABORT("Malloc fails for frecv[]."); Llu->frecv = frecv; #ifdef ISEND_IRECV k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb; if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) ) ABORT("Malloc fails for send_req[]."); #endif #ifdef _CRAY ftcs1 = _cptofcd("L", strlen("L")); ftcs2 = _cptofcd("N", strlen("N")); ftcs3 = _cptofcd("U", strlen("U")); #endif /* Obtain ilsum[] and ldalsum for process column 0. */ ilsum = Llu->ilsum; ldalsum = Llu->ldalsum; /* Allocate working storage. */ knsupc = sp_ienv_dist(3); maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H ); if ( !(lsum = doublecomplexCalloc_dist(((size_t)ldalsum)*nrhs + nlb*LSUM_H)) ) ABORT("Calloc fails for lsum[]."); if ( !(x = doublecomplexMalloc_dist(ldalsum * nrhs + nlb * XK_H)) ) ABORT("Malloc fails for x[]."); if ( !(recvbuf = doublecomplexMalloc_dist(maxrecvsz)) ) ABORT("Malloc fails for recvbuf[]."); if ( !(rtemp = doublecomplexCalloc_dist(maxrecvsz)) ) ABORT("Malloc fails for rtemp[]."); /*--------------------------------------------------- * Forward solve Ly = b. *---------------------------------------------------*/ /* Redistribute B into X on the diagonal processes. */ pzReDistribute_B_to_X(B, m_loc, nrhs, ldb, fst_row, ilsum, x, ScalePermstruct, Glu_persist, grid, SOLVEstruct); /* Set up the headers in lsum[]. */ ii = 0; for (k = 0; k < nsupers; ++k) { knsupc = SuperSize( k ); krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ il = LSUM_BLK( lk ); lsum[il - LSUM_H].r = k;/* Block number prepended in the header.*/ lsum[il - LSUM_H].i = 0; } ii += knsupc; } /* * Compute frecv[] and nfrecvmod counts on the diagonal processes. 
*/ { superlu_scope_t *scp = &grid->rscp; for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ kcol = PCOL( k, grid ); /* Root process in this row scope. */ if ( mycol != kcol && fmod[lk] ) i = 1; /* Contribution from non-diagonal process. */ else i = 0; MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t, MPI_SUM, kcol, scp->comm ); if ( mycol == kcol ) { /* Diagonal process. */ nfrecvmod += frecv[lk]; if ( !frecv[lk] && !fmod[lk] ) ++nleaf; #if ( DEBUGlevel>=2 ) printf("(%2d) frecv[%4d] %2d\n", iam, k, frecv[lk]); assert( frecv[lk] < Pc ); #endif } } } } /* --------------------------------------------------------- Solve the leaf nodes first by all the diagonal processes. --------------------------------------------------------- */ #if ( DEBUGlevel>=2 ) printf("(%2d) nleaf %4d\n", iam, nleaf); #endif for (k = 0; k < nsupers && nleaf; ++k) { krow = PROW( k, grid ); kcol = PCOL( k, grid ); if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ knsupc = SuperSize( k ); lk = LBi( k, grid ); if ( frecv[lk]==0 && fmod[lk]==0 ) { fmod[lk] = -1; /* Do not solve X[k] in the future. */ ii = X_BLK( lk ); lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; nsupr = lsub[1]; #ifdef _CRAY CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #elif defined (USE_VENDOR_BLAS) ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); #else ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #endif stat->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs + 10 * knsupc * nrhs; /* complex division */ --nleaf; #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, k); #endif /* * Send Xk to process column Pc[k]. 
*/ for (p = 0; p < Pr; ++p) { if ( fsendx_plist[lk][p] != EMPTY ) { pi = PNUM( p, kcol, grid ); #ifdef ISEND_IRECV MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm, &send_req[Llu->SolveMsgSent++]); #else MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm ); #endif #if ( DEBUGlevel>=2 ) printf("(%2d) Sent X[%2.0f] to P %2d\n", iam, x[ii-XK_H], pi); #endif } } /* * Perform local block modifications: lsum[i] -= L_i,k * X[k] */ nb = lsub[0] - 1; lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; luptr = knsupc; /* Skip diagonal block L(k,k). */ zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } } /* if diagonal process ... */ } /* for k ... */ /* ----------------------------------------------------------- Compute the internal nodes asynchronously by all processes. ----------------------------------------------------------- */ #if ( DEBUGlevel>=2 ) printf("(%2d) nfrecvx %4d, nfrecvmod %4d, nleaf %4d\n", iam, nfrecvx, nfrecvmod, nleaf); #endif while ( nfrecvx || nfrecvmod ) { /* While not finished. */ /* Receive a message. */ #ifdef ISEND_IRECV /* -MPI- FATAL: Remote protocol queue full */ MPI_Irecv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX, MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &recv_req ); MPI_Wait( &recv_req, &status ); #else MPI_Recv( recvbuf, maxrecvsz, SuperLU_MPI_DOUBLE_COMPLEX, MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status ); #endif k = (*recvbuf).r; #if ( DEBUGlevel>=2 ) printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); #endif switch ( status.MPI_TAG ) { case Xk: --nfrecvx; lk = LBj( k, grid ); /* Local block number, column-wise. 
*/ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; if ( lsub ) { nb = lsub[0]; lptr = BC_HEADER; luptr = 0; knsupc = SuperSize( k ); /* * Perform local block modifications: lsum[i] -= L_i,k * X[k] */ zlsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } /* if lsub */ break; case LSUM: /* Receiver must be a diagonal process */ --nfrecvmod; lk = LBi( k, grid ); /* Local block number, row-wise. */ ii = X_BLK( lk ); knsupc = SuperSize( k ); tempv = &recvbuf[LSUM_H]; RHS_ITERATE(j) { for (i = 0; i < knsupc; ++i) z_add(&x[i + ii + j*knsupc], &x[i + ii + j*knsupc], &tempv[i + j*knsupc]); } if ( (--frecv[lk])==0 && fmod[lk]==0 ) { fmod[lk] = -1; /* Do not solve X[k] in the future. */ lk = LBj( k, grid ); /* Local block number, column-wise. */ lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; nsupr = lsub[1]; #ifdef _CRAY CTRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #elif defined (USE_VENDOR_BLAS) ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); #else ztrsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, lusup, &nsupr, &x[ii], &knsupc); #endif stat->ops[SOLVE] += 4 * knsupc * (knsupc - 1) * nrhs + 10 * knsupc * nrhs; /* complex division */ #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, k); #endif /* * Send Xk to process column Pc[k]. */ kcol = PCOL( k, grid ); for (p = 0; p < Pr; ++p) { if ( fsendx_plist[lk][p] != EMPTY ) { pi = PNUM( p, kcol, grid ); #ifdef ISEND_IRECV MPI_Isend( &x[ii-XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm, &send_req[Llu->SolveMsgSent++]); #else MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, SuperLU_MPI_DOUBLE_COMPLEX, pi, Xk, grid->comm ); #endif #if ( DEBUGlevel>=2 ) printf("(%2d) Sent X[%2.0f] to P %2d\n", iam, x[ii-XK_H], pi); #endif } } /* * Perform local block modifications. 
*/ nb = lsub[0] - 1; lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; luptr = knsupc; /* Skip diagonal block L(k,k). */ zlsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, fmod, nb, lptr, luptr, xsup, grid, Llu, send_req, stat); } /* if */ break; #if ( DEBUGlevel>=2 ) default: printf("(%2d) Recv'd wrong message tag %4d\n", status.MPI_TAG); break; #endif } /* switch */ } /* while not finished ... */ #if ( PRNTlevel>=2 ) t = SuperLU_timer_() - t; if ( !iam ) printf(".. L-solve time\t%8.2f\n", t); t = SuperLU_timer_(); #endif #if ( DEBUGlevel==2 ) { printf("(%d) .. After L-solve: y =\n", iam); for (i = 0, k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); kcol = PCOL( k, grid ); if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ knsupc = SuperSize( k ); lk = LBi( k, grid ); ii = X_BLK( lk ); for (j = 0; j < knsupc; ++j) printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]); fflush(stdout); } MPI_Barrier( grid->comm ); } } #endif SUPERLU_FREE(fmod); SUPERLU_FREE(frecv); SUPERLU_FREE(rtemp); #ifdef ISEND_IRECV for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]); Llu->SolveMsgSent = 0; #endif /*--------------------------------------------------- * Back solve Ux = y. * * The Y components from the forward solve is already * on the diagonal processes. *---------------------------------------------------*/ /* Save the count to be altered so it can be used by subsequent call to PZGSTRS. */ if ( !(bmod = intMalloc_dist(nlb)) ) ABORT("Calloc fails for bmod[]."); for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i]; if ( !(brecv = intMalloc_dist(nlb)) ) ABORT("Malloc fails for brecv[]."); Llu->brecv = brecv; /* * Compute brecv[] and nbrecvmod counts on the diagonal processes. */ { superlu_scope_t *scp = &grid->rscp; for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { lk = LBi( k, grid ); /* Local block number. */ kcol = PCOL( k, grid ); /* Root process in this row scope. 
*/ if ( mycol != kcol && bmod[lk] ) i = 1; /* Contribution from non-diagonal process. */ else i = 0; MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t, MPI_SUM, kcol, scp->comm ); if ( mycol == kcol ) { /* Diagonal process. */ nbrecvmod += brecv[lk]; if ( !brecv[lk] && !bmod[lk] ) ++nroot; #if ( DEBUGlevel>=2 ) printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]); assert( brecv[lk] < Pc ); #endif } } } } /* Re-initialize lsum to zero. Each block header is already in place. */ for (k = 0; k < nsupers; ++k) { krow = PROW( k, grid ); if ( myrow == krow ) { knsupc = SuperSize( k ); lk = LBi( k, grid ); il = LSUM_BLK( lk ); dest = &lsum[il]; RHS_ITERATE(j) { for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = zero; } } }
/* Exercise the basic MPI point-to-point send modes (MPI_Send, MPI_Ssend,
 * MPI_Rsend, MPI_Isend) between exactly two ranks, for message sizes
 * doubling from MIN_MESSAGE_LENGTH up to MAX_MESSAGE_LENGTH.
 * Data integrity is verified on the receiver by Checkbuf() (defined
 * elsewhere in this file); any setup failure aborts the job.
 * Returns 0 and prints " No Errors" (MPICH test convention) on success. */
int main( int argc, char *argv[] )
{
    int msglen, i;
    int msglen_min = MIN_MESSAGE_LENGTH;
    int msglen_max = MAX_MESSAGE_LENGTH;
    int rank,poolsize,Master;
    char *sendbuf,*recvbuf;
    char ival;
    MPI_Request request;
    MPI_Status status;

    MPI_Init(&argc,&argv);
    MPI_Comm_size(MPI_COMM_WORLD,&poolsize);
    MPI_Comm_rank(MPI_COMM_WORLD,&rank);
    /* the test is written for exactly one sender and one receiver */
    if(poolsize != 2) {
	printf("Expected exactly 2 MPI processes\n");
	MPI_Abort( MPI_COMM_WORLD, 1 );
    }

/* The following test allows this test to run on small-memory systems
   that support the sysconf call interface.  This test keeps the test from
   becoming swap-bound.  For example, on an old Linux system or a
   Sony Playstation 2 (really!) */
#if defined(HAVE_SYSCONF) && defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE)
    {
	long n_pages, pagesize;
	int actmsglen_max;
	n_pages  = sysconf( _SC_PHYS_PAGES );
	pagesize = sysconf( _SC_PAGESIZE );
	/* We want to avoid integer overflow in the size calculation.
	   The best way is to avoid computing any products (such
	   as total memory = n_pages * pagesize) and instead
	   compute a msglen_max that fits within 1/4 of the available
	   pages */
	if (n_pages > 0 && pagesize > 0) {
	    /* Recompute msglen_max */
	    int msgpages = 4 * ((msglen_max + pagesize - 1)/ pagesize);
	    while (n_pages < msgpages) { msglen_max /= 2; msgpages /= 2; }
	}
	/* printf ( "before = %d\n", msglen_max ); */
	/* both ranks must agree on the (possibly reduced) maximum size,
	   so take the global minimum */
	MPI_Allreduce( &msglen_max, &actmsglen_max, 1, MPI_INT,
		       MPI_MIN, MPI_COMM_WORLD );
	msglen_max = actmsglen_max;
	/* printf ( "after = %d\n", msglen_max ); */
    }
#endif

    Master = (rank == 0);

    if(Master && verbose)
	printf("Size (bytes)\n------------\n");
    for(msglen = msglen_min; msglen <= msglen_max; msglen *= 2) {

	sendbuf = malloc(msglen);
	recvbuf = malloc(msglen);
	if(sendbuf == NULL || recvbuf == NULL) {
	    printf("Can't allocate %d bytes\n",msglen);
	    MPI_Abort( MPI_COMM_WORLD, 1 );
	}

	/* fill the send buffer with a known byte pattern; ival is a char,
	   so the pattern wraps modulo the char range */
	ival = 0;
	for (i=0; i<msglen; i++) {
	    sendbuf[i] = ival++;
	    recvbuf[i] = 0;
	}

	if(Master && verbose)
	    printf("%d\n",msglen);
	fflush(stdout);

	MPI_Barrier(MPI_COMM_WORLD);

	/* Send/Recv */
	if(Master)
	    MPI_Send(sendbuf,msglen,MPI_CHAR,1,TAG1,MPI_COMM_WORLD);
	else {
	    Resetbuf( recvbuf, msglen );
	    MPI_Recv(recvbuf,msglen,MPI_CHAR,0,TAG1,MPI_COMM_WORLD,&status);
	    Checkbuf( recvbuf, msglen, &status );
	}

	MPI_Barrier(MPI_COMM_WORLD);

	/* Ssend/Recv */
	if(Master)
	    MPI_Ssend(sendbuf,msglen,MPI_CHAR,1,TAG2,MPI_COMM_WORLD);
	else {
	    Resetbuf( recvbuf, msglen );
	    MPI_Recv(recvbuf,msglen,MPI_CHAR,0,TAG2,MPI_COMM_WORLD,&status);
	    Checkbuf( recvbuf, msglen, &status );
	}

	MPI_Barrier(MPI_COMM_WORLD);

	/* Rsend/Recv: MPI_Rsend requires that the matching receive is
	   already posted.  The receiver first posts MPI_Irecv, then both
	   sides perform a zero-byte Sendrecv handshake, guaranteeing the
	   receive is in place before the ready-send starts. */
	if (Master) {
	    MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, 1, TAGSR,
			  MPI_BOTTOM, 0, MPI_INT, 1, TAGSR,
			  MPI_COMM_WORLD, &status );
	    MPI_Rsend( sendbuf,msglen,MPI_CHAR,1,TAG3,MPI_COMM_WORLD );
	}
	else {
	    Resetbuf( recvbuf, msglen );
	    MPI_Irecv( recvbuf,msglen,MPI_CHAR,0,TAG3,MPI_COMM_WORLD,&request);
	    MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, 0, TAGSR,
			  MPI_BOTTOM, 0, MPI_INT, 0, TAGSR,
			  MPI_COMM_WORLD, &status );
	    MPI_Wait( &request, &status );
	    Checkbuf( recvbuf, msglen, &status );
	}

	MPI_Barrier(MPI_COMM_WORLD);

	/* Isend/Recv - receive not ready: here the handshake runs BEFORE
	   the receiver posts its receive, so the nonblocking send is
	   started while no matching receive exists yet. */
	if(Master) {
	    MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, 1, TAGSR,
			  MPI_BOTTOM, 0, MPI_INT, 1, TAGSR,
			  MPI_COMM_WORLD, &status );
	    MPI_Isend(sendbuf,msglen,MPI_CHAR,1,TAG4,MPI_COMM_WORLD, &request);
	    MPI_Wait( &request, &status );
	}
	else {
	    Resetbuf( recvbuf, msglen );
	    MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, 0, TAGSR,
			  MPI_BOTTOM, 0, MPI_INT, 0, TAGSR,
			  MPI_COMM_WORLD, &status );
	    MPI_Recv(recvbuf,msglen,MPI_CHAR,0,TAG4,MPI_COMM_WORLD,&status);
	    Checkbuf( recvbuf, msglen, &status );
	}

	MPI_Barrier(MPI_COMM_WORLD);

	free(sendbuf);
	free(recvbuf);
    }

    if (rank == 0) {
	/* If we do not abort, we saw no errors */
	printf( " No Errors\n" );
    }

    MPI_Finalize();
    return 0;
}
/* MPI+OpenMP star-stencil benchmark driver (Parallel Research Kernels).
   Decomposes an n x n grid over a 2D grid of MPI ranks (as close to
   square as possible), exchanges RADIUS-deep halos with the four
   neighbors each iteration using nonblocking MPI, applies the star
   stencil with OpenMP threading, and verifies the L1 norm of the result
   against an analytically known reference value.
   Relies on macros defined elsewhere in this file: DTYPE, RESTRICT,
   RADIUS, MAX_THREADS, COEFX/COEFY, IN/OUT/WEIGHT (index mappings),
   MPI_DTYPE, FSTR, EPSILON, and the helpers bail_out(), prk_malloc(),
   wtime(). */
int main(int argc, char ** argv) {

  int    Num_procs;       /* number of ranks                                     */
  int    Num_procsx, Num_procsy; /* number of ranks in each coord direction      */
  int    my_ID;           /* MPI rank                                            */
  int    my_IDx, my_IDy;  /* coordinates of rank in rank grid                    */
  int    right_nbr;       /* global rank of right neighboring tile               */
  int    left_nbr;        /* global rank of left neighboring tile                */
  int    top_nbr;         /* global rank of top neighboring tile                 */
  int    bottom_nbr;      /* global rank of bottom neighboring tile              */
  DTYPE *top_buf_out;     /* communication buffer                                */
  DTYPE *top_buf_in;      /*       "         "                                   */
  DTYPE *bottom_buf_out;  /*       "         "                                   */
  DTYPE *bottom_buf_in;   /*       "         "                                   */
  DTYPE *right_buf_out;   /*       "         "                                   */
  DTYPE *right_buf_in;    /*       "         "                                   */
  DTYPE *left_buf_out;    /*       "         "                                   */
  DTYPE *left_buf_in;     /*       "         "                                   */
  int    root = 0;
  int    n, width, height;/* linear global and local grid dimension              */
  long   nsquare;         /* total number of grid points                         */
  int    i, j, ii, jj, kk, it, jt, iter, leftover;  /* dummies                   */
  int    istart, iend;    /* bounds of grid tile assigned to calling rank        */
  int    jstart, jend;    /* bounds of grid tile assigned to calling rank        */
  DTYPE  norm,            /* L1 norm of solution                                 */
         local_norm,      /* contribution of calling rank to L1 norm             */
         reference_norm;
  DTYPE  f_active_points; /* interior of grid with respect to stencil            */
  DTYPE  flops;           /* floating point ops per iteration                    */
  int    iterations;      /* number of times to run the algorithm                */
  double local_stencil_time,/* timing parameters                                 */
         stencil_time, avgtime;
  int    stencil_size;    /* number of points in stencil                         */
  int    nthread_input,   /* thread parameters                                   */
         nthread;
  DTYPE  * RESTRICT in;   /* input grid values                                   */
  DTYPE  * RESTRICT out;  /* output grid values                                  */
  long   total_length_in; /* total required length to store input array          */
  long   total_length_out;/* total required length to store output array         */
  int    error=0;         /* error flag                                          */
  DTYPE  weight[2*RADIUS+1][2*RADIUS+1]; /* weights of points in the stencil     */
  /* requests[0..3] are used for the y-direction halo exchange,
     requests[4..7] for the x-direction exchange (see main loop below) */
  MPI_Request request[8];

  /*******************************************************************************
  ** Initialize the MPI environment
  ********************************************************************************/
  MPI_Init(&argc,&argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_ID);
  MPI_Comm_size(MPI_COMM_WORLD, &Num_procs);

  /*******************************************************************************
  ** process, test, and broadcast input parameters
  ********************************************************************************/
  /* only root parses the command line; results are broadcast below and
     any error is propagated to all ranks through bail_out() */
  if (my_ID == root) {
    printf("Parallel Research Kernels version %s\n", PRKVERSION);
    printf("MPI+OPENMP stencil execution on 2D grid\n");
#ifndef STAR
    printf("ERROR: Compact stencil not supported\n");
    error = 1;
    goto ENDOFTESTS;
#endif

    if (argc != 4){
      printf("Usage: %s <#threads><#iterations> <array dimension> \n", *argv);
      error = 1;
      goto ENDOFTESTS;
    }

    /* Take number of threads to request from command line */
    nthread_input = atoi(*++argv);
    if ((nthread_input < 1) || (nthread_input > MAX_THREADS)) {
      printf("ERROR: Invalid number of threads: %d\n", nthread_input);
      error = 1;
      goto ENDOFTESTS;
    }

    iterations  = atoi(*++argv);
    if (iterations < 1){
      printf("ERROR: iterations must be >= 1 : %d \n",iterations);
      error = 1;
      goto ENDOFTESTS;
    }

    n  = atoi(*++argv);
    nsquare = (long) n * (long) n;
    if (nsquare < Num_procs){
      printf("ERROR: grid size %ld must be at least # ranks: %d\n",
             nsquare, Num_procs);
      error = 1;
      goto ENDOFTESTS;
    }

    if (RADIUS < 0) {
      printf("ERROR: Stencil radius %d should be non-negative\n", RADIUS);
      error = 1;
      goto ENDOFTESTS;
    }

    if (2*RADIUS +1 > n) {
      printf("ERROR: Stencil radius %d exceeds grid size %d\n", RADIUS, n);
      error = 1;
      goto ENDOFTESTS;
    }

    ENDOFTESTS:;
  }
  bail_out(error);

  /* determine best way to create a 2D grid of ranks (closest to square, for
     best surface/volume ratio); we do this brute force for now */
  for (Num_procsx=(int) (sqrt(Num_procs+1)); Num_procsx>0; Num_procsx--) {
    if (!(Num_procs%Num_procsx)) {
      Num_procsy = Num_procs/Num_procsx;
      break;
    }
  }
  my_IDx = my_ID%Num_procsx;
  my_IDy = my_ID/Num_procsx;
  /* compute neighbors; don't worry about dropping off the edges of the grid;
     edge ranks simply never post sends/receives in those directions */
  right_nbr  = my_ID+1;
  left_nbr   = my_ID-1;
  top_nbr    = my_ID+Num_procsx;
  bottom_nbr = my_ID-Num_procsx;

  MPI_Bcast(&n,             1, MPI_INT, root, MPI_COMM_WORLD);
  MPI_Bcast(&iterations,    1, MPI_INT, root, MPI_COMM_WORLD);
  MPI_Bcast(&nthread_input, 1, MPI_INT, root, MPI_COMM_WORLD);

  omp_set_num_threads(nthread_input);

  if (my_ID == root) {
    printf("Number of ranks          = %d\n", Num_procs);
    printf("Number of threads        = %d\n", omp_get_max_threads());
    printf("Grid size                = %d\n", n);
    printf("Radius of stencil        = %d\n", RADIUS);
    printf("Tiles in x/y-direction   = %d/%d\n", Num_procsx, Num_procsy);
    printf("Type of stencil          = star\n");
#if DOUBLE
    printf("Data type                = double precision\n");
#else
    printf("Data type                = single precision\n");
#endif
#if LOOPGEN
    printf("Script used to expand stencil loop body\n");
#else
    printf("Compact representation of stencil loop body\n");
#endif
    printf("Number of iterations     = %d\n", iterations);
  }

  /* compute amount of space required for input and solution arrays;
     the first 'leftover' ranks in each direction get one extra column/row */
  width = n/Num_procsx;
  leftover = n%Num_procsx;
  if (my_IDx<leftover) {
    istart = (width+1) * my_IDx;
    iend = istart + width;
  }
  else {
    istart = (width+1) * leftover + width * (my_IDx-leftover);
    iend = istart + width - 1;
  }

  width = iend - istart + 1;
  if (width == 0) {
    printf("ERROR: rank %d has no work to do\n", my_ID);
    error = 1;
  }
  bail_out(error);

  height = n/Num_procsy;
  leftover = n%Num_procsy;
  if (my_IDy<leftover) {
    jstart = (height+1) * my_IDy;
    jend = jstart + height;
  }
  else {
    jstart = (height+1) * leftover + height * (my_IDy-leftover);
    jend = jstart + height - 1;
  }

  height = jend - jstart + 1;
  if (height == 0) {
    printf("ERROR: rank %d has no work to do\n", my_ID);
    error = 1;
  }
  bail_out(error);

  if (width < RADIUS || height < RADIUS) {
    printf("ERROR: rank %d has work tile smaller then stencil radius\n",
           my_ID);
    error = 1;
  }
  bail_out(error);

  /* the division check below detects multiplication overflow in
     total_length_in without computing a wider product.
     NOTE(review): total_length_out has no analogous overflow check —
     presumably safe because it is strictly smaller; confirm for very
     large n. */
  total_length_in = (width+2*RADIUS)*(height+2*RADIUS)*sizeof(DTYPE);
  if (total_length_in/(height+2*RADIUS) != (width+2*RADIUS)*sizeof(DTYPE)) {
    printf("ERROR: Space for %d x %d input array cannot be represented\n",
           width+2*RADIUS, height+2*RADIUS);
    error = 1;
  }
  bail_out(error);

  total_length_out = width*height*sizeof(DTYPE);

  /* IN carries a RADIUS-deep halo on all sides; OUT covers only the
     interior tile (see the IN/OUT index macros defined elsewhere) */
  in  = (DTYPE *) prk_malloc(total_length_in);
  out = (DTYPE *) prk_malloc(total_length_out);
  if (!in || !out) {
    printf("ERROR: rank %d could not allocate space for input/output array\n",
           my_ID);
    error = 1;
  }
  bail_out(error);

  /* fill the stencil weights to reflect a discrete divergence operator */
  for (jj=-RADIUS; jj<=RADIUS; jj++)
    for (ii=-RADIUS; ii<=RADIUS; ii++)
      WEIGHT(ii,jj) = (DTYPE) 0.0;

  stencil_size = 4*RADIUS+1;
  for (ii=1; ii<=RADIUS; ii++) {
    WEIGHT(0, ii) = WEIGHT( ii,0) =  (DTYPE) (1.0/(2.0*ii*RADIUS));
    WEIGHT(0,-ii) = WEIGHT(-ii,0) = -(DTYPE) (1.0/(2.0*ii*RADIUS));
  }

  norm = (DTYPE) 0.0;
  f_active_points = (DTYPE) (n-2*RADIUS)*(DTYPE) (n-2*RADIUS);

  /* intialize the input and output arrays; the linear ramp COEFX*i+COEFY*j
     makes the expected L1 norm analytically computable for verification */
  #pragma omp parallel for private (i)
  for (j=jstart; j<=jend; j++)
    for (i=istart; i<=iend; i++) {
      IN(i,j)  = COEFX*i+COEFY*j;
      OUT(i,j) = (DTYPE)0.0;
    }

  /* allocate communication buffers for halo values; one allocation is
     partitioned into out/in buffers for both y-direction neighbors */
  top_buf_out = (DTYPE *) prk_malloc(4*sizeof(DTYPE)*RADIUS*width);
  if (!top_buf_out) {
    printf("ERROR: Rank %d could not allocated comm buffers for y-direction\n",
           my_ID);
    error = 1;
  }
  bail_out(error);
  top_buf_in     = top_buf_out +   RADIUS*width;
  bottom_buf_out = top_buf_out + 2*RADIUS*width;
  bottom_buf_in  = top_buf_out + 3*RADIUS*width;

  /* same partitioning scheme for the x-direction neighbors */
  right_buf_out = (DTYPE *) prk_malloc(4*sizeof(DTYPE)*RADIUS*height);
  if (!right_buf_out) {
    printf("ERROR: Rank %d could not allocated comm buffers for x-direction\n",
           my_ID);
    error = 1;
  }
  bail_out(error);
  right_buf_in  = right_buf_out +   RADIUS*height;
  left_buf_out  = right_buf_out + 2*RADIUS*height;
  left_buf_in   = right_buf_out + 3*RADIUS*height;

  for (iter = 0; iter<=iterations; iter++){

    /* start timer after a warmup iteration */
    if (iter == 1) {
      MPI_Barrier(MPI_COMM_WORLD);
      local_stencil_time = wtime();
    }

    /* need to fetch ghost point data from neighbors in y-direction;
       tags 99/101 distinguish upward- from downward-travelling data */
    if (my_IDy < Num_procsy-1) {
      MPI_Irecv(top_buf_in, RADIUS*width, MPI_DTYPE, top_nbr, 101,
                MPI_COMM_WORLD, &(request[1]));
      for (kk=0,j=jend-RADIUS+1; j<=jend; j++)
        for (i=istart; i<=iend; i++) {
          top_buf_out[kk++]= IN(i,j);
        }
      MPI_Isend(top_buf_out, RADIUS*width,MPI_DTYPE, top_nbr, 99,
                MPI_COMM_WORLD, &(request[0]));
    }
    if (my_IDy > 0) {
      MPI_Irecv(bottom_buf_in,RADIUS*width, MPI_DTYPE, bottom_nbr, 99,
                MPI_COMM_WORLD, &(request[3]));
      for (kk=0,j=jstart; j<=jstart+RADIUS-1; j++)
        for (i=istart; i<=iend; i++) {
          bottom_buf_out[kk++]= IN(i,j);
        }
      MPI_Isend(bottom_buf_out, RADIUS*width,MPI_DTYPE, bottom_nbr, 101,
                MPI_COMM_WORLD, &(request[2]));
    }
    /* complete the y-direction exchange and unpack into the halo cells */
    if (my_IDy < Num_procsy-1) {
      MPI_Wait(&(request[0]), MPI_STATUS_IGNORE);
      MPI_Wait(&(request[1]), MPI_STATUS_IGNORE);
      for (kk=0,j=jend+1; j<=jend+RADIUS; j++)
        for (i=istart; i<=iend; i++) {
          IN(i,j) = top_buf_in[kk++];
        }
    }
    if (my_IDy > 0) {
      MPI_Wait(&(request[2]), MPI_STATUS_IGNORE);
      MPI_Wait(&(request[3]), MPI_STATUS_IGNORE);
      for (kk=0,j=jstart-RADIUS; j<=jstart-1; j++)
        for (i=istart; i<=iend; i++) {
          IN(i,j) = bottom_buf_in[kk++];
        }
    }

    /* need to fetch ghost point data from neighbors in x-direction;
       tags 990/1010 keep these exchanges distinct from the y-direction */
    if (my_IDx < Num_procsx-1) {
      MPI_Irecv(right_buf_in, RADIUS*height, MPI_DTYPE, right_nbr, 1010,
                MPI_COMM_WORLD, &(request[1+4]));
      for (kk=0,j=jstart; j<=jend; j++)
        for (i=iend-RADIUS+1; i<=iend; i++) {
          right_buf_out[kk++]= IN(i,j);
        }
      MPI_Isend(right_buf_out, RADIUS*height, MPI_DTYPE, right_nbr, 990,
                MPI_COMM_WORLD, &(request[0+4]));
    }
    if (my_IDx > 0) {
      MPI_Irecv(left_buf_in, RADIUS*height, MPI_DTYPE, left_nbr, 990,
                MPI_COMM_WORLD, &(request[3+4]));
      for (kk=0,j=jstart; j<=jend; j++)
        for (i=istart; i<=istart+RADIUS-1; i++) {
          left_buf_out[kk++]= IN(i,j);
        }
      MPI_Isend(left_buf_out, RADIUS*height, MPI_DTYPE, left_nbr, 1010,
                MPI_COMM_WORLD, &(request[2+4]));
    }
    /* complete the x-direction exchange and unpack into the halo cells */
    if (my_IDx < Num_procsx-1) {
      MPI_Wait(&(request[0+4]), MPI_STATUS_IGNORE);
      MPI_Wait(&(request[1+4]), MPI_STATUS_IGNORE);
      for (kk=0,j=jstart; j<=jend; j++)
        for (i=iend+1; i<=iend+RADIUS; i++) {
          IN(i,j) = right_buf_in[kk++];
        }
    }
    if (my_IDx > 0) {
      MPI_Wait(&(request[2+4]), MPI_STATUS_IGNORE);
      MPI_Wait(&(request[3+4]), MPI_STATUS_IGNORE);
      for (kk=0,j=jstart; j<=jend; j++)
        for (i=istart-RADIUS; i<=istart-1; i++) {
          IN(i,j) = left_buf_in[kk++];
        }
    }

    /* Apply the stencil operator; the MAX/MIN clamps restrict the update
       to grid points at least RADIUS away from the global boundary */
    #pragma omp parallel for private (i, j, ii, jj)
    for (j=MAX(jstart,RADIUS); j<=MIN(n-RADIUS-1,jend); j++) {
      for (i=MAX(istart,RADIUS); i<=MIN(n-RADIUS-1,iend); i++) {
#if LOOPGEN
  #include "loop_body_star.incl"
#else
        for (jj=-RADIUS; jj<=RADIUS; jj++)
          OUT(i,j) += WEIGHT(0,jj)*IN(i,j+jj);
        for (ii=-RADIUS; ii<0; ii++)
          OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j);
        for (ii=1; ii<=RADIUS; ii++)
          OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j);
#endif
      }
    }

    /* add constant to solution to force refresh of neighbor data, if any */
    #pragma omp parallel for private (i)
    for (j=jstart; j<=jend; j++)
      for (i=istart; i<=iend; i++)
        IN(i,j)+= 1.0;

  }

  local_stencil_time = wtime() - local_stencil_time;
  /* overall time is that of the slowest rank */
  MPI_Reduce(&local_stencil_time, &stencil_time, 1, MPI_DOUBLE, MPI_MAX,
             root, MPI_COMM_WORLD);

  /* compute L1 norm in parallel */
  local_norm = (DTYPE) 0.0;
  #pragma omp parallel for reduction(+:local_norm) private (i)
  for (j=MAX(jstart,RADIUS); j<=MIN(n-RADIUS-1,jend); j++) {
    for (i=MAX(istart,RADIUS); i<=MIN(n-RADIUS-1,iend); i++) {
      local_norm += (DTYPE)ABS(OUT(i,j));
    }
  }
  MPI_Reduce(&local_norm, &norm, 1, MPI_DTYPE, MPI_SUM, root, MPI_COMM_WORLD);

  /*******************************************************************************
  ** Analyze and output results.
  ********************************************************************************/

  /* verify correctness: for the linear-ramp input the average stencil
     response per active point is known in closed form */
  if (my_ID == root) {
    norm /= f_active_points;
    if (RADIUS > 0) {
      reference_norm = (DTYPE) (iterations+1) * (COEFX + COEFY);
    }
    else {
      reference_norm = (DTYPE) 0.0;
    }
    if (ABS(norm-reference_norm) > EPSILON) {
      printf("ERROR: L1 norm = "FSTR", Reference L1 norm = "FSTR"\n",
             norm, reference_norm);
      error = 1;
    }
    else {
      printf("Solution validates\n");
#if VERBOSE
      printf("Reference L1 norm = "FSTR", L1 norm = "FSTR"\n",
             reference_norm, norm);
#endif
    }
  }
  bail_out(error);

  if (my_ID == root) {
    /* flops/stencil: 2 flops (fma) for each point in the stencil,
       plus one flop for the update of the input of the array */
    flops = (DTYPE) (2*stencil_size+1) * f_active_points;
    avgtime = stencil_time/iterations;
    printf("Rate (MFlops/s): "FSTR"  Avg time (s): %lf\n",
           1.0E-06 * flops/avgtime, avgtime);
  }

  MPI_Finalize();
  exit(EXIT_SUCCESS);
}
int main(int argc, char *argv[]) { int rank, nproc, i; int errors = 0, all_errors = 0; int *buf = NULL, *winbuf = NULL; MPI_Win window; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nproc); if (nproc < 2) { if (rank == 0) printf("Error: must be run with two or more processes\n"); MPI_Abort(MPI_COMM_WORLD, 1); } MPI_Alloc_mem(MAX_SIZE * sizeof(int), MPI_INFO_NULL, &buf); MPI_Alloc_mem(MAX_SIZE * sizeof(int), MPI_INFO_NULL, &winbuf); MPI_Win_create(winbuf, MAX_SIZE * sizeof(int), sizeof(int), MPI_INFO_NULL, MPI_COMM_WORLD, &window); MPI_Win_lock_all(0, window); /* Test Raccumulate local completion with small data. * Small data is always copied to header packet as immediate data. */ if (rank == 1) { for (i = 0; i < ITER; i++) { MPI_Request acc_req; int val = -1; buf[0] = rank * i; MPI_Raccumulate(&buf[0], 1, MPI_INT, 0, 0, 1, MPI_INT, MPI_MAX, window, &acc_req); MPI_Wait(&acc_req, MPI_STATUS_IGNORE); /* reset local buffer to check local completion */ buf[0] = 0; MPI_Win_flush(0, window); MPI_Get(&val, 1, MPI_INT, 0, 0, 1, MPI_INT, window); MPI_Win_flush(0, window); if (val != rank * i) { printf("%d - Got %d in small Raccumulate test, expected %d (%d * %d)\n", rank, val, rank * i, rank, i); errors++; } } } MPI_Barrier(MPI_COMM_WORLD); /* Test Raccumulate local completion with large data . * Large data is not suitable for 1-copy optimization, and always sent out * from user buffer. 
*/ if (rank == 1) { for (i = 0; i < ITER; i++) { MPI_Request acc_req; int val0 = -1, val1 = -1, val2 = -1; int j; /* initialize data */ for (j = 0; j < MAX_SIZE; j++) { buf[j] = rank + j + i; } MPI_Raccumulate(buf, MAX_SIZE, MPI_INT, 0, 0, MAX_SIZE, MPI_INT, MPI_REPLACE, window, &acc_req); MPI_Wait(&acc_req, MPI_STATUS_IGNORE); /* reset local buffer to check local completion */ buf[0] = 0; buf[MAX_SIZE - 1] = 0; buf[MAX_SIZE / 2] = 0; MPI_Win_flush(0, window); /* get remote values which are modified in local buffer after wait */ MPI_Get(&val0, 1, MPI_INT, 0, 0, 1, MPI_INT, window); MPI_Get(&val1, 1, MPI_INT, 0, MAX_SIZE - 1, 1, MPI_INT, window); MPI_Get(&val2, 1, MPI_INT, 0, MAX_SIZE / 2, 1, MPI_INT, window); MPI_Win_flush(0, window); if (val0 != rank + i) { printf("%d - Got %d in large Raccumulate test, expected %d\n", rank, val0, rank + i); errors++; } if (val1 != rank + MAX_SIZE - 1 + i) { printf("%d - Got %d in large Raccumulate test, expected %d\n", rank, val1, rank + MAX_SIZE - 1 + i); errors++; } if (val2 != rank + MAX_SIZE / 2 + i) { printf("%d - Got %d in large Raccumulate test, expected %d\n", rank, val2, rank + MAX_SIZE / 2 + i); errors++; } } } MPI_Win_unlock_all(window); MPI_Barrier(MPI_COMM_WORLD); MPI_Win_free(&window); if (buf) MPI_Free_mem(buf); if (winbuf) MPI_Free_mem(winbuf); MPI_Reduce(&errors, &all_errors, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD); if (rank == 0 && all_errors == 0) printf(" No Errors\n"); MPI_Finalize(); return 0; }
/* MP_quit disconnects current node from MP-System:
 * Parameters:
 *     IN isError - error number, 0 if normal exit
 * Returns: Bool: success (1) or failure (0)
 *
 * MPI Version: MPI requires that all sent messages must be received
 * before quitting. Receive or cancel all pending messages (using msg.
 * count), then quit from MPI.
 *
 * Shutdown protocol: each FINISH exchange is a PAIR of messages — a
 * one-int ping on sysComm (sent with MPI_Isend) and the data payload on
 * MPI_COMM_WORLD (sent with MPI_Send).  The main PE broadcasts FINISH to
 * every other PE and counts their replies; a non-main PE sends FINISH to
 * rank 0 and, unless it has already seen one, awaits the answer.
 * Finally a "quick & dirty" drain loop swallows any messages still in
 * flight before MPI_Finalize.
 */
rtsBool MP_quit(int isError) {
  StgWord data[2];
  MPI_Request sysRequest2;

  /* FINISH payload: tag word followed by the exit code */
  data[0] = PP_FINISH;
  data[1] = isError;

  if (IAmMainThread) {
    int i;

    IF_PAR_DEBUG(mpcomm,
		 debugBelch("Main PE stopping MPI system (exit code: %d)\n",
			    isError));

    // bcast FINISH to other PEs
    for (i=2; i<=(int)nPEs; i++) {
      // synchronous send operation in order 2..nPEs ... might slow down.
      /* ping on sysComm is started first, data payload goes on
         MPI_COMM_WORLD, then we wait for the ping's completion */
      MPI_Isend(&pingMessage, 1, MPI_INT, i-1, PP_FINISH, sysComm, &sysRequest2);
      MPI_Send(data,2*sizeof(StgWord),MPI_BYTE,i-1, PP_FINISH, MPI_COMM_WORLD);
      MPI_Wait(&sysRequest2, MPI_STATUS_IGNORE);
    }

    // receive answers from all children (just counting)
    /* NOTE(review): finishRecvd and status are file-scope state shared
       with the message-handling code elsewhere in this file */
    while (finishRecvd < mpiWorldSize-1) {
      MPI_Recv(data, 2*sizeof(StgWord), MPI_BYTE, MPI_ANY_SOURCE,
	       PP_FINISH, MPI_COMM_WORLD, &status);
      ASSERT(status.MPI_TAG == PP_FINISH);
      // and receive corresponding sysComm ping:
      MPI_Recv(&pingMessage, 1, MPI_INT, status.MPI_SOURCE, PP_FINISH,
	       sysComm, MPI_STATUS_IGNORE);
      IF_PAR_DEBUG(mpcomm,
		   debugBelch("Received FINISH reply from %d\n",
			      status.MPI_SOURCE));
      finishRecvd++;
    }
  }
  else {

    IF_PAR_DEBUG(mpcomm,
		 debugBelch("Non-main PE stopping MPI system (exit code: %d)\n",
			    isError));
    // send FINISH to rank 0
    MPI_Isend(&pingMessage, 1, MPI_INT, 0, PP_FINISH, sysComm, &sysRequest2);
    MPI_Send(data, 2*sizeof(StgWord), MPI_BYTE, 0, PP_FINISH, MPI_COMM_WORLD);
    // can omit: MPI_Wait(&sysRequest2, MPI_STATUS_IGNORE);

    // if non-main PE terminates first, await answer
    if (finishRecvd < 1) {
      MPI_Recv(data, 2*sizeof(StgWord), MPI_BYTE, 0, PP_FINISH,
	       MPI_COMM_WORLD, MPI_STATUS_IGNORE);
      MPI_Recv(&pingMessage, 1, MPI_INT, 0, PP_FINISH, sysComm,
	       MPI_STATUS_IGNORE);
      finishRecvd++;
    }
  }

  // TODO: receive or cancel all pending messages...
  /* ------------------------------------------------
   * q&d solution:
   * receive anything retrievable by MPI_Probe
   * then get in sync
   * then again receive remaining messages
   *
   * (since buffering is used, and buffers are detached to force
   * messages, a PE might get stuck detaching its mpiMsgBuffer, and
   * send another message as soon as buffer space is available again;
   * the second drain loop after the barrier catches such stragglers)
   * ---------------------------------------------- */
  {
    // allocate fresh buffer to avoid overflow
    void* voidbuffer;
    int voidsize;

    // we might come here because of requesting too much buffer (bug!)
    /* clamp the scratch buffer to INT_MAX bytes if DATASPACEWORDS
       words would overflow an int */
    voidsize = (INT_MAX / sizeof(StgWord) < DATASPACEWORDS)?
	INT_MAX : DATASPACEWORDS * sizeof(StgWord);

    voidbuffer = (void*) stgMallocBytes(voidsize, "voidBuffer");

    // receive whatever is out there...
    /* every data message may be paired with a sysComm ping (see send
       protocol above); system-coded tags pull in that ping as well */
    while (MP_probe()) {
      MPI_Recv(voidbuffer, voidsize, MPI_BYTE, MPI_ANY_SOURCE,
	       MPI_ANY_TAG, MPI_COMM_WORLD, &status);
      if (ISSYSCODE(status.MPI_TAG))
	MPI_Recv(voidbuffer, 1, MPI_INT, MPI_ANY_SOURCE,
		 MPI_ANY_TAG, sysComm, MPI_STATUS_IGNORE);
    }
    MPI_Barrier(MPI_COMM_WORLD);
    // all in sync (noone sends further messages), receive rest
    while (MP_probe()) {
      MPI_Recv(voidbuffer, voidsize, MPI_BYTE, MPI_ANY_SOURCE,
	       MPI_ANY_TAG, MPI_COMM_WORLD, &status);
      if (ISSYSCODE(status.MPI_TAG))
	MPI_Recv(voidbuffer, 1, MPI_INT, MPI_ANY_SOURCE,
		 MPI_ANY_TAG, sysComm, MPI_STATUS_IGNORE);
    }
    stgFree(voidbuffer);
  } // end of q&d

  IF_PAR_DEBUG(mpcomm, debugBelch("detaching MPI buffer\n"));
  stgFree(mpiMsgBuffer);

  IF_PAR_DEBUG(mpcomm, debugBelch("Goodbye\n"));
  MPI_Finalize();

  /* indicate that quit has been executed */
  nPEs = 0;

  return rtsTrue;
}
/** Perform communication according to the parallelization scheme
 *  described by the halo communicator.
 *
 *  For each of the hc->num halo regions, dispatch on the region's
 *  communication type:
 *   - HALO_LOCL:     both halves live on this node; plain local copy.
 *   - HALO_SENDRECV: exchange with two remote nodes in one call.
 *   - HALO_SEND:     send only; the local receive region is zeroed.
 *   - HALO_RECV:     receive only.
 *   - HALO_OPEN:     open boundary; receive region is zeroed.
 *
 *  @param hc   halo communicator describing the parallelization scheme
 *  @param base base address of the field data; per-region send/receive
 *              buffers are located at byte offsets s_offset/r_offset
 */
void halo_communication(HaloCommunicator *hc, void *base) {
  int n, comm_type, s_node, r_node;
  void *s_buffer, *r_buffer ;

  Fieldtype fieldtype;
  MPI_Datatype datatype;
  MPI_Request request;
  MPI_Status status;

    HALO_TRACE(fprintf(stderr, "%d: halo_comm base=%p num=%d\n", this_node, base, hc->num)) ;

    for (n = 0; n < hc->num; n++) {

      HALO_TRACE(fprintf(stderr, "%d: halo_comm round %d\n", this_node, n)) ;

      comm_type = hc->halo_info[n].type ;
      s_buffer = (char *)base + hc->halo_info[n].s_offset;
      r_buffer = (char *)base + hc->halo_info[n].r_offset;

      switch (comm_type) {

	  case HALO_LOCL:
	    fieldtype = hc->halo_info[n].fieldtype;
	    halo_dtcopy(r_buffer,s_buffer,1,fieldtype);
	    break ;

	  case HALO_SENDRECV:
	    datatype = hc->halo_info[n].datatype;
	    s_node = hc->halo_info[n].source_node ;
	    r_node = hc->halo_info[n].dest_node ;
	    HALO_TRACE(fprintf(stderr,"%d: halo_comm sendrecv %d to %d (%d) (%p)\n",this_node,s_node,r_node,REQ_HALO_SPREAD,&datatype));
	    MPI_Sendrecv(s_buffer, 1, datatype, r_node, REQ_HALO_SPREAD,
			 r_buffer, 1, datatype, s_node, REQ_HALO_SPREAD,
			 MPI_COMM_WORLD, &status);
	    break ;

	  case HALO_SEND:
	    datatype = hc->halo_info[n].datatype;
	    fieldtype = hc->halo_info[n].fieldtype;
	    s_node = hc->halo_info[n].source_node ;
	    r_node = hc->halo_info[n].dest_node ;
	    HALO_TRACE(fprintf(stderr,"%d: halo_comm send to %d.\n",this_node,r_node));
	    MPI_Isend(s_buffer, 1, datatype, r_node, REQ_HALO_SPREAD,
		      MPI_COMM_WORLD, &request);
	    /* nothing arrives here: clear the receive region while the
	       nonblocking send is in flight, then complete the send */
	    halo_dtset(r_buffer,0,fieldtype);
	    MPI_Wait(&request,&status);
	    break;

	  case HALO_RECV:
	    datatype = hc->halo_info[n].datatype;
	    s_node = hc->halo_info[n].source_node ;
	    r_node = hc->halo_info[n].dest_node ;
	    HALO_TRACE(fprintf(stderr,"%d: halo_comm recv from %d.\n",this_node,s_node));
	    MPI_Irecv(r_buffer, 1, datatype, s_node, REQ_HALO_SPREAD,
		      MPI_COMM_WORLD, &request);
	    MPI_Wait(&request,&status);
	    break;

	  case HALO_OPEN:
	    fieldtype = hc->halo_info[n].fieldtype;
	    HALO_TRACE(fprintf(stderr,"%d: halo_comm open boundaries\n",this_node));
	    /* \todo this does not work for the n_i - <n_i> */
	    halo_dtset(r_buffer,0,fieldtype);
	    break;

	  default:
	    /* unknown communication type: nothing to transfer.
	       (Explicit default added so the switch covers all inputs;
	       behavior for the known types is unchanged.) */
	    break;

      }

    }

}
/* R wrapper around MPI_Wait: blocks until the request selected by
 * R_request completes, storing its completion status in the slot
 * selected by R_status.  Both arguments are R integer vectors whose
 * first element indexes the file-scope request[]/status[] arrays.
 * Any MPI error is routed through spmd_errhandler().
 * Always returns R_NilValue. */
SEXP spmd_wait(SEXP R_request, SEXP R_status){
	int req_no = INTEGER(R_request)[0];
	int stat_no = INTEGER(R_status)[0];
	int mpi_ret = MPI_Wait(&request[req_no], &status[stat_no]);
	spmd_errhandler(mpi_ret);
	return(R_NilValue);
} /* End of spmd_wait(). */
int main(int argc, char ** argv) { int Num_procs; /* number of ranks */ int Num_procsx, Num_procsy; /* number of ranks in each coord direction */ int Num_groupsx, Num_groupsy; /* number of blocks in each coord direction */ int my_group; /* sequence number of shared memory block */ int my_group_IDx, my_group_IDy; /* coordinates of block within block grid */ int group_size; /* number of ranks in shared memory group */ int group_sizex, group_sizey; /* number of ranks in block in each coord direction */ int my_ID; /* MPI rank */ int my_global_IDx, my_global_IDy; /* coordinates of rank in overall rank grid */ int my_local_IDx, my_local_IDy; /* coordinates of rank within shared memory block */ int right_nbr; /* global rank of right neighboring tile */ int left_nbr; /* global rank of left neighboring tile */ int top_nbr; /* global rank of top neighboring tile */ int bottom_nbr; /* global rank of bottom neighboring tile */ int local_nbr[4]; /* list of synchronizing local neighbors */ int num_local_nbrs; /* number of synchronizing local neighbors */ int dummy; DTYPE *top_buf_out; /* communication buffer */ DTYPE *top_buf_in; /* " " */ DTYPE *bottom_buf_out; /* " " */ DTYPE *bottom_buf_in; /* " " */ DTYPE *right_buf_out; /* " " */ DTYPE *right_buf_in; /* " " */ DTYPE *left_buf_out; /* " " */ DTYPE *left_buf_in; /* " " */ int root = 0; long n, width, height;/* linear global and block grid dimension */ int width_rank, height_rank; /* linear local dimension */ int iter, leftover; /* dummies */ int istart_rank, iend_rank; /* bounds of grid tile assigned to calling rank */ int jstart_rank, jend_rank; /* bounds of grid tile assigned to calling rank */ int istart, iend; /* bounds of grid block containing tile */ int jstart, jend; /* bounds of grid block containing tile */ DTYPE norm, /* L1 norm of solution */ local_norm, /* contribution of calling rank to L1 norm */ reference_norm; /* value to be matched by computed norm */ DTYPE f_active_points; /* interior of grid with respect to 
stencil */ DTYPE flops; /* floating point ops per iteration */ int iterations; /* number of times to run the algorithm */ double local_stencil_time,/* timing parameters */ stencil_time, avgtime; int stencil_size; /* number of points in stencil */ DTYPE * RESTRICT in; /* input grid values */ DTYPE * RESTRICT out; /* output grid values */ long total_length_in; /* total required length to store input array */ long total_length_out;/* total required length to store output array */ int error=0; /* error flag */ DTYPE weight[2*RADIUS+1][2*RADIUS+1]; /* weights of points in the stencil */ MPI_Request request[8]; /* requests for sends & receives in 4 coord directions */ MPI_Win shm_win_in; /* shared memory window object for IN array */ MPI_Win shm_win_out; /* shared memory window object for OUT array */ MPI_Comm shm_comm_prep; /* preparatory shared memory communicator */ MPI_Comm shm_comm; /* Shared Memory Communicator */ int shm_procs; /* # of rankes in shared domain */ int shm_ID; /* MPI rank in shared memory domain */ MPI_Aint size_in; /* size of the IN array in shared memory window */ MPI_Aint size_out; /* size of the OUT array in shared memory window */ int size_mul; /* one for shm_comm root, zero for the other ranks */ int disp_unit; /* ignored */ /******************************************************************************* ** Initialize the MPI environment ********************************************************************************/ MPI_Init(&argc,&argv); MPI_Comm_rank(MPI_COMM_WORLD, &my_ID); MPI_Comm_size(MPI_COMM_WORLD, &Num_procs); /******************************************************************************* ** process, test, and broadcast input parameters ********************************************************************************/ if (my_ID == root) { printf("Parallel Research Kernels version %s\n", PRKVERSION); printf("MPI+SHM stencil execution on 2D grid\n"); #if !STAR printf("ERROR: Compact stencil not supported\n"); error = 1; goto 
ENDOFTESTS; #endif if (argc != 4){ printf("Usage: %s <#ranks per coherence domain><# iterations> <array dimension> \n", *argv); error = 1; goto ENDOFTESTS; } group_size = atoi(*++argv); if (group_size < 1) { printf("ERROR: # ranks per coherence domain must be >= 1 : %d \n",group_size); error = 1; goto ENDOFTESTS; } if (Num_procs%group_size) { printf("ERROR: total # %d ranks not divisible by ranks per coherence domain %d\n", Num_procs, group_size); error = 1; goto ENDOFTESTS; } iterations = atoi(*++argv); if (iterations < 0){ printf("ERROR: iterations must be >= 0 : %d \n",iterations); error = 1; goto ENDOFTESTS; } n = atol(*++argv); long nsquare = n * n; if (nsquare < Num_procs){ printf("ERROR: grid size must be at least # ranks: %ld\n", nsquare); error = 1; goto ENDOFTESTS; } if (RADIUS < 0) { printf("ERROR: Stencil radius %d should be non-negative\n", RADIUS); error = 1; goto ENDOFTESTS; } if (2*RADIUS +1 > n) { printf("ERROR: Stencil radius %d exceeds grid size %ld\n", RADIUS, n); error = 1; goto ENDOFTESTS; } ENDOFTESTS:; } bail_out(error); MPI_Bcast(&n, 1, MPI_LONG, root, MPI_COMM_WORLD); MPI_Bcast(&iterations, 1, MPI_INT, root, MPI_COMM_WORLD); MPI_Bcast(&group_size, 1, MPI_INT, root, MPI_COMM_WORLD); /* determine best way to create a 2D grid of ranks (closest to square, for best surface/volume ratio); we do this brute force for now. 
The decomposition needs to be such that shared memory groups can evenly tessellate the rank grid */ for (Num_procsx=(int) (sqrt(Num_procs+1)); Num_procsx>0; Num_procsx--) { if (!(Num_procs%Num_procsx)) { Num_procsy = Num_procs/Num_procsx; for (group_sizex=(int)(sqrt(group_size+1)); group_sizex>0; group_sizex--) { if (!(group_size%group_sizex) && !(Num_procsx%group_sizex)) { group_sizey=group_size/group_sizex; break; } } if (!(Num_procsy%group_sizey)) break; } } if (my_ID == root) { printf("Number of ranks = %d\n", Num_procs); printf("Grid size = %ld\n", n); printf("Radius of stencil = %d\n", RADIUS); printf("Tiles in x/y-direction = %d/%d\n", Num_procsx, Num_procsy); printf("Tiles per shared memory domain = %d\n", group_size); printf("Tiles in x/y-direction in group = %d/%d\n", group_sizex, group_sizey); printf("Type of stencil = star\n"); #if LOCAL_BARRIER_SYNCH printf("Local synchronization = barrier\n"); #else printf("Local synchronization = point to point\n"); #endif #if DOUBLE printf("Data type = double precision\n"); #else printf("Data type = single precision\n"); #endif #if LOOPGEN printf("Script used to expand stencil loop body\n"); #else printf("Compact representation of stencil loop body\n"); #endif printf("Number of iterations = %d\n", iterations); } /* Setup for Shared memory regions */ /* first divide WORLD in groups of size group_size */ MPI_Comm_split(MPI_COMM_WORLD, my_ID/group_size, my_ID%group_size, &shm_comm_prep); /* derive from that an SHM communicator */ MPI_Comm_split_type(shm_comm_prep, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shm_comm); MPI_Comm_rank(shm_comm, &shm_ID); MPI_Comm_size(shm_comm, &shm_procs); /* do sanity check, making sure groups did not shrink in second comm split */ if (shm_procs != group_size) MPI_Abort(MPI_COMM_WORLD, 666); Num_groupsx = Num_procsx/group_sizex; Num_groupsy = Num_procsy/group_sizey; my_group = my_ID/group_size; my_group_IDx = my_group%Num_groupsx; my_group_IDy = my_group/Num_groupsx; my_local_IDx = 
my_ID%group_sizex; my_local_IDy = (my_ID%group_size)/group_sizex; my_global_IDx = my_group_IDx*group_sizex+my_local_IDx; my_global_IDy = my_group_IDy*group_sizey+my_local_IDy; /* set all neighboring ranks to -1 (no communication with those ranks) */ left_nbr = right_nbr = top_nbr = bottom_nbr = -1; /* keep track of local neighbors for local synchronization */ num_local_nbrs = 0; if (my_local_IDx == group_sizex-1 && my_group_IDx != (Num_groupsx-1)) { right_nbr = (my_group+1)*group_size+shm_ID-group_sizex+1; } if (my_local_IDx != group_sizex-1) { local_nbr[num_local_nbrs++] = shm_ID + 1; } if (my_local_IDx == 0 && my_group_IDx != 0) { left_nbr = (my_group-1)*group_size+shm_ID+group_sizex-1; } if (my_local_IDx != 0) { local_nbr[num_local_nbrs++] = shm_ID - 1; } if (my_local_IDy == group_sizey-1 && my_group_IDy != (Num_groupsy-1)) { top_nbr = (my_group+Num_groupsx)*group_size + my_local_IDx; } if (my_local_IDy != group_sizey-1) { local_nbr[num_local_nbrs++] = shm_ID + group_sizex; } if (my_local_IDy == 0 && my_group_IDy != 0) { bottom_nbr = (my_group-Num_groupsx)*group_size + group_sizex*(group_sizey-1)+my_local_IDx; } if (my_local_IDy != 0) { local_nbr[num_local_nbrs++] = shm_ID - group_sizex; } /* compute amount of space required for input and solution arrays for the block, and also compute index sets */ width = n/Num_groupsx; leftover = n%Num_groupsx; if (my_group_IDx<leftover) { istart = (width+1) * my_group_IDx; iend = istart + width; } else { istart = (width+1) * leftover + width * (my_group_IDx-leftover); iend = istart + width - 1; } width = iend - istart + 1; if (width == 0) { printf("ERROR: rank %d has no work to do\n", my_ID); error = 1; } bail_out(error); height = n/Num_groupsy; leftover = n%Num_groupsy; if (my_group_IDy<leftover) { jstart = (height+1) * my_group_IDy; jend = jstart + height; } else { jstart = (height+1) * leftover + height * (my_group_IDy-leftover); jend = jstart + height - 1; } height = jend - jstart + 1; if (height == 0) { printf("ERROR: 
rank %d has no work to do\n", my_ID); error = 1; } bail_out(error); if (width < RADIUS || height < RADIUS) { printf("ERROR: rank %d has work tile smaller then stencil radius; w=%ld,h=%ld\n", my_ID, width, height); error = 1; } bail_out(error); total_length_in = (width+2*RADIUS)*(height+2*RADIUS)*sizeof(DTYPE); total_length_out = width*height*sizeof(DTYPE); /* only the root of each SHM domain specifies window of nonzero size */ size_mul = (shm_ID==0); size_in= total_length_in*size_mul; MPI_Win_allocate_shared(size_in, sizeof(double), MPI_INFO_NULL, shm_comm, (void *) &in, &shm_win_in); MPI_Win_lock_all(MPI_MODE_NOCHECK, shm_win_in); MPI_Win_shared_query(shm_win_in, MPI_PROC_NULL, &size_in, &disp_unit, (void *)&in); if (in == NULL){ printf("Error allocating space for input array by group %d\n",my_group); error = 1; } bail_out(error); size_out= total_length_out*size_mul; MPI_Win_allocate_shared(size_out, sizeof(double), MPI_INFO_NULL, shm_comm, (void *) &out, &shm_win_out); MPI_Win_lock_all(MPI_MODE_NOCHECK, shm_win_out); MPI_Win_shared_query(shm_win_out, MPI_PROC_NULL, &size_out, &disp_unit, (void *)&out); if (out == NULL){ printf("Error allocating space for output array by group %d\n", my_group); error = 1; } bail_out(error); /* determine index set assigned to each rank */ width_rank = width/group_sizex; leftover = width%group_sizex; if (my_local_IDx<leftover) { istart_rank = (width_rank+1) * my_local_IDx; iend_rank = istart_rank + width_rank; } else { istart_rank = (width_rank+1) * leftover + width_rank * (my_local_IDx-leftover); iend_rank = istart_rank + width_rank - 1; } istart_rank += istart; iend_rank += istart; width_rank = iend_rank - istart_rank + 1; height_rank = height/group_sizey; leftover = height%group_sizey; if (my_local_IDy<leftover) { jstart_rank = (height_rank+1) * my_local_IDy; jend_rank = jstart_rank + height_rank; } else { jstart_rank = (height_rank+1) * leftover + height_rank * (my_local_IDy-leftover); jend_rank = jstart_rank + height_rank - 1; 
} jstart_rank+=jstart; jend_rank+=jstart; height_rank = jend_rank - jstart_rank + 1; if (height_rank*width_rank==0) { error = 1; printf("Rank %d has no work to do\n", my_ID); } bail_out(error); /* allocate communication buffers for halo values */ top_buf_out = (DTYPE *) prk_malloc(4*sizeof(DTYPE)*RADIUS*width_rank); if (!top_buf_out) { printf("ERROR: Rank %d could not allocated comm buffers for y-direction\n", my_ID); error = 1; } bail_out(error); top_buf_in = top_buf_out + RADIUS*width_rank; bottom_buf_out = top_buf_out + 2*RADIUS*width_rank; bottom_buf_in = top_buf_out + 3*RADIUS*width_rank; right_buf_out = (DTYPE *) prk_malloc(4*sizeof(DTYPE)*RADIUS*height_rank); if (!right_buf_out) { printf("ERROR: Rank %d could not allocated comm buffers for x-direction\n", my_ID); error = 1; } bail_out(error); right_buf_in = right_buf_out + RADIUS*height_rank; left_buf_out = right_buf_out + 2*RADIUS*height_rank; left_buf_in = right_buf_out + 3*RADIUS*height_rank; /* fill the stencil weights to reflect a discrete divergence operator */ for (int jj=-RADIUS; jj<=RADIUS; jj++) for (int ii=-RADIUS; ii<=RADIUS; ii++) WEIGHT(ii,jj) = (DTYPE) 0.0; stencil_size = 4*RADIUS+1; for (int ii=1; ii<=RADIUS; ii++) { WEIGHT(0, ii) = WEIGHT( ii,0) = (DTYPE) (1.0/(2.0*ii*RADIUS)); WEIGHT(0,-ii) = WEIGHT(-ii,0) = -(DTYPE) (1.0/(2.0*ii*RADIUS)); } norm = (DTYPE) 0.0; f_active_points = (DTYPE) (n-2*RADIUS)*(DTYPE) (n-2*RADIUS); /* intialize the input and output arrays */ for (int j=jstart_rank; j<=jend_rank; j++) for (int i=istart_rank; i<=iend_rank; i++) { IN(i,j) = COEFX*i+COEFY*j; OUT(i,j) = (DTYPE)0.0; } /* LOAD/STORE FENCE */ MPI_Win_sync(shm_win_in); MPI_Win_sync(shm_win_out); MPI_Barrier(shm_comm); for (iter = 0; iter<=iterations; iter++){ /* start timer after a warmup iteration */ if (iter == 1) { MPI_Barrier(MPI_COMM_WORLD); local_stencil_time = wtime(); } /* need to fetch ghost point data from neighbors in y-direction */ if (top_nbr != -1) { MPI_Irecv(top_buf_in, RADIUS*width_rank, 
MPI_DTYPE, top_nbr, 101, MPI_COMM_WORLD, &(request[1])); for (int kk=0,j=jend_rank-RADIUS+1; j<=jend_rank; j++) for (int i=istart_rank; i<=iend_rank; i++) { top_buf_out[kk++]= IN(i,j); } MPI_Isend(top_buf_out, RADIUS*width_rank,MPI_DTYPE, top_nbr, 99, MPI_COMM_WORLD, &(request[0])); } if (bottom_nbr != -1) { MPI_Irecv(bottom_buf_in,RADIUS*width_rank, MPI_DTYPE, bottom_nbr, 99, MPI_COMM_WORLD, &(request[3])); for (int kk=0,j=jstart_rank; j<=jstart_rank+RADIUS-1; j++) for (int i=istart_rank; i<=iend_rank; i++) { bottom_buf_out[kk++]= IN(i,j); } MPI_Isend(bottom_buf_out, RADIUS*width_rank,MPI_DTYPE, bottom_nbr, 101, MPI_COMM_WORLD, &(request[2])); } if (top_nbr != -1) { MPI_Wait(&(request[0]), MPI_STATUS_IGNORE); MPI_Wait(&(request[1]), MPI_STATUS_IGNORE); for (int kk=0,j=jend_rank+1; j<=jend_rank+RADIUS; j++) for (int i=istart_rank; i<=iend_rank; i++) { IN(i,j) = top_buf_in[kk++]; } } if (bottom_nbr != -1) { MPI_Wait(&(request[2]), MPI_STATUS_IGNORE); MPI_Wait(&(request[3]), MPI_STATUS_IGNORE); for (int kk=0,j=jstart_rank-RADIUS; j<=jstart_rank-1; j++) for (int i=istart_rank; i<=iend_rank; i++) { IN(i,j) = bottom_buf_in[kk++]; } } /* LOAD/STORE FENCE */ MPI_Win_sync(shm_win_in); /* need to fetch ghost point data from neighbors in x-direction */ if (right_nbr != -1) { MPI_Irecv(right_buf_in, RADIUS*height_rank, MPI_DTYPE, right_nbr, 1010, MPI_COMM_WORLD, &(request[1+4])); for (int kk=0,j=jstart_rank; j<=jend_rank; j++) for (int i=iend_rank-RADIUS+1; i<=iend_rank; i++) { right_buf_out[kk++]= IN(i,j); } MPI_Isend(right_buf_out, RADIUS*height_rank, MPI_DTYPE, right_nbr, 990, MPI_COMM_WORLD, &(request[0+4])); } if (left_nbr != -1) { MPI_Irecv(left_buf_in, RADIUS*height_rank, MPI_DTYPE, left_nbr, 990, MPI_COMM_WORLD, &(request[3+4])); for (int kk=0,j=jstart_rank; j<=jend_rank; j++) for (int i=istart_rank; i<=istart_rank+RADIUS-1; i++) { left_buf_out[kk++]= IN(i,j); } MPI_Isend(left_buf_out, RADIUS*height_rank, MPI_DTYPE, left_nbr, 1010, MPI_COMM_WORLD, &(request[2+4])); } 
if (right_nbr != -1) { MPI_Wait(&(request[0+4]), MPI_STATUS_IGNORE); MPI_Wait(&(request[1+4]), MPI_STATUS_IGNORE); for (int kk=0,j=jstart_rank; j<=jend_rank; j++) for (int i=iend_rank+1; i<=iend_rank+RADIUS; i++) { IN(i,j) = right_buf_in[kk++]; } } if (left_nbr != -1) { MPI_Wait(&(request[2+4]), MPI_STATUS_IGNORE); MPI_Wait(&(request[3+4]), MPI_STATUS_IGNORE); for (int kk=0,j=jstart_rank; j<=jend_rank; j++) for (int i=istart_rank-RADIUS; i<=istart_rank-1; i++) { IN(i,j) = left_buf_in[kk++]; } } /* LOAD/STORE FENCE */ MPI_Win_sync(shm_win_in); /* Apply the stencil operator */ for (int j=MAX(jstart_rank,RADIUS); j<=MIN(n-RADIUS-1,jend_rank); j++) { for (int i=MAX(istart_rank,RADIUS); i<=MIN(n-RADIUS-1,iend_rank); i++) { #if LOOPGEN #include "loop_body_star.incl" #else for (int jj=-RADIUS; jj<=RADIUS; jj++) OUT(i,j) += WEIGHT(0,jj)*IN(i,j+jj); for (int ii=-RADIUS; ii<0; ii++) OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j); for (int ii=1; ii<=RADIUS; ii++) OUT(i,j) += WEIGHT(ii,0)*IN(i+ii,j); #endif } } /* LOAD/STORE FENCE */ MPI_Win_sync(shm_win_out); #if LOCAL_BARRIER_SYNCH MPI_Barrier(shm_comm); // needed to avoid writing IN while other ranks are reading it #else for (int i=0; i<num_local_nbrs; i++) { MPI_Irecv(&dummy, 0, MPI_INT, local_nbr[i], 666, shm_comm, &(request[i])); MPI_Send(&dummy, 0, MPI_INT, local_nbr[i], 666, shm_comm); } MPI_Waitall(num_local_nbrs, request, MPI_STATUSES_IGNORE); #endif /* add constant to solution to force refresh of neighbor data, if any */ for (int j=jstart_rank; j<=jend_rank; j++) for (int i=istart_rank; i<=iend_rank; i++) IN(i,j)+= 1.0; /* LOAD/STORE FENCE */ MPI_Win_sync(shm_win_in); #if LOCAL_BARRIER_SYNCH MPI_Barrier(shm_comm); // needed to avoid reading IN while other ranks are writing it #else for (int i=0; i<num_local_nbrs; i++) { MPI_Irecv(&dummy, 0, MPI_INT, local_nbr[i], 666, shm_comm, &(request[i])); MPI_Send(&dummy, 0, MPI_INT, local_nbr[i], 666, shm_comm); } MPI_Waitall(num_local_nbrs, request, MPI_STATUSES_IGNORE); #endif } /* end 
of iterations */ local_stencil_time = wtime() - local_stencil_time; MPI_Reduce(&local_stencil_time, &stencil_time, 1, MPI_DOUBLE, MPI_MAX, root, MPI_COMM_WORLD); /* compute L1 norm in parallel */ local_norm = (DTYPE) 0.0; for (int j=MAX(jstart_rank,RADIUS); j<=MIN(n-RADIUS-1,jend_rank); j++) { for (int i=MAX(istart_rank,RADIUS); i<=MIN(n-RADIUS-1,iend_rank); i++) { local_norm += (DTYPE)ABS(OUT(i,j)); } } MPI_Reduce(&local_norm, &norm, 1, MPI_DTYPE, MPI_SUM, root, MPI_COMM_WORLD); /******************************************************************************* ** Analyze and output results. ********************************************************************************/ /* verify correctness */ if (my_ID == root) { norm /= f_active_points; if (RADIUS > 0) { reference_norm = (DTYPE) (iterations+1) * (COEFX + COEFY); } else { reference_norm = (DTYPE) 0.0; } if (ABS(norm-reference_norm) > EPSILON) { printf("ERROR: L1 norm = "FSTR", Reference L1 norm = "FSTR"\n", norm, reference_norm); error = 1; } else { printf("Solution validates\n"); #if VERBOSE printf("Reference L1 norm = "FSTR", L1 norm = "FSTR"\n", reference_norm, norm); #endif } } bail_out(error); MPI_Win_unlock_all(shm_win_in); MPI_Win_unlock_all(shm_win_out); MPI_Win_free(&shm_win_in); MPI_Win_free(&shm_win_out); if (my_ID == root) { /* flops/stencil: 2 flops (fma) for each point in the stencil, plus one flop for the update of the input of the array */ flops = (DTYPE) (2*stencil_size+1) * f_active_points; avgtime = stencil_time/iterations; printf("Rate (MFlops/s): "FSTR" Avg time (s): %lf\n", 1.0E-06 * flops/avgtime, avgtime); } MPI_Finalize(); exit(EXIT_SUCCESS); }
/*
  VecLoad_Binary - Loads a vector from a PETSc binary viewer.

  Collective on the viewer's communicator.

  Input Parameters:
+ vec    - the vector to fill; sizes/blocksize are set from the file header
           (and the optional -vecload_block_size option) if not already set
- viewer - binary viewer positioned at a vector header

  Notes:
  Rank 0 reads the file and scatters each rank's chunk with point-to-point
  sends; non-root ranks post a single matching receive. If the viewer was
  configured for MPI-IO (and PETSc was built with it), the MPI-IO path is
  taken instead and this routine returns early.

  Returns: 0 on success, or a PETSc error code (propagated via CHKERRQ).
*/
PetscErrorCode VecLoad_Binary(Vec vec, PetscViewer viewer)
{
  PetscMPIInt    size,rank,tag;
  int            fd;
  PetscInt       i,rows = 0,n,*range,N,bs;
  PetscErrorCode ierr;
  PetscBool      flag;
  PetscScalar    *avec,*avecwork;
  MPI_Comm       comm;
  MPI_Request    request;
  MPI_Status     status;
#if defined(PETSC_HAVE_MPIIO)
  PetscBool      useMPIIO;
#endif

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)viewer,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);

  ierr = PetscViewerBinaryGetDescriptor(viewer,&fd);CHKERRQ(ierr);
  ierr = PetscViewerBinaryReadVecHeader_Private(viewer,&rows);CHKERRQ(ierr);
  /* Set Vec sizes, blocksize, and type if not already set. Block size first
     so that local sizes will be compatible. */
  ierr = PetscOptionsGetInt(((PetscObject)vec)->prefix, "-vecload_block_size", &bs, &flag);CHKERRQ(ierr);
  if (flag) {
    ierr = VecSetBlockSize(vec, bs);CHKERRQ(ierr);
  }
  if (vec->map->n < 0 && vec->map->N < 0) {
    ierr = VecSetSizes(vec,PETSC_DECIDE,rows);CHKERRQ(ierr);
  }

  /* If sizes and type already set, check that the vector global size matches
     the size recorded in the file. */
  ierr = VecGetSize(vec, &N);CHKERRQ(ierr);
  if (N != rows) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_FILE_UNEXPECTED, "Vector in file different length (%d) than input vector (%d)", rows, N);

#if defined(PETSC_HAVE_MPIIO)
  ierr = PetscViewerBinaryGetMPIIO(viewer,&useMPIIO);CHKERRQ(ierr);
  if (useMPIIO) {
    ierr = VecLoad_Binary_MPIIO(vec, viewer);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
#endif

  ierr = VecGetLocalSize(vec,&n);CHKERRQ(ierr);
  ierr = PetscObjectGetNewTag((PetscObject)viewer,&tag);CHKERRQ(ierr);
  ierr = VecGetArray(vec,&avec);CHKERRQ(ierr);
  if (!rank) {
    /* Root reads its own chunk directly into the vector's array. */
    ierr = PetscBinaryRead(fd,avec,n,PETSC_SCALAR);CHKERRQ(ierr);

    if (size > 1) {
      /* read in other chunks and send to other processors */
      /* determine maximum chunk owned by any other rank, so one work
         buffer can be reused for every send */
      range = vec->map->range;
      n     = 1;
      for (i=1; i<size; i++) n = PetscMax(n,range[i+1] - range[i]);

      ierr = PetscMalloc(n*sizeof(PetscScalar),&avecwork);CHKERRQ(ierr);
      for (i=1; i<size; i++) {
        n    = range[i+1] - range[i];
        ierr = PetscBinaryRead(fd,avecwork,n,PETSC_SCALAR);CHKERRQ(ierr);
        /* Isend+Wait keeps the single work buffer valid until delivery. */
        ierr = MPI_Isend(avecwork,n,MPIU_SCALAR,i,tag,comm,&request);CHKERRQ(ierr);
        ierr = MPI_Wait(&request,&status);CHKERRQ(ierr);
      }
      ierr = PetscFree(avecwork);CHKERRQ(ierr);
    }
  } else {
    /* Non-root ranks receive their chunk straight into the vector array. */
    ierr = MPI_Recv(avec,n,MPIU_SCALAR,0,tag,comm,&status);CHKERRQ(ierr);
  }
  ierr = VecRestoreArray(vec,&avec);CHKERRQ(ierr);
  ierr = VecAssemblyBegin(vec);CHKERRQ(ierr);
  ierr = VecAssemblyEnd(vec);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
void full_verify( void ) { MPI_Status status; MPI_Request request; INT_TYPE i, j; INT_TYPE k, last_local_key; TIMER_START( T_VERIFY ); /* Now, finally, sort the keys: */ for( i=0; i<total_local_keys; i++ ) key_array[--key_buff_ptr_global[key_buff2[i]]- total_lesser_keys] = key_buff2[i]; last_local_key = (total_local_keys<1)? 0 : (total_local_keys-1); /* Send largest key value to next processor */ if( my_rank > 0 ) MPI_Irecv( &k, 1, MP_KEY_TYPE, my_rank-1, 1000, MPI_COMM_WORLD, &request ); if( my_rank < comm_size-1 ) MPI_Send( &key_array[last_local_key], 1, MP_KEY_TYPE, my_rank+1, 1000, MPI_COMM_WORLD ); if( my_rank > 0 ) MPI_Wait( &request, &status ); /* Confirm that neighbor's greatest key value is not greater than my least key value */ j = 0; if( my_rank > 0 && total_local_keys > 0 ) if( k > key_array[0] ) j++; /* Confirm keys correctly sorted: count incorrectly sorted keys, if any */ for( i=1; i<total_local_keys; i++ ) if( key_array[i-1] > key_array[i] ) j++; if( j != 0 ) { printf( "Processor %d: Full_verify: number of keys out of sort: %d\n", my_rank, j ); } else passed_verification++; TIMER_STOP( T_VERIFY ); }
/* Thin shim around MPI_Wait: blocks until *request completes, filling
   *status, and coerces the MPI return code to hypre's integer type. */
HYPRE_Int
hypre_MPI_Wait( hypre_MPI_Request *request,
                hypre_MPI_Status  *status )
{
   int mpi_rc = MPI_Wait(request, status);

   return (HYPRE_Int) mpi_rc;
}
int_t pdgstrf /************************************************************************/ ( superlu_options_t *options, int m, int n, double anorm, LUstruct_t *LUstruct, gridinfo_t *grid, SuperLUStat_t *stat, int *info ) /* * Purpose * ======= * * PDGSTRF performs the LU factorization in parallel. * * Arguments * ========= * * options (input) superlu_options_t* * The structure defines the input parameters to control * how the LU decomposition will be performed. * The following field should be defined: * o ReplaceTinyPivot (yes_no_t) * Specifies whether to replace the tiny diagonals by * sqrt(epsilon)*norm(A) during LU factorization. * * m (input) int * Number of rows in the matrix. * * n (input) int * Number of columns in the matrix. * * anorm (input) double * The norm of the original matrix A, or the scaled A if * equilibration was done. * * LUstruct (input/output) LUstruct_t* * The data structures to store the distributed L and U factors. * The following fields should be defined: * * o Glu_persist (input) Glu_persist_t* * Global data structure (xsup, supno) replicated on all processes, * describing the supernode partition in the factored matrices * L and U: * xsup[s] is the leading column of the s-th supernode, * supno[i] is the supernode number to which column i belongs. * * o Llu (input/output) LocalLU_t* * The distributed data structures to store L and U factors. * See superlu_ddefs.h for the definition of 'LocalLU_t'. * * grid (input) gridinfo_t* * The 2D process mesh. It contains the MPI communicator, the number * of process rows (NPROW), the number of process columns (NPCOL), * and my process rank. It is an input argument to all the * parallel routines. * Grid can be initialized by subroutine SUPERLU_GRIDINIT. * See superlu_ddefs.h for the definition of 'gridinfo_t'. * * stat (output) SuperLUStat_t* * Record the statistics on runtime and floating-point operation count. * See util.h for the definition of 'SuperLUStat_t'. 
* * info (output) int* * = 0: successful exit * < 0: if info = -i, the i-th argument had an illegal value * > 0: if info = i, U(i,i) is exactly zero. The factorization has * been completed, but the factor U is exactly singular, * and division by zero will occur if it is used to solve a * system of equations. * */ { #ifdef _CRAY _fcd ftcs = _cptofcd("N", strlen("N")); _fcd ftcs1 = _cptofcd("L", strlen("L")); _fcd ftcs2 = _cptofcd("N", strlen("N")); _fcd ftcs3 = _cptofcd("U", strlen("U")); #endif double alpha = 1.0, beta = 0.0; int_t *xsup; int_t *lsub, *lsub1, *usub, *Usub_buf, *Lsub_buf_2[2]; /* Need 2 buffers to implement Irecv. */ double *lusup, *lusup1, *uval, *Uval_buf, *Lval_buf_2[2]; /* Need 2 buffers to implement Irecv. */ int_t fnz, i, ib, ijb, ilst, it, iukp, jb, jj, klst, knsupc, lb, lib, ldv, ljb, lptr, lptr0, lptrj, luptr, luptr0, luptrj, nlb, nub, nsupc, rel, rukp; int_t Pc, Pr; int iam, kcol, krow, mycol, myrow, pi, pj; int j, k, lk, nsupers; int nsupr, nbrow, segsize; int msgcnt[4]; /* Count the size of the message xfer'd in each buffer: * 0 : transferred in Lsub_buf[] * 1 : transferred in Lval_buf[] * 2 : transferred in Usub_buf[] * 3 : transferred in Uval_buf[] */ int_t msg0, msg2; int_t **Ufstnz_br_ptr, **Lrowind_bc_ptr; double **Unzval_br_ptr, **Lnzval_bc_ptr; int_t *index; double *nzval; int_t *iuip, *ruip;/* Pointers to U index/nzval; size ceil(NSUPERS/Pr). 
*/ double *ucol; int_t *indirect; double *tempv, *tempv2d; int_t iinfo; int_t *ToRecv, *ToSendD, **ToSendR; Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; superlu_scope_t *scp; float s_eps; double thresh; double *tempU2d, *tempu; int full, ldt, ldu, lead_zero, ncols; MPI_Request recv_req[4], *send_req; MPI_Status status; #if ( DEBUGlevel>=2 ) int_t num_copy=0, num_update=0; #endif #if ( PRNTlevel==3 ) int_t zero_msg = 0, total_msg = 0; #endif #if ( PROFlevel>=1 ) double t1, t2; float msg_vol = 0, msg_cnt = 0; int_t iword = sizeof(int_t), dword = sizeof(double); #endif /* Test the input parameters. */ *info = 0; if ( m < 0 ) *info = -2; else if ( n < 0 ) *info = -3; if ( *info ) { pxerbla("pdgstrf", grid, -*info); return (-1); } /* Quick return if possible. */ if ( m == 0 || n == 0 ) return 0; /* * Initialization. */ iam = grid->iam; Pc = grid->npcol; Pr = grid->nprow; myrow = MYROW( iam, grid ); mycol = MYCOL( iam, grid ); nsupers = Glu_persist->supno[n-1] + 1; xsup = Glu_persist->xsup; s_eps = slamch_("Epsilon"); thresh = s_eps * anorm; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter pdgstrf()"); #endif stat->ops[FACT] = 0.0; if ( Pr*Pc > 1 ) { i = Llu->bufmax[0]; if ( !(Llu->Lsub_buf_2[0] = intMalloc_dist(2 * ((size_t)i))) ) ABORT("Malloc fails for Lsub_buf."); Llu->Lsub_buf_2[1] = Llu->Lsub_buf_2[0] + i; i = Llu->bufmax[1]; if ( !(Llu->Lval_buf_2[0] = doubleMalloc_dist(2 * ((size_t)i))) ) ABORT("Malloc fails for Lval_buf[]."); Llu->Lval_buf_2[1] = Llu->Lval_buf_2[0] + i; if ( Llu->bufmax[2] != 0 ) if ( !(Llu->Usub_buf = intMalloc_dist(Llu->bufmax[2])) ) ABORT("Malloc fails for Usub_buf[]."); if ( Llu->bufmax[3] != 0 ) if ( !(Llu->Uval_buf = doubleMalloc_dist(Llu->bufmax[3])) ) ABORT("Malloc fails for Uval_buf[]."); if ( !(send_req = (MPI_Request *) SUPERLU_MALLOC(2*Pc*sizeof(MPI_Request)))) ABORT("Malloc fails for send_req[]."); } if ( !(Llu->ujrow = doubleMalloc_dist(sp_ienv_dist(3))) ) ABORT("Malloc fails for ujrow[]."); #if 
( PRNTlevel>=1 ) if ( !iam ) { printf(".. thresh = s_eps %e * anorm %e = %e\n", s_eps, anorm, thresh); printf(".. Buffer size: Lsub %d\tLval %d\tUsub %d\tUval %d\tLDA %d\n", Llu->bufmax[0], Llu->bufmax[1], Llu->bufmax[2], Llu->bufmax[3], Llu->bufmax[4]); } #endif Lsub_buf_2[0] = Llu->Lsub_buf_2[0]; Lsub_buf_2[1] = Llu->Lsub_buf_2[1]; Lval_buf_2[0] = Llu->Lval_buf_2[0]; Lval_buf_2[1] = Llu->Lval_buf_2[1]; Usub_buf = Llu->Usub_buf; Uval_buf = Llu->Uval_buf; Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; Unzval_br_ptr = Llu->Unzval_br_ptr; ToRecv = Llu->ToRecv; ToSendD = Llu->ToSendD; ToSendR = Llu->ToSendR; ldt = sp_ienv_dist(3); /* Size of maximum supernode */ if ( !(tempv2d = doubleCalloc_dist(2*((size_t)ldt)*ldt)) ) ABORT("Calloc fails for tempv2d[]."); tempU2d = tempv2d + ldt*ldt; if ( !(indirect = intMalloc_dist(ldt)) ) ABORT("Malloc fails for indirect[]."); k = CEILING( nsupers, Pr ); /* Number of local block rows */ if ( !(iuip = intMalloc_dist(k)) ) ABORT("Malloc fails for iuip[]."); if ( !(ruip = intMalloc_dist(k)) ) ABORT("Malloc fails for ruip[]."); #if ( VAMPIR>=1 ) VT_symdef(1, "Send-L", "Comm"); VT_symdef(2, "Recv-L", "Comm"); VT_symdef(3, "Send-U", "Comm"); VT_symdef(4, "Recv-U", "Comm"); VT_symdef(5, "TRF2", "Factor"); VT_symdef(100, "Factor", "Factor"); VT_begin(100); VT_traceon(); #endif /* --------------------------------------------------------------- Handle the first block column separately to start the pipeline. --------------------------------------------------------------- */ if ( mycol == 0 ) { #if ( VAMPIR>=1 ) VT_begin(5); #endif pdgstrf2(options, 0, thresh, Glu_persist, grid, Llu, stat, info); #if ( VAMPIR>=1 ) VT_end(5); #endif scp = &grid->rscp; /* The scope of process row. */ /* Process column *kcol* multicasts numeric values of L(:,k) to process rows. 
*/ lsub = Lrowind_bc_ptr[0]; lusup = Lnzval_bc_ptr[0]; if ( lsub ) { msgcnt[0] = lsub[1] + BC_HEADER + lsub[0]*LB_DESCRIPTOR; msgcnt[1] = lsub[1] * SuperSize( 0 ); } else { msgcnt[0] = msgcnt[1] = 0; } for (pj = 0; pj < Pc; ++pj) { if ( ToSendR[0][pj] != EMPTY ) { #if ( PROFlevel>=1 ) TIC(t1); #endif #if ( VAMPIR>=1 ) VT_begin(1); #endif MPI_Isend( lsub, msgcnt[0], mpi_int_t, pj, 0, scp->comm, &send_req[pj] ); MPI_Isend( lusup, msgcnt[1], MPI_DOUBLE, pj, 1, scp->comm, &send_req[pj+Pc] ); #if ( DEBUGlevel>=2 ) printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n", iam, 0, msgcnt[0], msgcnt[1], pj); #endif #if ( VAMPIR>=1 ) VT_end(1); #endif #if ( PROFlevel>=1 ) TOC(t2, t1); stat->utime[COMM] += t2; msg_cnt += 2; msg_vol += msgcnt[0]*iword + msgcnt[1]*dword; #endif } } /* for pj ... */ } else { /* Post immediate receives. */ if ( ToRecv[0] >= 1 ) { /* Recv block column L(:,0). */ scp = &grid->rscp; /* The scope of process row. */ MPI_Irecv( Lsub_buf_2[0], Llu->bufmax[0], mpi_int_t, 0, 0, scp->comm, &recv_req[0] ); MPI_Irecv( Lval_buf_2[0], Llu->bufmax[1], MPI_DOUBLE, 0, 1, scp->comm, &recv_req[1] ); #if ( DEBUGlevel>=2 ) printf("(%d) Post Irecv L(:,%4d)\n", iam, 0); #endif } } /* if mycol == 0 */ /* ------------------------------------------ MAIN LOOP: Loop through all block columns. ------------------------------------------ */ for (k = 0; k < nsupers; ++k) { knsupc = SuperSize( k ); krow = PROW( k, grid ); kcol = PCOL( k, grid ); if ( mycol == kcol ) { lk = LBj( k, grid ); /* Local block number. */ for (pj = 0; pj < Pc; ++pj) { /* Wait for Isend to complete before using lsub/lusup. */ if ( ToSendR[lk][pj] != EMPTY ) { MPI_Wait( &send_req[pj], &status ); MPI_Wait( &send_req[pj+Pc], &status ); } } lsub = Lrowind_bc_ptr[lk]; lusup = Lnzval_bc_ptr[lk]; } else { if ( ToRecv[k] >= 1 ) { /* Recv block column L(:,k). */ scp = &grid->rscp; /* The scope of process row. 
*/ #if ( PROFlevel>=1 ) TIC(t1); #endif #if ( VAMPIR>=1 ) VT_begin(2); #endif /*probe_recv(iam, kcol, (4*k)%NTAGS, mpi_int_t, scp->comm, Llu->bufmax[0]);*/ /*MPI_Recv( Lsub_buf, Llu->bufmax[0], mpi_int_t, kcol, (4*k)%NTAGS, scp->comm, &status );*/ MPI_Wait( &recv_req[0], &status ); MPI_Get_count( &status, mpi_int_t, &msgcnt[0] ); /*probe_recv(iam, kcol, (4*k+1)%NTAGS, MPI_DOUBLE, scp->comm, Llu->bufmax[1]);*/ /*MPI_Recv( Lval_buf, Llu->bufmax[1], MPI_DOUBLE, kcol, (4*k+1)%NTAGS, scp->comm, &status );*/ MPI_Wait( &recv_req[1], &status ); MPI_Get_count( &status, MPI_DOUBLE, &msgcnt[1] ); #if ( VAMPIR>=1 ) VT_end(2); #endif #if ( PROFlevel>=1 ) TOC(t2, t1); stat->utime[COMM] += t2; #endif #if ( DEBUGlevel>=2 ) printf("(%d) Recv L(:,%4d): lsub %4d, lusup %4d from Pc %2d\n", iam, k, msgcnt[0], msgcnt[1], kcol); fflush(stdout); #endif lsub = Lsub_buf_2[k%2]; lusup = Lval_buf_2[k%2]; #if ( PRNTlevel==3 ) ++total_msg; if ( !msgcnt[0] ) ++zero_msg; #endif } else msgcnt[0] = 0; } /* if mycol = Pc(k) */ scp = &grid->cscp; /* The scope of process column. */ if ( myrow == krow ) { /* Parallel triangular solve across process row *krow* -- U(k,j) = L(k,k) \ A(k,j). */ #ifdef _CRAY pdgstrs2(n, k, Glu_persist, grid, Llu, stat, ftcs1, ftcs2, ftcs3); #else pdgstrs2(n, k, Glu_persist, grid, Llu, stat); #endif /* Multicasts U(k,:) to process columns. 
*/ lk = LBi( k, grid ); usub = Ufstnz_br_ptr[lk]; uval = Unzval_br_ptr[lk]; if ( usub ) { msgcnt[2] = usub[2]; msgcnt[3] = usub[1]; } else { msgcnt[2] = msgcnt[3] = 0; } if ( ToSendD[lk] == YES ) { for (pi = 0; pi < Pr; ++pi) { if ( pi != myrow ) { #if ( PROFlevel>=1 ) TIC(t1); #endif #if ( VAMPIR>=1 ) VT_begin(3); #endif MPI_Send( usub, msgcnt[2], mpi_int_t, pi, (4*k+2)%NTAGS, scp->comm); MPI_Send( uval, msgcnt[3], MPI_DOUBLE, pi, (4*k+3)%NTAGS, scp->comm); #if ( VAMPIR>=1 ) VT_end(3); #endif #if ( PROFlevel>=1 ) TOC(t2, t1); stat->utime[COMM] += t2; msg_cnt += 2; msg_vol += msgcnt[2]*iword + msgcnt[3]*dword; #endif #if ( DEBUGlevel>=2 ) printf("(%d) Send U(%4d,:) to Pr %2d\n", iam, k, pi); #endif } /* if pi ... */ } /* for pi ... */ } /* if ToSendD ... */ } else { /* myrow != krow */ if ( ToRecv[k] == 2 ) { /* Recv block row U(k,:). */ #if ( PROFlevel>=1 ) TIC(t1); #endif #if ( VAMPIR>=1 ) VT_begin(4); #endif /*probe_recv(iam, krow, (4*k+2)%NTAGS, mpi_int_t, scp->comm, Llu->bufmax[2]);*/ MPI_Recv( Usub_buf, Llu->bufmax[2], mpi_int_t, krow, (4*k+2)%NTAGS, scp->comm, &status ); MPI_Get_count( &status, mpi_int_t, &msgcnt[2] ); /*probe_recv(iam, krow, (4*k+3)%NTAGS, MPI_DOUBLE, scp->comm, Llu->bufmax[3]);*/ MPI_Recv( Uval_buf, Llu->bufmax[3], MPI_DOUBLE, krow, (4*k+3)%NTAGS, scp->comm, &status ); MPI_Get_count( &status, MPI_DOUBLE, &msgcnt[3] ); #if ( VAMPIR>=1 ) VT_end(4); #endif #if ( PROFlevel>=1 ) TOC(t2, t1); stat->utime[COMM] += t2; #endif usub = Usub_buf; uval = Uval_buf; #if ( DEBUGlevel>=2 ) printf("(%d) Recv U(%4d,:) from Pr %2d\n", iam, k, krow); #endif #if ( PRNTlevel==3 ) ++total_msg; if ( !msgcnt[2] ) ++zero_msg; #endif } else msgcnt[2] = 0; } /* if myrow == Pr(k) */ /* * Parallel rank-k update; pair up blocks L(i,k) and U(k,j). 
* for (j = k+1; k < N; ++k) { * for (i = k+1; i < N; ++i) * if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid ) * && L(i,k) != 0 && U(k,j) != 0 ) * A(i,j) = A(i,j) - L(i,k) * U(k,j); */ msg0 = msgcnt[0]; msg2 = msgcnt[2]; if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */ nsupr = lsub[1]; /* LDA of lusup. */ if ( myrow == krow ) { /* Skip diagonal block L(k,k). */ lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER+1]; luptr0 = knsupc; nlb = lsub[0] - 1; } else { lptr0 = BC_HEADER; luptr0 = 0; nlb = lsub[0]; } lptr = lptr0; for (lb = 0; lb < nlb; ++lb) { /* Initialize block row pointers. */ ib = lsub[lptr]; lib = LBi( ib, grid ); iuip[lib] = BR_HEADER; ruip[lib] = 0; lptr += LB_DESCRIPTOR + lsub[lptr+1]; } nub = usub[0]; /* Number of blocks in the block row U(k,:) */ iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */ rukp = 0; /* Pointer to nzval[] of U(k,:) */ klst = FstBlockC( k+1 ); /* --------------------------------------------------- Update the first block column A(:,k+1). --------------------------------------------------- */ jb = usub[iukp]; /* Global block number of block U(k,j). */ if ( jb == k+1 ) { /* First update (k+1)-th block. */ --nub; lptr = lptr0; luptr = luptr0; ljb = LBj( jb, grid ); /* Local block number of U(k,j). */ nsupc = SuperSize( jb ); iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */ /* Prepare to call DGEMM. */ jj = iukp; while ( usub[jj] == klst ) ++jj; ldu = klst - usub[jj++]; ncols = 1; full = 1; for (; jj < iukp+nsupc; ++jj) { segsize = klst - usub[jj]; if ( segsize ) { ++ncols; if ( segsize != ldu ) full = 0; if ( segsize > ldu ) ldu = segsize; } } #if ( DEBUGlevel>=3 ) ++num_update; #endif if ( full ) { tempu = &uval[rukp]; } else { /* Copy block U(k,j) into tempU2d. 
*/ #if ( DEBUGlevel>=3 ) printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n", iam, full, k, jb, ldu, ncols, nsupc); ++num_copy; #endif tempu = tempU2d; for (jj = iukp; jj < iukp+nsupc; ++jj) { segsize = klst - usub[jj]; if ( segsize ) { lead_zero = ldu - segsize; for (i = 0; i < lead_zero; ++i) tempu[i] = 0.0; tempu += lead_zero; for (i = 0; i < segsize; ++i) tempu[i] = uval[rukp+i]; rukp += segsize; tempu += segsize; } } tempu = tempU2d; rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */ } /* if full ... */ for (lb = 0; lb < nlb; ++lb) { ib = lsub[lptr]; /* Row block L(i,k). */ nbrow = lsub[lptr+1]; /* Number of full rows. */ lptr += LB_DESCRIPTOR; /* Skip descriptor. */ tempv = tempv2d; #ifdef _CRAY SGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha, &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, tempu, &ldu, &beta, tempv, &ldt); #elif defined (USE_VENDOR_BLAS) dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, tempu, &ldu, &beta, tempv, &ldt); #else hypre_F90_NAME_BLAS(dgemm,DGEMM)("N", "N", &nbrow, &ncols, &ldu, &alpha, &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, tempu, &ldu, &beta, tempv, &ldt, 1, 1); #endif stat->ops[FACT] += 2 * nbrow * ldu * ncols; /* Now gather the result into the destination block. */ if ( ib < jb ) { /* A(i,j) is in U. */ ilst = FstBlockC( ib+1 ); lib = LBi( ib, grid ); index = Ufstnz_br_ptr[lib]; ijb = index[iuip[lib]]; while ( ijb < jb ) { /* Search for dest block. */ ruip[lib] += index[iuip[lib]+1]; iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb ); ijb = index[iuip[lib]]; } iuip[lib] += UB_DESCRIPTOR; /* Skip descriptor. */ tempv = tempv2d; for (jj = 0; jj < nsupc; ++jj) { segsize = klst - usub[iukp + jj]; fnz = index[iuip[lib]++]; if ( segsize ) { /* Nonzero segment in U(k.j). */ ucol = &Unzval_br_ptr[lib][ruip[lib]]; for (i = 0, it = 0; i < nbrow; ++i) { rel = lsub[lptr + i] - fnz; ucol[rel] -= tempv[it++]; } tempv += ldt; } ruip[lib] += ilst - fnz; } } else { /* A(i,j) is in L. 
*/ index = Lrowind_bc_ptr[ljb]; ldv = index[1]; /* LDA of the dest lusup. */ lptrj = BC_HEADER; luptrj = 0; ijb = index[lptrj]; while ( ijb != ib ) { /* Search for dest block -- blocks are not ordered! */ luptrj += index[lptrj+1]; lptrj += LB_DESCRIPTOR + index[lptrj+1]; ijb = index[lptrj]; } /* * Build indirect table. This is needed because the * indices are not sorted. */ fnz = FstBlockC( ib ); lptrj += LB_DESCRIPTOR; for (i = 0; i < index[lptrj-1]; ++i) { rel = index[lptrj + i] - fnz; indirect[rel] = i; } nzval = Lnzval_bc_ptr[ljb] + luptrj; tempv = tempv2d; for (jj = 0; jj < nsupc; ++jj) { segsize = klst - usub[iukp + jj]; if ( segsize ) { /*#pragma _CRI cache_bypass nzval,tempv*/ for (it = 0, i = 0; i < nbrow; ++i) { rel = lsub[lptr + i] - fnz; nzval[indirect[rel]] -= tempv[it++]; } tempv += ldt; } nzval += ldv; } } /* if ib < jb ... */ lptr += nbrow; luptr += nbrow; } /* for lb ... */ rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */ iukp += nsupc; } /* if jb == k+1 */ } /* if L(:,k) and U(k,:) not empty */ if ( k+1 < nsupers ) { kcol = PCOL( k+1, grid ); if ( mycol == kcol ) { #if ( VAMPIR>=1 ) VT_begin(5); #endif /* Factor diagonal and subdiagonal blocks and test for exact singularity. */ pdgstrf2(options, k+1, thresh, Glu_persist, grid, Llu, stat, info); #if ( VAMPIR>=1 ) VT_end(5); #endif /* Process column *kcol+1* multicasts numeric values of L(:,k+1) to process rows. */ lk = LBj( k+1, grid ); /* Local block number. */ lsub1 = Lrowind_bc_ptr[lk]; if ( lsub1 ) { msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0]*LB_DESCRIPTOR; msgcnt[1] = lsub1[1] * SuperSize( k+1 ); } else { msgcnt[0] = 0; msgcnt[1] = 0; } scp = &grid->rscp; /* The scope of process row. 
*/ for (pj = 0; pj < Pc; ++pj) { if ( ToSendR[lk][pj] != EMPTY ) { lusup1 = Lnzval_bc_ptr[lk]; #if ( PROFlevel>=1 ) TIC(t1); #endif #if ( VAMPIR>=1 ) VT_begin(1); #endif MPI_Isend( lsub1, msgcnt[0], mpi_int_t, pj, (4*(k+1))%NTAGS, scp->comm, &send_req[pj] ); MPI_Isend( lusup1, msgcnt[1], MPI_DOUBLE, pj, (4*(k+1)+1)%NTAGS, scp->comm, &send_req[pj+Pc] ); #if ( VAMPIR>=1 ) VT_end(1); #endif #if ( PROFlevel>=1 ) TOC(t2, t1); stat->utime[COMM] += t2; msg_cnt += 2; msg_vol += msgcnt[0]*iword + msgcnt[1]*dword; #endif #if ( DEBUGlevel>=2 ) printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n", iam, k+1, msgcnt[0], msgcnt[1], pj); #endif } } /* for pj ... */ } else { /* Post Recv of block column L(:,k+1). */ if ( ToRecv[k+1] >= 1 ) { scp = &grid->rscp; /* The scope of process row. */ MPI_Irecv(Lsub_buf_2[(k+1)%2], Llu->bufmax[0], mpi_int_t, kcol, (4*(k+1))%NTAGS, scp->comm, &recv_req[0]); MPI_Irecv(Lval_buf_2[(k+1)%2], Llu->bufmax[1], MPI_DOUBLE, kcol, (4*(k+1)+1)%NTAGS, scp->comm, &recv_req[1]); #if ( DEBUGlevel>=2 ) printf("(%d) Post Irecv L(:,%4d)\n", iam, k+1); #endif } } /* if mycol == Pc(k+1) */ } /* if k+1 < nsupers */ if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */ /* --------------------------------------------------- Update all other blocks using block row U(k,:) --------------------------------------------------- */ for (j = 0; j < nub; ++j) { lptr = lptr0; luptr = luptr0; jb = usub[iukp]; /* Global block number of block U(k,j). */ ljb = LBj( jb, grid ); /* Local block number of U(k,j). */ nsupc = SuperSize( jb ); iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */ /* Prepare to call DGEMM. 
*/ jj = iukp; while ( usub[jj] == klst ) ++jj; ldu = klst - usub[jj++]; ncols = 1; full = 1; for (; jj < iukp+nsupc; ++jj) { segsize = klst - usub[jj]; if ( segsize ) { ++ncols; if ( segsize != ldu ) full = 0; if ( segsize > ldu ) ldu = segsize; } } #if ( DEBUGlevel>=3 ) printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n", iam, full, k, jb, ldu, ncols, nsupc); ++num_update; #endif if ( full ) { tempu = &uval[rukp]; } else { /* Copy block U(k,j) into tempU2d. */ #if ( DEBUGlevel>=3 ) ++num_copy; #endif tempu = tempU2d; for (jj = iukp; jj < iukp+nsupc; ++jj) { segsize = klst - usub[jj]; if ( segsize ) { lead_zero = ldu - segsize; for (i = 0; i < lead_zero; ++i) tempu[i] = 0.0; tempu += lead_zero; for (i = 0; i < segsize; ++i) tempu[i] = uval[rukp+i]; rukp += segsize; tempu += segsize; } } tempu = tempU2d; rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */ } /* if full ... */ for (lb = 0; lb < nlb; ++lb) { ib = lsub[lptr]; /* Row block L(i,k). */ nbrow = lsub[lptr+1]; /* Number of full rows. */ lptr += LB_DESCRIPTOR; /* Skip descriptor. */ tempv = tempv2d; #ifdef _CRAY SGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha, &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, tempu, &ldu, &beta, tempv, &ldt); #elif defined (USE_VENDOR_BLAS) dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, tempu, &ldu, &beta, tempv, &ldt); #else hypre_F90_NAME_BLAS(dgemm,DGEMM)("N", "N", &nbrow, &ncols, &ldu, &alpha, &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, tempu, &ldu, &beta, tempv, &ldt, 1, 1); #endif stat->ops[FACT] += 2 * nbrow * ldu * ncols; /* Now gather the result into the destination block. */ if ( ib < jb ) { /* A(i,j) is in U. */ ilst = FstBlockC( ib+1 ); lib = LBi( ib, grid ); index = Ufstnz_br_ptr[lib]; ijb = index[iuip[lib]]; while ( ijb < jb ) { /* Search for dest block. */ ruip[lib] += index[iuip[lib]+1]; iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb ); ijb = index[iuip[lib]]; } /* Skip descriptor. 
Now point to fstnz index of block U(i,j). */ iuip[lib] += UB_DESCRIPTOR; tempv = tempv2d; for (jj = 0; jj < nsupc; ++jj) { segsize = klst - usub[iukp + jj]; fnz = index[iuip[lib]++]; if ( segsize ) { /* Nonzero segment in U(k.j). */ ucol = &Unzval_br_ptr[lib][ruip[lib]]; for (i = 0 ; i < nbrow; ++i) { rel = lsub[lptr + i] - fnz; ucol[rel] -= tempv[i]; } tempv += ldt; } ruip[lib] += ilst - fnz; } } else { /* A(i,j) is in L. */ index = Lrowind_bc_ptr[ljb]; ldv = index[1]; /* LDA of the dest lusup. */ lptrj = BC_HEADER; luptrj = 0; ijb = index[lptrj]; while ( ijb != ib ) { /* Search for dest block -- blocks are not ordered! */ luptrj += index[lptrj+1]; lptrj += LB_DESCRIPTOR + index[lptrj+1]; ijb = index[lptrj]; } /* * Build indirect table. This is needed because the * indices are not sorted for the L blocks. */ fnz = FstBlockC( ib ); lptrj += LB_DESCRIPTOR; for (i = 0; i < index[lptrj-1]; ++i) { rel = index[lptrj + i] - fnz; indirect[rel] = i; } nzval = Lnzval_bc_ptr[ljb] + luptrj; tempv = tempv2d; for (jj = 0; jj < nsupc; ++jj) { segsize = klst - usub[iukp + jj]; if ( segsize ) { /*#pragma _CRI cache_bypass nzval,tempv*/ for (i = 0; i < nbrow; ++i) { rel = lsub[lptr + i] - fnz; nzval[indirect[rel]] -= tempv[i]; } tempv += ldt; } nzval += ldv; } } /* if ib < jb ... */ lptr += nbrow; luptr += nbrow; } /* for lb ... */ rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */ iukp += nsupc; } /* for j ... */ } /* if k L(:,k) and U(k,:) are not empty */ } /* ------------------------------------------ END MAIN LOOP: for k = ... 
------------------------------------------ */ #if ( VAMPIR>=1 ) VT_end(100); VT_traceoff(); #endif if ( Pr*Pc > 1 ) { SUPERLU_FREE(Lsub_buf_2[0]); /* also free Lsub_buf_2[1] */ SUPERLU_FREE(Lval_buf_2[0]); /* also free Lval_buf_2[1] */ if ( Llu->bufmax[2] != 0 ) SUPERLU_FREE(Usub_buf); if ( Llu->bufmax[3] != 0 ) SUPERLU_FREE(Uval_buf); SUPERLU_FREE(send_req); } SUPERLU_FREE(Llu->ujrow); SUPERLU_FREE(tempv2d); SUPERLU_FREE(indirect); SUPERLU_FREE(iuip); SUPERLU_FREE(ruip); /* Prepare error message. */ if ( *info == 0 ) *info = n + 1; #if ( PROFlevel>=1 ) TIC(t1); #endif MPI_Allreduce( info, &iinfo, 1, mpi_int_t, MPI_MIN, grid->comm ); #if ( PROFlevel>=1 ) TOC(t2, t1); stat->utime[COMM] += t2; { float msg_vol_max, msg_vol_sum, msg_cnt_max, msg_cnt_sum; MPI_Reduce( &msg_cnt, &msg_cnt_sum, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); MPI_Reduce( &msg_cnt, &msg_cnt_max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); MPI_Reduce( &msg_vol, &msg_vol_sum, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); MPI_Reduce( &msg_vol, &msg_vol_max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); if ( !iam ) { printf("\tPDGSTRF comm stat:" "\tAvg\tMax\t\tAvg\tMax\n" "\t\t\tCount:\t%.0f\t%.0f\tVol(MB)\t%.2f\t%.2f\n", msg_cnt_sum/Pr/Pc, msg_cnt_max, msg_vol_sum/Pr/Pc*1e-6, msg_vol_max*1e-6); } } #endif if ( iinfo == n + 1 ) *info = 0; else *info = iinfo; #if ( PRNTlevel==3 ) MPI_Allreduce( &zero_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm ); if ( !iam ) printf(".. # msg of zero size\t%d\n", iinfo); MPI_Allreduce( &total_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm ); if ( !iam ) printf(".. 
# total msg\t%d\n", iinfo); #endif #if ( DEBUGlevel>=2 ) for (i = 0; i < Pr * Pc; ++i) { if ( iam == i ) { dPrintLblocks(iam, nsupers, grid, Glu_persist, Llu); dPrintUblocks(iam, nsupers, grid, Glu_persist, Llu); printf("(%d)\n", iam); PrintInt10("Recv", nsupers, Llu->ToRecv); } MPI_Barrier( grid->comm ); } #endif #if ( DEBUGlevel>=3 ) printf("(%d) num_copy=%d, num_update=%d\n", iam, num_copy, num_update); #endif #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit pdgstrf()"); #endif } /* PDGSTRF */
int main(int argc, char* argv[]) { int nprocs, myid; int nrl,ncl; int myrowid,mycolid; // Initialize MPI. MPI_Init(&argc,&argv); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); MPI_Comm_rank(MPI_COMM_WORLD, &myid); printf("This is process %d out of %d processes.\n",myid,nprocs); // Exit if the number of arguments is not 5. if (argc != 7) { printf("Usage: laplace nrows ncols npcols niter iprint relerr\n"); exit(1); } // Process the command line arguments. int nr=atoi(argv[1]); int nc=atoi(argv[2]); int nprocr,nprocc=atoi(argv[3]); int niter=atoi(argv[4]); int iprint=atoi(argv[5]); double relerr=atof(argv[6]); int i,j,iter; // Get some timing information. double time1=MPI_Wtime(); // Cap the number of iterations. if (niter >= MAXITER) { printf("Warnig: Changed the number of iterations to %d.",MAXITER); niter=MAXITER; } // Check the number of columns in the process grid. if (nprocs%nprocc != 0) { if (myid == 0) printf("Error: the number of processes are not multiples of npcols!\n"); MPI_Finalize(); exit(2); } nprocr=nprocs/nprocc; myrowid=myid/nprocc; mycolid=myid-nprocc*myrowid; // Check the rows and columns. if (nr%nprocr != 0) { if (myid == 0) printf("Error: the number of rows is not multiple of the number of rows in the process grid!\n"); MPI_Finalize(); exit(3); } if (nc%nprocc != 0) { if (myid == 0) printf("Error: the number of columns is not multiple of the number of columns in the process grid!\n"); MPI_Finalize(); exit(4); } nrl=nr/nprocr; ncl=nc/nprocc; double **t; t=malloc((nrl+2)*sizeof(double *)); for (i=0;i<nrl+2;i++) t[i]=malloc((ncl+2)*sizeof(double *)); double **told; told=malloc((nrl+2)*sizeof(double *)); for (i=0;i<nrl+2;i++) told[i]=malloc((ncl+2)*sizeof(double *)); // Initialize the array. for (i=0; i<nrl+2; i++) for (j=0; j<ncl+2; j++) told[i][j]=0.; // Set the boundary condition. 
// Right boundary double tmin=myrowid*100.0/nprocr; double tmax=(myrowid+1)*100.0/nprocr; if (mycolid == nprocc-1) { for (i=0;i<nrl+2;i++) { t[i][ncl+1]=tmin+((tmax-tmin)/nrl)*i; told[i][ncl+1]=t[i][ncl+1]; } } // Bottom boundary tmin=mycolid*100.0/nprocc; tmax=(mycolid+1)*100.0/nprocc; if (myrowid == nprocr-1) { for (j=0;j<ncl+2;j++) { t[nrl+1][j]=tmin+((tmax-tmin)/ncl)*j; told[nrl+1][j]=t[nrl+1][j]; } } // Main loop. int tagu=100,tagd=101,tagr=102,tagl=103; MPI_Request reqid_rl,reqid_rr,reqid_ru,reqid_rd; MPI_Request reqid_sl,reqid_sr,reqid_su,reqid_sd; MPI_Status status; double dt,dtg; // Define buffers to pass the column boundary data. double *srbuffer=malloc((nrl+2)*sizeof(double *)); double *slbuffer=malloc((nrl+2)*sizeof(double *)); double *rrbuffer=malloc((nrl+2)*sizeof(double *)); double *rlbuffer=malloc((nrl+2)*sizeof(double *)); // Main loop starts here. for (iter=1;iter<=niter;iter++) { for (i=1;i<=nrl;i++) for (j=1;j<=ncl;j++) t[i][j]=0.25*(told[i+1][j]+told[i-1][j]+told[i][j-1]+told[i][j+1]); // Update the boundary data. We'll send above and receive from // below first, then send below and receive from above. In both // cases, we'll post the receives first, and then do the sends. // Fill the send buffer. 
if (mycolid != 0) for (i=1;i<=nrl;i++) slbuffer[i]=t[i][1]; if (mycolid != nprocc-1) for (i=1;i<=nrl;i++) srbuffer[i]=t[i][ncl]; // Send U - Recv D: if (myrowid != nprocr-1) MPI_Irecv(t[nrl+1],ncl+2,MPI_DOUBLE,myid+nprocc,tagu,MPI_COMM_WORLD,&reqid_ru); if (myrowid != 0) MPI_Isend(t[1],ncl+2,MPI_DOUBLE,myid-nprocc,tagu,MPI_COMM_WORLD,&reqid_su); // Send D - Recv U: if (myrowid != 0) MPI_Irecv(t[0],ncl+2,MPI_DOUBLE,myid-nprocc,tagd,MPI_COMM_WORLD,&reqid_rd); if (myrowid != nprocr-1) MPI_Isend(t[nrl],ncl+2,MPI_DOUBLE,myid+nprocc,tagd,MPI_COMM_WORLD,&reqid_sd); // Send R - Recv L: if (mycolid != 0) MPI_Irecv(rlbuffer,nrl+2,MPI_DOUBLE,myid-1,tagr,MPI_COMM_WORLD,&reqid_rr); if (mycolid != nprocc-1) MPI_Isend(srbuffer,nrl+2,MPI_DOUBLE,myid+1,tagr,MPI_COMM_WORLD,&reqid_sr); // Send L - Recv R: if (mycolid != nprocc-1) MPI_Irecv(rrbuffer,nrl+2,MPI_DOUBLE,myid+1,tagl,MPI_COMM_WORLD,&reqid_rl); if (mycolid != 0) MPI_Isend(slbuffer,nrl+2,MPI_DOUBLE,myid-1,tagl,MPI_COMM_WORLD,&reqid_sl); // Wait for the left and right sends and receives. if (mycolid != 0) { MPI_Wait(&reqid_sl,&status); MPI_Wait(&reqid_rr,&status); } if (mycolid != nprocc-1) { MPI_Wait(&reqid_sr,&status); MPI_Wait(&reqid_rl,&status); } // Unpack the left and right column data. if (mycolid != 0) for (i=1;i<=nrl;i++) t[i][0]=rlbuffer[i]; if (mycolid != nprocc-1) for (i=1;i<=nrl;i++) t[i][ncl+1]=rrbuffer[i]; // Wait for the above and below sends and receives. if (myrowid != 0) { MPI_Wait(&reqid_su,&status); MPI_Wait(&reqid_rd,&status); } if (myrowid != nprocr-1) { MPI_Wait(&reqid_sd,&status); MPI_Wait(&reqid_ru,&status); } // Check for sL rR /* if (myid == 3) { for (i=0;i<nrl+2;i++) printf("%f ",t[i][1]); printf("\n"); } MPI_Barrier(MPI_COMM_WORLD); if (myid == 2) { for (i=0;i<nrl+2;i++) printf("%f ",t[i][ncl+1]); printf("\n"); } */ // Check on convergence dt=0; for (i=1;i<=nrl;i++) { for (j=1;j<=ncl;j++) { dt=fmax(fabs(t[i][j]-told[i][j]),dt); told[i][j]=t[i][j]; } } // Find the global max convergence error. 
MPI_Allreduce(&dt,&dtg,1,MPI_DOUBLE,MPI_MAX,MPI_COMM_WORLD); // Check if output is required. if (myid == 0) if (iprint != 0) if (iter%iprint == 0) printf("Iteration: %d; Convergence Error: %f\n",iter,dtg); // Check if convergence criteria meet. if (dtg < relerr) { printf("\nSolution has converged.\n"); break; } } free(t); free(told); // Print out the execution time. if (myid == 0) { double time2=MPI_Wtime(); printf("\nTotal Time (sec): %f.\n",time2-time1); } MPI_Finalize(); return 0; }
/*
 * Fortran-callable wrapper for MPI_Wait (trailing-underscore binding).
 *
 * request: Fortran-side integer request handle; translated to a C
 *          MPI_Request by find_request() (defined elsewhere in this file).
 * status:  filled in by MPI_Wait with the completed operation's status.
 * ierr:    receives the MPI_Wait return code, Fortran "ierror" style.
 *
 * NOTE(review): MPI_Wait sets the local copy `req` to MPI_REQUEST_NULL,
 * but the table entry behind *request is not updated or released here —
 * presumably find_request or another wrapper retires completed handles;
 * verify, otherwise the handle table may accumulate completed requests.
 */
void mpi_wait_(int* request, MPI_Status* status, int* ierr)
{
    MPI_Request req = find_request(*request);
    *ierr = MPI_Wait(&req, status);
}
void wet::exchangeDensities(void) { //if(t%infoStep==0) // cout << "Process " << rank << ": exchanging densities...." << endl; MPI_Status statusfn1, statusfn2, statusfna, statusfnb, statusfnc, statusfnd, statusfni, statusfnj, statusfnk, statusfnl, statusgn1, statusgn2, statusgna, statusgnb, statusgnc, statusgnd, statusgni, statusgnj, statusgnk, statusgnl; MPI_Request requestOutfn1, requestOutfn2, requestInfn1, requestInfn2, requestOutfna, requestOutfnb, requestInfna, requestInfnb, requestOutfnc, requestOutfnd, requestInfnc, requestInfnd, requestOutfni, requestOutfnj, requestInfni, requestInfnj, requestOutfnk, requestOutfnl, requestInfnk, requestInfnl,requestOutgn1, requestOutgn2, requestIngn1, requestIngn2, requestOutgna, requestOutgnb, requestIngna, requestIngnb, requestOutgnc, requestOutgnd, requestIngnc, requestIngnd, requestOutgni, requestOutgnj, requestIngni, requestIngnj, requestOutgnk, requestOutgnl, requestIngnk, requestIngnl; //SENDING FF DENSITIES //Sending right (for sender) fn1 (only right part, because fn1 moves in x direction) MPI_Isend(&(fn1[k2-k1]),k1, MPI_DOUBLE, rightProcess, rank*100 , MPI_COMM_WORLD, &requestOutfn1); //Sending left (for sender) fn2 (only left part, because fn2 moves in -x direction) MPI_Isend(&(fn2[k1]),k1, MPI_DOUBLE, leftProcess, rank*100+1 , MPI_COMM_WORLD, &requestOutfn2); //Sending right (for sender) fna (only right part, because fna moves in x direction) MPI_Isend(&(fna[k2-k1]),k1, MPI_DOUBLE, rightProcess, rank*100+2 , MPI_COMM_WORLD, &requestOutfna); //Sending left (for sender) fnb (only left part, because fnb moves in -x direction) MPI_Isend(&(fnb[k1]),k1, MPI_DOUBLE, leftProcess, rank*100+3 , MPI_COMM_WORLD, &requestOutfnb); //Sending right (for sender) fnc (only right part, because fnc moves in x direction) MPI_Isend(&(fnc[k2-k1]),k1, MPI_DOUBLE, rightProcess, rank*100+4 , MPI_COMM_WORLD, &requestOutfnc); //Sending left (for sender) fnd (only left part, because fnd moves in -x direction) MPI_Isend(&(fnd[k1]),k1, 
MPI_DOUBLE, leftProcess, rank*100+5 , MPI_COMM_WORLD, &requestOutfnd); //Sending right (for sender) fni (only right part, because fni moves in x direction) MPI_Isend(&(fni[k2-k1]),k1, MPI_DOUBLE, rightProcess, rank*100+6 , MPI_COMM_WORLD, &requestOutfni); //Sending left (for sender) fnj (only left part, because fnj moves in -x direction) MPI_Isend(&(fnj[k1]),k1, MPI_DOUBLE, leftProcess, rank*100+7 , MPI_COMM_WORLD, &requestOutfnj); //Sending right (for sender) fnk (only right part, because fnk moves in x direction) MPI_Isend(&(fnk[k2-k1]),k1, MPI_DOUBLE, rightProcess, rank*100+8 , MPI_COMM_WORLD, &requestOutfnk); //Sending left (for sender) fnl (only left part, because fnl moves in -x direction) MPI_Isend(&(fnl[k1]),k1, MPI_DOUBLE, leftProcess, rank*100+9 , MPI_COMM_WORLD, &requestOutfnl); //RECIEVING FF DENSITIES //Recieving left (for reciever) fn1 (only left part, because fn1 moves in x direction) MPI_Irecv(fn1, k1, MPI_DOUBLE, leftProcess, leftProcess*100, MPI_COMM_WORLD, &requestInfn1); //Recieving right (for reciever) fn2 (only right part, because fn2 moves in -x direction) MPI_Irecv(&(fn2[k2]), k1, MPI_DOUBLE, rightProcess, rightProcess*100+1, MPI_COMM_WORLD, &requestInfn2); //Recieving left (for reciever) fna (only left part, because fna moves in x direction) MPI_Irecv(fna, k1, MPI_DOUBLE, leftProcess, leftProcess*100+2, MPI_COMM_WORLD, &requestInfna); //Recieving right (for reciever) fnb (only right part, because fnb moves in -x direction) MPI_Irecv(&(fnb[k2]), k1, MPI_DOUBLE, rightProcess, rightProcess*100+3, MPI_COMM_WORLD, &requestInfnb); //Recieving left (for reciever) fn1 (only left part, because fnc moves in x direction) MPI_Irecv(fnc, k1, MPI_DOUBLE, leftProcess, leftProcess*100+4, MPI_COMM_WORLD, &requestInfnc); //Recieving right (for reciever) fn2 (only right part, because fnd moves in -x direction) MPI_Irecv(&(fnd[k2]), k1, MPI_DOUBLE, rightProcess, rightProcess*100+5, MPI_COMM_WORLD, &requestInfnd); //Recieving left (for reciever) fna (only left 
part, because fni moves in x direction) MPI_Irecv(fni, k1, MPI_DOUBLE, leftProcess, leftProcess*100+6, MPI_COMM_WORLD, &requestInfni); //Recieving right (for reciever) fnb (only right part, because fnj moves in -x direction) MPI_Irecv(&(fnj[k2]), k1, MPI_DOUBLE, rightProcess, rightProcess*100+7, MPI_COMM_WORLD, &requestInfnj); //Recieving left (for reciever) fnk (only left part, because fnk moves in x direction) MPI_Irecv(fnk, k1, MPI_DOUBLE, leftProcess, leftProcess*100+8, MPI_COMM_WORLD, &requestInfnk); //Recieving right (for reciever) fnl (only right part, because fnl moves in -x direction) MPI_Irecv(&(fnl[k2]), k1, MPI_DOUBLE, rightProcess, rightProcess*100+9, MPI_COMM_WORLD, &requestInfnl); //SENDING GG DENSITIES //Sending right (for sender) gn1 (only right part, because gn1 moves in x direction) MPI_Isend(&(gn1[k2-k1]),k1, MPI_DOUBLE, rightProcess, rank*100+10 , MPI_COMM_WORLD, &requestOutgn1); //Sending left (for sender) gn2 (only left part, because gn2 moves in -x direction) MPI_Isend(&(gn2[k1]),k1, MPI_DOUBLE, leftProcess, rank*100+11 , MPI_COMM_WORLD, &requestOutgn2); //Sending right (for sender) gna (only right part, because gna moves in x direction) MPI_Isend(&(gna[k2-k1]),k1, MPI_DOUBLE, rightProcess, rank*100+12 , MPI_COMM_WORLD, &requestOutgna); //Sending left (for sender) gnb (only left part, because gnb moves in -x direction) MPI_Isend(&(gnb[k1]),k1, MPI_DOUBLE, leftProcess, rank*100+13 , MPI_COMM_WORLD, &requestOutgnb); //Sending right (for sender) gnc (only right part, because gnc moves in x direction) MPI_Isend(&(gnc[k2-k1]),k1, MPI_DOUBLE, rightProcess, rank*100+14 , MPI_COMM_WORLD, &requestOutgnc); //Sending left (for sender) gnd (only left part, because gnd moves in -x direction) MPI_Isend(&(gnd[k1]),k1, MPI_DOUBLE, leftProcess, rank*100+15, MPI_COMM_WORLD, &requestOutgnd); //Sending right (for sender) gni (only right part, because gni moves in x direction) MPI_Isend(&(gni[k2-k1]),k1, MPI_DOUBLE, rightProcess, rank*100+16 , MPI_COMM_WORLD, 
&requestOutgni); //Sending left (for sender) gnj (only left part, because gnj moves in -x direction) MPI_Isend(&(gnj[k1]),k1, MPI_DOUBLE, leftProcess, rank*100+17 , MPI_COMM_WORLD, &requestOutgnj); //Sending right (for sender) gnk (only right part, because gnk moves in x direction) MPI_Isend(&(gnk[k2-k1]),k1, MPI_DOUBLE, rightProcess, rank*100+18 , MPI_COMM_WORLD, &requestOutgnk); //Sending left (for sender) gnl (only left part, because gnl moves in -x direction) MPI_Isend(&(gnl[k1]),k1, MPI_DOUBLE, leftProcess, rank*100+19 , MPI_COMM_WORLD, &requestOutgnl); //RECIEVING GG DENSITIES //Recieving left (for reciever) gn1 (only left part, because gn1 moves in x direction) MPI_Irecv(gn1, k1, MPI_DOUBLE, leftProcess, leftProcess*100+10, MPI_COMM_WORLD, &requestIngn1); //Recieving right (for reciever) gn2 (only right part, because gn2 moves in -x direction) MPI_Irecv(&(gn2[k2]), k1, MPI_DOUBLE, rightProcess, rightProcess*100+11, MPI_COMM_WORLD, &requestIngn2); //Recieving left (for reciever) gna (only left part, because gna moves in x direction) MPI_Irecv(gna, k1, MPI_DOUBLE, leftProcess, leftProcess*100+12, MPI_COMM_WORLD, &requestIngna); //Recieving right (for reciever) gnb (only right part, because gnb moves in -x direction) MPI_Irecv(&(gnb[k2]), k1, MPI_DOUBLE, rightProcess, rightProcess*100+13, MPI_COMM_WORLD, &requestIngnb); //Recieving left (for reciever) gn1 (only left part, because gnc moves in x direction) MPI_Irecv(gnc, k1, MPI_DOUBLE, leftProcess, leftProcess*100+14, MPI_COMM_WORLD, &requestIngnc); //Recieving right (for reciever) gn2 (only right part, because gnd moves in -x direction) MPI_Irecv(&(gnd[k2]), k1, MPI_DOUBLE, rightProcess, rightProcess*100+15, MPI_COMM_WORLD, &requestIngnd); //Recieving left (for reciever) gna (only left part, because gni moves in x direction) MPI_Irecv(gni, k1, MPI_DOUBLE, leftProcess, leftProcess*100+16, MPI_COMM_WORLD, &requestIngni); //Recieving right (for reciever) gnb (only right part, because gnj moves in -x direction) 
MPI_Irecv(&(gnj[k2]), k1, MPI_DOUBLE, rightProcess, rightProcess*100+17, MPI_COMM_WORLD, &requestIngnj); //Recieving left (for reciever) gnk (only left part, because gnk moves in x direction) MPI_Irecv(gnk, k1, MPI_DOUBLE, leftProcess, leftProcess*100+18, MPI_COMM_WORLD, &requestIngnk); //Recieving right (for reciever) gnl (only right part, because gnl moves in -x direction) MPI_Irecv(&(gnl[k2]), k1, MPI_DOUBLE, rightProcess, rightProcess*100+19, MPI_COMM_WORLD, &requestIngnl); //Waiting until fn(i) densities are saved in the recieving bufner MPI_Wait(&requestInfn1, &statusfn1); MPI_Wait(&requestInfn2, &statusfn2); MPI_Wait(&requestInfna, &statusfna); MPI_Wait(&requestInfnb, &statusfnb); MPI_Wait(&requestInfnc, &statusfnc); MPI_Wait(&requestInfnd, &statusfnd); MPI_Wait(&requestInfni, &statusfni); MPI_Wait(&requestInfnj, &statusfnj); MPI_Wait(&requestInfnk, &statusfnk); MPI_Wait(&requestInfnl, &statusfnl); //Waiting until gn(i) densities are saved in the recieving bufner MPI_Wait(&requestIngn1, &statusgn1); MPI_Wait(&requestIngn2, &statusgn2); MPI_Wait(&requestIngna, &statusgna); MPI_Wait(&requestIngnb, &statusgnb); MPI_Wait(&requestIngnc, &statusgnc); MPI_Wait(&requestIngnd, &statusgnd); MPI_Wait(&requestIngni, &statusgni); MPI_Wait(&requestIngnj, &statusgnj); MPI_Wait(&requestIngnk, &statusgnk); MPI_Wait(&requestIngnl, &statusgnl); //Waiting until fn(i) sending bufner is relased MPI_Wait(&requestOutfn1, &statusfn1); MPI_Wait(&requestOutfn2, &statusfn2); MPI_Wait(&requestOutfna, &statusfna); MPI_Wait(&requestOutfnb, &statusfnb); MPI_Wait(&requestOutfnc, &statusfnc); MPI_Wait(&requestOutfnd, &statusfnd); MPI_Wait(&requestOutfni, &statusfni); MPI_Wait(&requestOutfnj, &statusfnj); MPI_Wait(&requestOutfnk, &statusfnk); MPI_Wait(&requestOutfnl, &statusfnl); //Waiting until gn(i) sending bufner is relased MPI_Wait(&requestOutgn1, &statusgn1); MPI_Wait(&requestOutgn2, &statusgn2); MPI_Wait(&requestOutgna, &statusgna); MPI_Wait(&requestOutgnb, &statusgnb); 
MPI_Wait(&requestOutgnc, &statusgnc); MPI_Wait(&requestOutgnd, &statusgnd); MPI_Wait(&requestOutgni, &statusgni); MPI_Wait(&requestOutgnj, &statusgnj); MPI_Wait(&requestOutgnk, &statusgnk); MPI_Wait(&requestOutgnl, &statusgnl); MPI_Barrier(MPI_COMM_WORLD); //if(t%infoStep==0) // cout << "Process " << rank << ": dendities exchanged." << endl; }
int main(int argc, char **argv) { //double tend = 1E2, speed = 1.; double tend = 1E-1, speed = 1.; char *init_type = "mixed2"; double *roots, *weights, *ll, *dl, xmin, xmax, lxmin, lxmax, deltax, jac, xr, xl, cfl, dt, rtime, min_dx; int ii, jj, kk, ee, idx, eres; long nstep; double *dx, *mesh; double *smat, *xx, *qq, *qtemp, *k1, *k2, *k3, *k4, *minv_vec, *mmat, *dv, *mf, *ib, *df, *fstar; MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); MPI_Comm_rank(MPI_COMM_WORLD, &rank); para_range(0, tne, nprocs, rank, &ista, &iend); ne = iend - ista; // initialize // fortran index structure array[ii,jj,ee] where size(array) = (np, np, ne) // c 1d index structure array = [ee*np*np + jj*np + ii] roots = (double *)malloc(np * sizeof(double)); weights = (double *)malloc(np * sizeof(double)); ll = (double *)malloc(np * sizeof(double)); dl = (double *)malloc(np * sizeof(double)); dx = (double *)malloc(ne * sizeof(double)); mesh = (double *)malloc((ne + 1) * sizeof(double)); smat = (double *)malloc(np * np * sizeof(double)); // [jj np, ii np] xx = (double *)malloc(ne * np * sizeof(double)); // [ee ne, ii np] qq = (double *)malloc(ne * np * sizeof(double)); // [ee ne, ii np] qtemp = (double *)malloc(ne * np * sizeof(double)); // [ee ne, ii np] k1 = (double *)malloc(ne * np * sizeof(double)); // [ee ne, ii np] k2 = (double *)malloc(ne * np * sizeof(double)); // [ee ne, ii np] k3 = (double *)malloc(ne * np * sizeof(double)); // [ee ne, ii np] k4 = (double *)malloc(ne * np * sizeof(double)); // [ee ne, ii np] minv_vec = (double *)malloc(ne * np * sizeof(double)); // [ee ne, ii np] mmat = (double *)malloc(ne * np * np * sizeof(double)); // [ee ne, jj np, ii np] dv = (double *)malloc(ne * np * np * sizeof(double)); // [ee ne, jj np, ii np] mf = (double *)malloc(2 * np * sizeof(double)); // [jj 2, ii np] ib = (double *)malloc(2 * np * sizeof(double)); // [jj 2, ii np] fstar = (double *)malloc(2 * ne * sizeof(double)); // [jj 2, ii ne] df = (double *)malloc(ne * 2 * np * 
sizeof(double)); // [ee ne, jj 2, ii np] for (ii = 0; ii < np; ++ii) { roots[ii] = 0; weights[ii] = 0; ll[ii] = 0; dl[ii] = 0; } for (ii = 0; ii < ne; ++ii) { dx[ii] = 0; mesh[ii] = 0; } mesh[ne] = 0; for (ii = 0; ii < np * np; ++ii) { smat[ii] = 0; } for (ii = 0; ii < ne * np; ++ii) { xx[ii] = 0; qq[ii] = 0; k1[ii] = 0; k2[ii] = 0; k3[ii] = 0; k4[ii] = 0; minv_vec[ii] = 0; } for (ii = 0; ii < ne * np * np; ++ii) { mmat[ii] = 0; dv[ii] = 0; } for (ii = 0; ii < np * 2; ++ii) { mf[ii] = 0; ib[ii] = 0; } for (ii = 0; ii < ne * 2; ++ii) { fstar[ii] = 0; } for (ii = 0; ii < ne * 2 * np; ++ii) { df[ii] = 0; } // mesh setup xmin = 0.; xmax = 10.; deltax = (xmax-xmin)/(double)tne; /** * lxim, lxmax를 이용하여 각 구간의 mesh[ee]를 구한다 * ne의 크기가 tne / process의 개수이기 때문에, * 각 구간에 맞는 mesh[ee]를 구해야 한다. * 그리고 mesh[ee]를 이용하여 각 변수들을 초기화 한다. */ lxmin = xmin + (ista)*deltax; lxmax = xmin + (iend)*deltax; /** * mesh[ne]은 마지막 원소가 아니라는점에 유의한다. */ mesh[ne] = lxmax; for(ee=0;ee<ne;++ee){ mesh[ee] = lxmin+ee*deltax; } // gauss lobatto quadrature point, weight setup gausslobatto_quadrature(np, roots, weights); // coordinates and element size min_dx = xmax - xmin; // initial guess for (ee = 0; ee < ne; ee++) { xl = mesh[ee]; xr = mesh[ee + 1]; dx[ee] = xr - xl; // size of each element if (dx[ee] < min_dx) { min_dx = dx[ee]; // finding minimum dx } for (ii = 0; ii < np; ii++) { idx = ee * np + ii; xx[idx] = xl + 0.5 * (1 + roots[ii]) * dx[ee]; } } // mass matrix for (ii = 0; ii < ne * np * np; ii++) { mmat[ii] = 0; } for (ee = 0; ee < ne; ee++) { jac = fabs(dx[ee]) / 2; for (kk = 0; kk < np; kk++) { lagrange(roots[kk], ll, roots); for (jj = 0; jj < np; jj++) { for (ii = 0; ii < np; ii++) { idx = ee * np * np + jj * np + ii; // mass matrix mmat[ne][np][np] in 1d index representation mmat[idx] += jac * weights[kk] * ll[ii] * ll[jj]; } } } } // stiffness matrix for (ii = 0; ii < np * np; ii++) { smat[ii] = 0; } for (kk = 0; kk < np; kk++) { lagrange(roots[kk], ll, roots); lagrange_deriv(roots[kk], dl, 
roots); for (jj = 0; jj < np; jj++) { for (ii = 0; ii < np; ii++) { idx = jj * np + ii; // stiffness matrix smat[np][np] in 1d index representation smat[idx] += weights[kk] * ll[jj] * dl[ii]; } } } // face integration for (ii = 0; ii < np * 2; ii++) { mf[ii] = 0; } lagrange(-1, mf, roots); // mf[ii] for(ii=0, ii<np,ii++) represents element left face integration lagrange(1, mf + np, roots); // mf[ii] for ii=np, ii<2*np, ii++) reresents element right face integration // boundary interpolation for (ii = 0; ii < np * 2; ii++) { ib[ii] = 0; } lagrange(-1, ib, roots); // element left edge interpolation lagrange(1, ib + np, roots); // element right edge interpolation // divergence operators for (ii = 0; ii < ne * np * np; ii++) { dv[ii] = 0; } for (ii = 0; ii < ne * np * 2; ii++) { dv[ii] = 0; } for (ee = 0; ee < ne; ee++) { for (jj = 0; jj < np; jj++) { // it turn out that mmat is diagonal. i.e., ii != jj, mmat[ee][jj][ii] = 0 // the inverse of mmat is just the inverse of the diagonal components // here, we are extracting the inverse diagonal components only minv_vec[ee * np + jj] = 1. / mmat[ee * np * np + jj * np + jj]; } for (jj = 0; jj < np; jj++) { for (ii = 0; ii < np; ii++) { dv[ee * np * np + jj * np + ii] = minv_vec[ee * np + ii] * smat[jj * np + ii]; } } for (jj = 0; jj < 2; jj++) { for (ii = 0; ii < np; ii++) { df[ee * np * 2 + jj * np + ii] = minv_vec[ee * np + ii] * mf[jj * np + ii]; } } } // initialize qq field initialize(qq, xx, xmax, xmin, init_type); cfl = 1. 
/ (np * np); dt = cfl * min_dx / fabs(speed); rtime = 0.; nstep = 0; printf("Start Time Integration\n"); // Runge-Kutta 4th order Time integration loop t_sta = clock(); while (rtime < tend) { dt = fmin(dt, tend - rtime); rhs(qq, k1, dv, df, ib, speed); for (ii = 0; ii < ne * np; ii++) qtemp[ii] = qq[ii] + 0.5 * dt * k1[ii]; rhs(qtemp, k2, dv, df, ib, speed); for (ii = 0; ii < ne * np; ii++) qtemp[ii] = qq[ii] + 0.5 * dt * k2[ii]; rhs(qtemp, k3, dv, df, ib, speed); for (ii = 0; ii < ne * np; ii++) qtemp[ii] = qq[ii] + dt * k3[ii]; rhs(qtemp, k4, dv, df, ib, speed); for (ii = 0; ii < ne * np; ii++) qq[ii] += 1. / 6. * dt * (k1[ii] + 2 * k2[ii] + 2 * k3[ii] + k4[ii]); rtime += dt; nstep += 1; if (nstep % 10000 == 0 && rank == 0) printf("nstep = %10ld, %5.1f%% complete\n", nstep, rtime / tend * 100); } // timeloop ends here; if (rank != 0) { int nne = iend - ista; MPI_Isend(&nne, 1, MPI_INT, 0, 11, MPI_COMM_WORLD, &ser1); MPI_Isend(xx, ne * np, MPI_DOUBLE, 0, 22, MPI_COMM_WORLD, &ser2); MPI_Isend(qq, ne * np, MPI_DOUBLE, 0, 33, MPI_COMM_WORLD, &ser3); MPI_Wait(&ser1, &st); MPI_Wait(&ser2, &st); MPI_Wait(&ser3, &st); } double *bufx; double *bufq; int *istart; int *idisp; if (rank == 0) { printf("Integration complete\n"); if (tne > 200) { eres = 2; } else if (tne > 60) { eres = 3; } else if (tne > 30) { eres = 6; } else { eres = 10; } // final report printf("-----------------------------------------------\n"); printf("code type : c serial\n"); printf("Final time : %13.5e\n", rtime); printf("CFL : %13.5e\n", cfl); printf("DOF : %13d\n", tne * np); printf("No. 
of Elem : %13d\n", tne); printf("Order : %13d\n", np); printf("eres : %13d\n", eres); printf("time steps : %13ld\n", nstep); printf("-----------------------------------------------\n"); bufx = (double *)malloc(sizeof(double) * tne * np); bufq = (double *)malloc(sizeof(double) * tne * np); for (int i = 0; i < ne * np; i++) { bufx[i] = xx[i]; bufq[i] = qq[i]; } } if (rank == 0) { int index[nprocs]; index[0] = ne * np; int idx = index[0]; for (int i = 1; i < nprocs; i++) { MPI_Irecv(index + i, 1, MPI_INT, i, 11, MPI_COMM_WORLD, &rer1); MPI_Wait(&rer1, &st); index[i] *= np; MPI_Irecv(bufx + idx, index[i], MPI_DOUBLE, i, 22, MPI_COMM_WORLD, &rer2); MPI_Irecv(bufq + idx, index[i], MPI_DOUBLE, i, 33, MPI_COMM_WORLD, &rer3); MPI_Wait(&rer2, &st); MPI_Wait(&rer3, &st); idx += index[i]; } for(int i = 0; i < tne*np; i++){ printf("%f ", bufx[i]); } printf("\n"); for(int i = 0; i < tne*np; i++){ printf("%f ", bufq[i]); } printf("\n"); save_field(bufx, bufq, tne, roots, eres); t_end = clock(); printf("Motion time = %f msec\n", (double)(t_end - t_sta) / 1000.0); } free(roots); free(weights); free(ll); free(dl); free(dx); free(mesh); free(smat); free(xx); free(qq); free(qtemp); free(k1); free(k2); free(k3); free(k4); free(minv_vec); free(mmat); free(dv); free(mf); free(ib); free(fstar); free(df); MPI_Finalize(); return 0; }
/*
 * Initial_CntCoes2
 *
 * Computes an initial guess for the contraction coefficients CntCoes[][][]
 * of contracted basis orbitals, per locally-stored atom, and synchronizes
 * them across MPI ranks.
 *
 * Overall flow:
 *   1. Exchange the needed off-process blocks of the Hamiltonian nh and the
 *      overlap matrix OLP[0] between MPI ranks (size exchange, then data).
 *   2. Set trivial "identity" starting coefficients; if SICnt_switch==2,
 *      jump straight to the final MPI redistribution (goto Simple_Init).
 *   3. For each local atom: build a truncated cluster Hamiltonian/overlap,
 *      solve the generalized eigenproblem via Loewdin orthogonalization
 *      (Eigen_lapack), bisect for a local chemical potential, and choose
 *      coefficients that maximize Fermi-weighted overlap with the occupied
 *      eigenvectors.
 *   4. Optionally restrict optimization to the last orbital per L
 *      (SCnt_switch==1) and/or average coefficients over each species
 *      (ACnt_switch==1).
 *   5. Orthonormalize (Gauss elimination + Gram-Schmidt), normalize with
 *      the on-site overlap, then broadcast/redistribute via MPI.
 *
 * Parameters:
 *   nh  - 5-index Hamiltonian array, indexed [spin][Mc_AN][h_AN][i][j];
 *         updated in place with blocks received from other ranks.
 *   OLP - 5-index overlap array; OLP[0] is exchanged/used the same way.
 *
 * NOTE(review): relies on many file-scope globals (List_YOUSO, CntCoes,
 * Spe_*, F_Snd_Num, natn, FNAN, ...) declared elsewhere in the project;
 * their semantics are inferred from usage here and should be confirmed
 * against the headers.
 */
void Initial_CntCoes2(double *****nh, double *****OLP)
{
  static int firsttime=1;   /* guards the one-time PrintMemory report */
  int i,j,l,n,n2,i1,j1;
  int wan;
  int po;
  int Second_switch;
  double time0;
  int Mc_AN,Gc_AN,wanA;
  int q,q0,al0,al1,pmax;
  int Mul0,Mul1,deg_on,deg_num;
  int al,p,L0,M0,p0,Np;
  int ig,im,jg,ian,jan,kl,m,Gi;
  int mu,nu,Anum,Bnum,NUM,maxp;
  int h_AN,Gh_AN,Hwan,tno1,tno2,Cwan,spin;
  double Beta0,scaleF,maxc;
  double *ko,*C0,*koSys;
  double **S,**Hks,**D,*abs_sum,*M1,**C,**B;
  int *jun,*ponu;
  double tmp0,tmp1,Max0,rc1,fugou,MaxV;
  double sum,TZ;
  double Num_State,x,FermiF,Dnum;
  double LChemP_MAX,LChemP_MIN,LChemP;
  double TStime,TEtime;
  double *tmp_array;
  double *tmp_array2;
  int *MP,*dege;
  int **tmp_index;
  int ***tmp_index1;
  int ***tmp_index2;
  double *Tmp_CntCoes;
  double **Check_ko;
  double *Weight_ko;
  double ***CntCoes_Spe;
  double ***My_CntCoes_Spe;
  double **InProd;
  int *Snd_CntCoes_Size;
  int *Rcv_CntCoes_Size;
  int *Snd_H_Size,*Rcv_H_Size;
  int *Snd_S_Size,*Rcv_S_Size;
  int size1,size2,num;
  int numprocs,myid,tag=999,ID,IDS,IDR;

  MPI_Status stat;
  MPI_Request request;

  /* MPI */
  MPI_Comm_size(mpi_comm_level1,&numprocs);
  MPI_Comm_rank(mpi_comm_level1,&myid);

  dtime(&TStime);

  /****************************************************
    allocation of arrays:

    int MP[List_YOUSO[8]];
    int tmp_index[List_YOUSO[25]+1]
                 [2*(List_YOUSO[25]+1)+1];
    int tmp_index1[List_YOUSO[25]+1]
                  [List_YOUSO[24]]
                  [2*(List_YOUSO[25]+1)+1];
    int tmp_index2[List_YOUSO[25]+1]
                  [List_YOUSO[24]]
                  [2*(List_YOUSO[25]+1)+1];
    double Tmp_CntCoes[List_YOUSO[24]]
    double Check_ko[List_YOUSO[25]+1]
                   [2*(List_YOUSO[25]+1)+1];
    double Weight_ko[List_YOUSO[7]];
  ****************************************************/

  MP = (int*)malloc(sizeof(int)*List_YOUSO[8]);

  tmp_index = (int**)malloc(sizeof(int*)*(List_YOUSO[25]+1));
  for (i=0; i<(List_YOUSO[25]+1); i++){
    tmp_index[i] = (int*)malloc(sizeof(int)*(2*(List_YOUSO[25]+1)+1));
  }

  tmp_index1 = (int***)malloc(sizeof(int**)*(List_YOUSO[25]+1));
  for (i=0; i<(List_YOUSO[25]+1); i++){
    tmp_index1[i] = (int**)malloc(sizeof(int*)*List_YOUSO[24]);
    for (j=0; j<List_YOUSO[24]; j++){
      tmp_index1[i][j] = (int*)malloc(sizeof(int)*(2*(List_YOUSO[25]+1)+1));
    }
  }

  tmp_index2 = (int***)malloc(sizeof(int**)*(List_YOUSO[25]+1));
  for (i=0; i<(List_YOUSO[25]+1); i++){
    tmp_index2[i] = (int**)malloc(sizeof(int*)*List_YOUSO[24]);
    for (j=0; j<List_YOUSO[24]; j++){
      tmp_index2[i][j] = (int*)malloc(sizeof(int)*(2*(List_YOUSO[25]+1)+1));
    }
  }

  Tmp_CntCoes = (double*)malloc(sizeof(double)*List_YOUSO[24]);

  Check_ko = (double**)malloc(sizeof(double*)*(List_YOUSO[25]+1));
  for (i=0; i<(List_YOUSO[25]+1); i++){
    Check_ko[i] = (double*)malloc(sizeof(double)*(2*(List_YOUSO[25]+1)+1));
  }

  Weight_ko = (double*)malloc(sizeof(double)*List_YOUSO[7]);

  /* per-rank bookkeeping of MPI message sizes */
  Snd_CntCoes_Size = (int*)malloc(sizeof(int)*Num_Procs);
  Rcv_CntCoes_Size = (int*)malloc(sizeof(int)*Num_Procs);
  Snd_H_Size = (int*)malloc(sizeof(int)*Num_Procs);
  Rcv_H_Size = (int*)malloc(sizeof(int)*Num_Procs);
  Snd_S_Size = (int*)malloc(sizeof(int)*Num_Procs);
  Rcv_S_Size = (int*)malloc(sizeof(int)*Num_Procs);

  /* PrintMemory: report the dominant allocations once per run */
  if (firsttime) {
    PrintMemory("Initial_CntCoes: tmp_index",sizeof(int)*(List_YOUSO[25]+1)*
                                             (2*(List_YOUSO[25]+1)+1),NULL);
    PrintMemory("Initial_CntCoes: tmp_index1",sizeof(int)*(List_YOUSO[25]+1)*
                                              List_YOUSO[24]*(2*(List_YOUSO[25]+1)+1) ,NULL);
    PrintMemory("Initial_CntCoes: tmp_index2",sizeof(int)*(List_YOUSO[25]+1)*
                                              List_YOUSO[24]*(2*(List_YOUSO[25]+1)+1) ,NULL);
    PrintMemory("Initial_CntCoes: Check_ko",sizeof(double)*(List_YOUSO[25]+1)*
                                            (2*(List_YOUSO[25]+1)+1),NULL);
    firsttime=0;
  }

  /****************************************************
    MPI: nh(=H)

    Ring-style exchange: on step ID, this rank sends to
    (myid+ID)%numprocs and receives from (myid-ID)%numprocs.
  ****************************************************/

  /***********************************
             set data size
  ************************************/

  for (ID=0; ID<numprocs; ID++){

    IDS = (myid + ID) % numprocs;
    IDR = (myid - ID + numprocs) % numprocs;

    if (ID!=0){
      tag = 999;

      /* find data size to send block data */
      if (F_Snd_Num[IDS]!=0){
        size1 = 0;
        for (spin=0; spin<=SpinP_switch; spin++){
          for (n=0; n<F_Snd_Num[IDS]; n++){
            Mc_AN = Snd_MAN[IDS][n];
            Gc_AN = Snd_GAN[IDS][n];
            Cwan = WhatSpecies[Gc_AN];
            tno1 = Spe_Total_NO[Cwan];
            for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
              Gh_AN = natn[Gc_AN][h_AN];
              Hwan = WhatSpecies[Gh_AN];
              tno2 = Spe_Total_NO[Hwan];
              size1 += tno1*tno2;
            }
          }
        }
        Snd_H_Size[IDS] = size1;
        MPI_Isend(&size1, 1, MPI_INT, IDS, tag, mpi_comm_level1, &request);
      }
      else{
        Snd_H_Size[IDS] = 0;
      }

      /* receiving of size of data */
      if (F_Rcv_Num[IDR]!=0){
        MPI_Recv(&size2, 1, MPI_INT, IDR, tag, mpi_comm_level1, &stat);
        Rcv_H_Size[IDR] = size2;
      }
      else{
        Rcv_H_Size[IDR] = 0;
      }

      /* complete the nonblocking send before `request` is reused */
      if (F_Snd_Num[IDS]!=0) MPI_Wait(&request,&stat);
    }
    else{
      Snd_H_Size[IDS] = 0;
      Rcv_H_Size[IDR] = 0;
    }
  }

  /***********************************
             data transfer
  ************************************/

  tag = 999;
  for (ID=0; ID<numprocs; ID++){

    IDS = (myid + ID) % numprocs;
    IDR = (myid - ID + numprocs) % numprocs;

    if (ID!=0){

      /*****************************
              sending of data
      *****************************/

      if (F_Snd_Num[IDS]!=0){

        size1 = Snd_H_Size[IDS];

        /* allocation of array */
        tmp_array = (double*)malloc(sizeof(double)*size1);

        /* multidimensional array to vector array */
        num = 0;
        for (spin=0; spin<=SpinP_switch; spin++){
          for (n=0; n<F_Snd_Num[IDS]; n++){
            Mc_AN = Snd_MAN[IDS][n];
            Gc_AN = Snd_GAN[IDS][n];
            Cwan = WhatSpecies[Gc_AN];
            tno1 = Spe_Total_NO[Cwan];
            for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
              Gh_AN = natn[Gc_AN][h_AN];
              Hwan = WhatSpecies[Gh_AN];
              tno2 = Spe_Total_NO[Hwan];
              for (i=0; i<tno1; i++){
                for (j=0; j<tno2; j++){
                  tmp_array[num] = nh[spin][Mc_AN][h_AN][i][j];
                  num++;
                }
              }
            }
          }
        }

        MPI_Isend(&tmp_array[0], size1, MPI_DOUBLE, IDS, tag, mpi_comm_level1, &request);
      }

      /*****************************
         receiving of block data
      *****************************/

      if (F_Rcv_Num[IDR]!=0){

        size2 = Rcv_H_Size[IDR];

        /* allocation of array */
        tmp_array2 = (double*)malloc(sizeof(double)*size2);

        MPI_Recv(&tmp_array2[0], size2, MPI_DOUBLE, IDR, tag, mpi_comm_level1, &stat);

        /* unpack into nh at the matom positions of the sender's atoms */
        num = 0;
        for (spin=0; spin<=SpinP_switch; spin++){
          Mc_AN = S_TopMAN[IDR] - 1; /* S_TopMAN should be used. */
          for (n=0; n<F_Rcv_Num[IDR]; n++){
            Mc_AN++;
            Gc_AN = Rcv_GAN[IDR][n];
            Cwan = WhatSpecies[Gc_AN];
            tno1 = Spe_Total_NO[Cwan];
            for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
              Gh_AN = natn[Gc_AN][h_AN];
              Hwan = WhatSpecies[Gh_AN];
              tno2 = Spe_Total_NO[Hwan];
              for (i=0; i<tno1; i++){
                for (j=0; j<tno2; j++){
                  nh[spin][Mc_AN][h_AN][i][j] = tmp_array2[num];
                  num++;
                }
              }
            }
          }
        }

        /* freeing of array */
        free(tmp_array2);
      }

      if (F_Snd_Num[IDS]!=0){
        MPI_Wait(&request,&stat);
        free(tmp_array); /* freeing of array */
      }
    }
  }

  /****************************************************
    MPI: OLP[0]

    Same ring exchange as for nh, but only the
    spin-independent overlap block OLP[0].
  ****************************************************/

  /***********************************
             set data size
  ************************************/

  for (ID=0; ID<numprocs; ID++){

    IDS = (myid + ID) % numprocs;
    IDR = (myid - ID + numprocs) % numprocs;

    if (ID!=0){
      tag = 999;

      /* find data size to send block data */
      if (F_Snd_Num[IDS]!=0){
        size1 = 0;
        for (n=0; n<F_Snd_Num[IDS]; n++){
          Mc_AN = Snd_MAN[IDS][n];
          Gc_AN = Snd_GAN[IDS][n];
          Cwan = WhatSpecies[Gc_AN];
          tno1 = Spe_Total_NO[Cwan];
          for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
            Gh_AN = natn[Gc_AN][h_AN];
            Hwan = WhatSpecies[Gh_AN];
            tno2 = Spe_Total_NO[Hwan];
            size1 += tno1*tno2;
          }
        }
        Snd_S_Size[IDS] = size1;
        MPI_Isend(&size1, 1, MPI_INT, IDS, tag, mpi_comm_level1, &request);
      }
      else{
        Snd_S_Size[IDS] = 0;
      }

      /* receiving of size of data */
      if (F_Rcv_Num[IDR]!=0){
        MPI_Recv(&size2, 1, MPI_INT, IDR, tag, mpi_comm_level1, &stat);
        Rcv_S_Size[IDR] = size2;
      }
      else{
        Rcv_S_Size[IDR] = 0;
      }

      if (F_Snd_Num[IDS]!=0) MPI_Wait(&request,&stat);
    }
    else{
      Snd_S_Size[IDS] = 0;
      Rcv_S_Size[IDR] = 0;
    }
  }

  /***********************************
             data transfer
  ************************************/

  tag = 999;
  for (ID=0; ID<numprocs; ID++){

    IDS = (myid + ID) % numprocs;
    IDR = (myid - ID + numprocs) % numprocs;

    if (ID!=0){

      /*****************************
              sending of data
      *****************************/

      if (F_Snd_Num[IDS]!=0){

        size1 = Snd_S_Size[IDS];

        /* allocation of array */
        tmp_array = (double*)malloc(sizeof(double)*size1);

        /* multidimensional array to vector array */
        num = 0;
        for (n=0; n<F_Snd_Num[IDS]; n++){
          Mc_AN = Snd_MAN[IDS][n];
          Gc_AN = Snd_GAN[IDS][n];
          Cwan = WhatSpecies[Gc_AN];
          tno1 = Spe_Total_NO[Cwan];
          for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
            Gh_AN = natn[Gc_AN][h_AN];
            Hwan = WhatSpecies[Gh_AN];
            tno2 = Spe_Total_NO[Hwan];
            for (i=0; i<tno1; i++){
              for (j=0; j<tno2; j++){
                tmp_array[num] = OLP[0][Mc_AN][h_AN][i][j];
                num++;
              }
            }
          }
        }

        MPI_Isend(&tmp_array[0], size1, MPI_DOUBLE, IDS, tag, mpi_comm_level1, &request);
      }

      /*****************************
         receiving of block data
      *****************************/

      if (F_Rcv_Num[IDR]!=0){

        size2 = Rcv_S_Size[IDR];

        /* allocation of array */
        tmp_array2 = (double*)malloc(sizeof(double)*size2);

        MPI_Recv(&tmp_array2[0], size2, MPI_DOUBLE, IDR, tag, mpi_comm_level1, &stat);

        num = 0;
        Mc_AN = S_TopMAN[IDR] - 1; /* S_TopMAN should be used. */
        for (n=0; n<F_Rcv_Num[IDR]; n++){
          Mc_AN++;
          Gc_AN = Rcv_GAN[IDR][n];
          Cwan = WhatSpecies[Gc_AN];
          tno1 = Spe_Total_NO[Cwan];
          for (h_AN=0; h_AN<=FNAN[Gc_AN]; h_AN++){
            Gh_AN = natn[Gc_AN][h_AN];
            Hwan = WhatSpecies[Gh_AN];
            tno2 = Spe_Total_NO[Hwan];
            for (i=0; i<tno1; i++){
              for (j=0; j<tno2; j++){
                OLP[0][Mc_AN][h_AN][i][j] = tmp_array2[num];
                num++;
              }
            }
          }
        }

        /* freeing of array */
        free(tmp_array2);
      }

      if (F_Snd_Num[IDS]!=0){
        MPI_Wait(&request,&stat);
        free(tmp_array); /* freeing of array */
      }
    }
  }

  /****************************************************
     set of "initial" coefficients of the contraction

     Identity-like seed: coefficient 1.0 on the Mul0-th
     primitive of each contracted orbital, 0.0 elsewhere.
  ****************************************************/

  Second_switch = 1;

  Simple_InitCnt[0] = 0;
  Simple_InitCnt[1] = 0;
  Simple_InitCnt[2] = 0;
  Simple_InitCnt[3] = 0;
  Simple_InitCnt[4] = 0;
  Simple_InitCnt[5] = 0;

  for (Mc_AN=1; Mc_AN<=Matomnum; Mc_AN++){
    Gc_AN = M2G[Mc_AN];
    wan = WhatSpecies[Gc_AN];

    for (al=0; al<Spe_Total_CNO[wan]; al++){
      for (p=0; p<Spe_Specified_Num[wan][al]; p++){
        CntCoes[Mc_AN][al][p] = 0.0;
      }
    }

    al = -1;
    for (L0=0; L0<=Spe_MaxL_Basis[wan]; L0++){
      for (Mul0=0; Mul0<Spe_Num_CBasis[wan][L0]; Mul0++){
        for (M0=0; M0<=2*L0; M0++){
          al++;
          CntCoes[Mc_AN][al][Mul0] = 1.0;
        }
      }
    }
  }

  /* simple-initialization mode: skip the optimization entirely */
  if (SICnt_switch==2) goto Simple_Init;

  /****************************************************
      Setting of Hamiltonian and overlap matrices

         MP indicates the starting position of
              atom i in arrays H and S
  ****************************************************/

  for (Mc_AN=1; Mc_AN<=Matomnum; Mc_AN++){

    Gc_AN = M2G[Mc_AN];
    wan = WhatSpecies[Gc_AN];

    /* enlarge the cutoff for atoms with few neighbors so the
       cluster eigenproblem stays reasonably sized/complete */
    if      ( FNAN[Gc_AN]<30 ) scaleF = 1.6;
    else if ( FNAN[Gc_AN]<40 ) scaleF = 1.4;
    else if ( FNAN[Gc_AN]<50 ) scaleF = 1.2;
    else                       scaleF = 1.0;

    rc1 = scaleF*Spe_Atom_Cut1[wan];

    /***********************************************
      MP indicates the starting position of
           atom i in arrays H and S
    ***********************************************/

    Anum = 1;
    TZ = 0.0;   /* total core charge of the cluster (electron count target) */
    for (i=0; i<=FNAN[Gc_AN]; i++){
      if (Dis[Gc_AN][i]<=rc1){
        MP[i] = Anum;
        Gi = natn[Gc_AN][i];
        wanA = WhatSpecies[Gi];
        Anum = Anum + Spe_Total_NO[wanA];
        TZ = TZ + Spe_Core_Charge[wanA];
      }
    }
    NUM = Anum - 1;

    /****************************************************
                        allocation
    ****************************************************/

    n2 = NUM + 3;   /* small safety margin over the matrix dimension */

    koSys = (double*)malloc(sizeof(double)*n2);
    ko = (double*)malloc(sizeof(double)*n2);
    abs_sum = (double*)malloc(sizeof(double)*n2);
    M1 = (double*)malloc(sizeof(double)*n2);
    dege = (int*)malloc(sizeof(int)*n2);
    C0 = (double*)malloc(sizeof(double)*n2);

    S = (double**)malloc(sizeof(double*)*n2);
    Hks = (double**)malloc(sizeof(double*)*n2);
    D = (double**)malloc(sizeof(double*)*n2);
    C = (double**)malloc(sizeof(double*)*n2);
    B = (double**)malloc(sizeof(double*)*n2);
    for (i=0; i<n2; i++){
      S[i] = (double*)malloc(sizeof(double)*n2);
      Hks[i] = (double*)malloc(sizeof(double)*n2);
      D[i] = (double*)malloc(sizeof(double)*n2);
      C[i] = (double*)malloc(sizeof(double)*n2);
      B[i] = (double*)malloc(sizeof(double)*n2);
    }

    jun = (int*)malloc(sizeof(int)*n2);
    ponu = (int*)malloc(sizeof(int)*n2);

    InProd = (double**)malloc(sizeof(double*)*n2);
    for (i=0; i<n2; i++){
      InProd[i] = (double*)malloc(sizeof(double)*n2);
    }

    /****************************************************
                           calc
    ****************************************************/

    if (3<=level_stdout){
      printf("<Initial_CntCoes> Mc_AN=%2d Gc_AN=%2d NUM=%2d\n",Mc_AN,Gc_AN,NUM);
    }

    /* fill cluster overlap S and (spin-averaged) Hamiltonian Hks */
    for (i=0; i<=FNAN[Gc_AN]; i++){
      if (Dis[Gc_AN][i]<=rc1){
        ig = natn[Gc_AN][i];
        im = S_G2M[ig]; /* S_G2M must be used. */
        ian = Spe_Total_NO[WhatSpecies[ig]];
        Anum = MP[i];
        for (j=0; j<=FNAN[Gc_AN]; j++){
          if (Dis[Gc_AN][j]<=rc1){
            kl = RMI1[Mc_AN][i][j];
            jg = natn[Gc_AN][j];
            jan = Spe_Total_NO[WhatSpecies[jg]];
            Bnum = MP[j];
            if (0<=kl){   /* pair within interaction range; kl<0 means no stored block */
              for (m=0; m<ian; m++){
                for (n=0; n<jan; n++){
                  S[Anum+m][Bnum+n] = OLP[0][im][kl][m][n];
                  if (SpinP_switch==0)
                    Hks[Anum+m][Bnum+n] = nh[0][im][kl][m][n];
                  else
                    Hks[Anum+m][Bnum+n] = 0.5*(nh[0][im][kl][m][n] + nh[1][im][kl][m][n]);
                }
              }
            }
            else{
              for (m=0; m<ian; m++){
                for (n=0; n<jan; n++){
                  S[Anum+m][Bnum+n] = 0.0;
                  Hks[Anum+m][Bnum+n] = 0.0;
                }
              }
            }
          }
        }
      }
    }

    if (3<=level_stdout){
      printf("\n");
      printf("overlap matrix Gc_AN=%2d\n",Gc_AN);
      for (i1=1; i1<=NUM; i1++){
        for (j1=1; j1<=NUM; j1++){
          printf("%6.3f ",S[i1][j1]);
        }
        printf("\n");
      }
      printf("\n");
      printf("hamiltonian matrix Gc_AN=%2d\n",Gc_AN);
      for (i1=1; i1<=NUM; i1++){
        for (j1=1; j1<=NUM; j1++){
          printf("%6.3f ",Hks[i1][j1]);
        }
        printf("\n");
      }
      printf("\n");
    }

    /***********************************************
       Solve the generalized eigenvalue problem
       (Loewdin: diagonalize S, scale by 1/sqrt(ko),
        diagonalize transformed H, back-transform)
    ***********************************************/

    Eigen_lapack(S,koSys,NUM,NUM);

    /***********************************************
          Searching of negative eigenvalues
    ************************************************/

    /* clamp tiny/negative overlap eigenvalues so 1/sqrt() is finite */
    for (l=1; l<=NUM; l++){
      if (koSys[l]<0.0){
        koSys[l] = 1.0e-7;
        if (3<=level_stdout){
          printf("<Init_CntCoes> Negative EV of OLP %2d %15.12f\n",l,koSys[l]);
        }
      }
    }

    for (l=1; l<=NUM; l++){
      M1[l] = 1.0/sqrt(koSys[l]);
    }

    /* in-place transpose of S (eigenvectors) for the contractions below */
    for (i1=1; i1<=NUM; i1++){
      for (j1=i1+1; j1<=NUM; j1++){
        tmp0 = S[i1][j1];
        tmp1 = S[j1][i1];
        S[j1][i1] = tmp0;
        S[i1][j1] = tmp1;
      }
    }

    /* C = H * U * M1 (columns scaled by 1/sqrt(eigenvalue)) */
    for (i1=1; i1<=NUM; i1++){
      for (j1=1; j1<=NUM; j1++){
        sum = 0.0;
        tmp0 = M1[j1];
        for (l=1; l<=NUM; l++){
          sum = sum + Hks[i1][l]*S[j1][l]*tmp0;
        }
        C[j1][i1] = sum;
      }
    }

    /* B = M1 * U^T * C : the orthogonalized Hamiltonian */
    for (i1=1; i1<=NUM; i1++){
      for (j1=1; j1<=NUM; j1++){
        sum = 0.0;
        tmp0 = M1[i1];
        for (l=1; l<=NUM; l++){
          sum = sum + tmp0*S[i1][l]*C[j1][l];
        }
        B[i1][j1] = sum;
      }
    }

    for (i1=1; i1<=NUM; i1++){
      for (j1=1; j1<=NUM; j1++){
        D[i1][j1] = B[i1][j1];
      }
    }

    Eigen_lapack(D,koSys,NUM,NUM);

    /***********************************************
      transformation to the original eigenvectors.
                      NOTE 244P
    ***********************************************/

    for (i1=1; i1<=NUM; i1++){
      for (j1=1; j1<=NUM; j1++){
        C[i1][j1] = 0.0;
      }
    }

    /* transpose S back, and transpose D */
    for (i1=1; i1<=NUM; i1++){
      for (j1=i1+1; j1<=NUM; j1++){
        tmp0 = S[i1][j1];
        tmp1 = S[j1][i1];
        S[j1][i1] = tmp0;
        S[i1][j1] = tmp1;

        tmp0 = D[i1][j1];
        tmp1 = D[j1][i1];
        D[j1][i1] = tmp0;
        D[i1][j1] = tmp1;
      }
    }

    for (i1=1; i1<=NUM; i1++){
      for (j1=1; j1<=NUM; j1++){
        sum = 0.0;
        for (l=1; l<=NUM; l++){
          sum = sum + S[i1][l]*M1[l]*D[j1][l];
        }
        C[i1][j1] = sum;
      }
    }

    /****************************************************
         searching of a local chemical potential
         (bisection on electron count at T=1500 K)
    ****************************************************/

    po = 0;
    LChemP_MAX = 10.0;
    LChemP_MIN =-10.0;

    Beta0 = 1.0/(kB*1500.0/eV2Hartree);

    do{
      LChemP = 0.50*(LChemP_MAX + LChemP_MIN);
      Num_State = 0.0;
      for (i1=1; i1<=NUM; i1++){
        x = (koSys[i1] - LChemP)*Beta0;
        if (x<=-30.0) x = -30.0;   /* clamp exp() argument */
        if (30.0<=x)  x = 30.0;
        FermiF = 2.0/(1.0 + exp(x));   /* factor 2: spin degeneracy */
        Num_State = Num_State + FermiF;
      }
      Dnum = TZ - Num_State;
      if (0.0<=Dnum) LChemP_MIN = LChemP;
      else           LChemP_MAX = LChemP;
      if (fabs(Dnum)<0.000000000001) po = 1;
    } while (po==0);

    if (3<=level_stdout){
      for (i1=1; i1<=NUM; i1++){
        x = (koSys[i1] - LChemP)*Beta0;
        if (x<=-30.0) x = -30.0;
        if (30.0<=x)  x = 30.0;
        FermiF = 1.0/(1.0 + exp(x));
        printf("<Init_CntCoes> %2d eigenvalue=%15.12f FermiF=%15.12f\n",
               i1,koSys[i1],FermiF);
      }
    }

    if (3<=level_stdout){
      printf("\n");
      printf("first C Gc_AN=%2d\n",Gc_AN);
      for (i1=1; i1<=NUM; i1++){
        for (j1=1; j1<=NUM; j1++){
          printf("%10.6f ",C[i1][j1]);
        }
        printf("\n");
      }
    }

    if (3<=level_stdout){
      printf("<Init_CntCoes> LChemP=%15.12f\n",LChemP);
    }

    /************************************************
        maximize the "overlap" between wave
        functions and contracted basis functions
    ************************************************/

    /* make a table function converting [L0][Mul0][M0] to "al"
       for primitive orbitals */

    al = -1;
    for (L0=0; L0<=Spe_MaxL_Basis[wan]; L0++){
      for (Mul0=0; Mul0<Spe_Num_Basis[wan][L0]; Mul0++){
        for (M0=0; M0<=2*L0; M0++){
          al++;
          tmp_index1[L0][Mul0][M0] = al;
        }
      }
    }

    /* make a table function converting [L0][Mul0][M0] to "al"
       for contracted orbitals */

    al = -1;
    for (L0=0; L0<=Spe_MaxL_Basis[wan]; L0++){
      for (Mul0=0; Mul0<Spe_Num_CBasis[wan][L0]; Mul0++){
        for (M0=0; M0<=2*L0; M0++){
          al++;
          tmp_index2[L0][Mul0][M0] = al;
        }
      }
    }

    /* loop for L0 */

    for (L0=0; L0<=Spe_MaxL_Basis[wan]; L0++){

      /* Hks is reused here as the (small) projection matrix per L0 */
      for (Mul0=0; Mul0<Spe_Num_Basis[wan][L0]; Mul0++){
        for (Mul1=0; Mul1<Spe_Num_Basis[wan][L0]; Mul1++){
          Hks[Mul0+1][Mul1+1] = 0.0;
        }
      }

      for (M0=0; M0<=2*L0; M0++){

        /* inner products of eigenvectors with on-site primitives */
        for (Mul0=0; Mul0<Spe_Num_Basis[wan][L0]; Mul0++){
          i = tmp_index1[L0][Mul0][M0];
          for (mu=1; mu<=NUM; mu++){
            InProd[mu][Mul0] = C[MP[0]+i][mu];
          } /* mu */
        } /* Mul0 */

        for (Mul0=0; Mul0<Spe_Num_Basis[wan][L0]; Mul0++){
          for (Mul1=0; Mul1<Spe_Num_Basis[wan][L0]; Mul1++){
            sum = 0.0;
            for (mu=1; mu<=NUM; mu++){
              x = (koSys[mu] - LChemP)*Beta0;
              if (x<=-30.0) x = -30.0;
              if (30.0<=x)  x = 30.0;
              FermiF = 1.0/(1.0 + exp(x));
              sum += FermiF*InProd[mu][Mul0]*InProd[mu][Mul1];
            }
            /* negative sign: Eigen_lapack finds lowest eigenvalues,
               so maximizing overlap becomes minimizing -overlap */
            Hks[Mul0+1][Mul1+1] -= sum;
          }

          /* for calculation of a single atom: tiny diagonal shift to
             break degeneracies deterministically */
          tmp0 = (double)(Spe_Num_Basis[wan][L0]-Mul0);
          Hks[Mul0+1][Mul0+1] += -1.0e-9*tmp0*tmp0;
        }

      } /* M0 */

      /*
      M0 = 0;
      for (Mul0=0; Mul0<Spe_Num_Basis[wan][L0]; Mul0++){
        i = tmp_index1[L0][Mul0][M0];
        for (Mul1=0; Mul1<Spe_Num_Basis[wan][L0]; Mul1++){
          j = tmp_index1[L0][Mul1][M0];
          S[Mul0+1][Mul1+1] = OLP[0][Mc_AN][0][i][j];
        }
      }
      */

      /* use the identity as the metric for this small eigenproblem */
      for (Mul0=0; Mul0<Spe_Num_Basis[wan][L0]; Mul0++){
        for (Mul1=0; Mul1<Spe_Num_Basis[wan][L0]; Mul1++){
          S[Mul0+1][Mul1+1] = 0.0;
        }
        S[Mul0+1][Mul0+1] = 1.0;
      }

      if (3<=level_stdout){
        printf("<Hks Gc_AN=%2d L0=%2d>\n",Gc_AN,L0);
        for (Mul0=0; Mul0<Spe_Num_Basis[wan][L0]; Mul0++){
          for (Mul1=0; Mul1<Spe_Num_Basis[wan][L0]; Mul1++){
            printf("%15.10f ",Hks[Mul0+1][Mul1+1]);
          }
          printf("\n");
        }
      }

      if (3<=level_stdout){
        printf("<S Gc_AN=%2d L0=%2d>\n",Gc_AN,L0);
        for (Mul0=0; Mul0<Spe_Num_Basis[wan][L0]; Mul0++){
          for (Mul1=0; Mul1<Spe_Num_Basis[wan][L0]; Mul1++){
            printf("%15.10f ",S[Mul0+1][Mul1+1]);
          }
          printf("\n");
        }
      }

      /* diagonalization (same Loewdin sequence as above, but with
         the small Np x Np projection problem; S here is identity) */

      Np = Spe_Num_Basis[wan][L0];
      Eigen_lapack(S,ko,Np,Np);

      for (l=1; l<=Np; l++){
        M1[l] = 1.0/sqrt(ko[l]);
      }

      for (i1=1; i1<=Np; i1++){
        for (j1=1; j1<=Np; j1++){
          sum = 0.0;
          for (l=1; l<=Np; l++){
            sum = sum + Hks[i1][l]*S[l][j1]*M1[j1];
          }
          C[i1][j1] = sum;
        }
      }

      for (i1=1; i1<=Np; i1++){
        for (j1=1; j1<=Np; j1++){
          sum = 0.0;
          for (l=1; l<=Np; l++){
            sum = sum + M1[i1]*S[l][i1]*C[l][j1];
          }
          B[i1][j1] = sum;
        }
      }

      for (i1=1; i1<=Np; i1++){
        for (j1=1; j1<=Np; j1++){
          D[i1][j1] = B[i1][j1];
        }
      }

      Eigen_lapack(D,ko,Np,Np);

      /* transformation to the original eigenvectors */

      for (i1=1; i1<=Np; i1++){
        for (j1=1; j1<=Np; j1++){
          C[i1][j1] = 0.0;
        }
      }

      for (i1=1; i1<=Np; i1++){
        for (j1=1; j1<=Np; j1++){
          sum = 0.0;
          for (l=1; l<=Np; l++){
            sum = sum + S[i1][l]*M1[l]*D[l][j1];
          }
          C[i1][j1] = sum;
        }
      }

      if (3<=level_stdout){
        printf("<Eigenvalues Gc_AN=%2d L0=%2d>\n",Gc_AN,L0);
        for (Mul0=0; Mul0<Spe_Num_Basis[wan][L0]; Mul0++){
          printf("Mul=%2d ko=%15.12f\n",Mul0,ko[Mul0+1]);
        }
        printf("<C Gc_AN=%2d L0=%2d>\n",Gc_AN,L0);
        for (Mul0=0; Mul0<Spe_Num_Basis[wan][L0]; Mul0++){
          for (Mul1=0; Mul1<Spe_Num_Basis[wan][L0]; Mul1++){
            printf("%15.10f ",C[Mul0+1][Mul1+1]);
          }
          printf("\n");
        }
      }

      /* set up contraction coefficients from the eigenvectors */

      for (M0=0; M0<=2*L0; M0++){
        for (Mul0=0; Mul0<Spe_Num_CBasis[wan][L0]; Mul0++){

          al = tmp_index2[L0][Mul0][M0];

          /* if (SCnt_switch==1) */
          /* the last contracted orbital per L takes the lowest eigenvector */
          if ( SCnt_switch==1 && Mul0==(Spe_Num_CBasis[wan][L0]-1) ){
            for (p=0; p<Spe_Num_Basis[wan][L0]; p++){
              CntCoes[Mc_AN][al][p] = C[p+1][1];
            }
          }
          else {
            for (p=0; p<Spe_Num_Basis[wan][L0]; p++){
              CntCoes[Mc_AN][al][p] = C[p+1][Mul0+1];;
            }
          }

          /* fix the overall sign so the largest-magnitude
             coefficient is positive */
          maxc = -1.0e+10;
          for (p=0; p<Spe_Num_Basis[wan][L0]; p++){
            if (maxc<fabs(CntCoes[Mc_AN][al][p])){
              maxc = fabs(CntCoes[Mc_AN][al][p]);
              maxp = p;
            }
          }
          tmp0 = sgn(CntCoes[Mc_AN][al][maxp]);
          for (p=0; p<Spe_Num_Basis[wan][L0]; p++){
            CntCoes[Mc_AN][al][p] *= tmp0;
          }
        }
      }

    } /* L0 */

    if (3<=level_stdout){
      for (al=0; al<Spe_Total_CNO[wan]; al++){
        for (p=0; p<Spe_Specified_Num[wan][al]; p++){
          printf("A Init_CntCoes Mc_AN=%2d Gc_AN=%2d al=%2d p=%2d %15.12f\n",
                 Mc_AN,Gc_AN,al,p,CntCoes[Mc_AN][al][p]);
        }
      }
    }

    /****************************************************
                       free arrays
    ****************************************************/

    for (i=0; i<n2; i++){
      free(InProd[i]);
    }
    free(InProd);

    free(koSys);
    free(ko);
    free(abs_sum);
    free(M1);
    free(dege);
    free(jun);
    free(ponu);
    free(C0);

    for (i=0; i<n2; i++){
      free(S[i]);
      free(Hks[i]);
      free(D[i]);
      free(C[i]);
      free(B[i]);
    }
    free(S);
    free(Hks);
    free(D);
    free(C);
    free(B);

  } /* Mc_AN */

  /*************************************************************
     in case of optimization of only the last orbital in each L:
     reset all but the last contracted orbital to the identity seed
  *************************************************************/

  if (SCnt_switch==1){
    for (Mc_AN=1; Mc_AN<=Matomnum; Mc_AN++){
      Gc_AN = M2G[Mc_AN];
      wan = WhatSpecies[Gc_AN];
      al = -1;
      for (L0=0; L0<=Spe_MaxL_Basis[wan]; L0++){
        for (Mul0=0; Mul0<Spe_Num_CBasis[wan][L0]; Mul0++){
          for (M0=0; M0<=2*L0; M0++){
            al++;
            if ( Mul0!=(Spe_Num_CBasis[wan][L0]-1) ){
              for (p=0; p<Spe_Specified_Num[wan][al]; p++){
                CntCoes[Mc_AN][al][p] = 0.0;
              }
              CntCoes[Mc_AN][al][Mul0] = 1.0;
            }
          }
        }
      }
    }
  }

  /****************************************************
          average contraction coefficients
          over all atoms of the same species
  ****************************************************/

  if (ACnt_switch==1){

    /* allocation */
    My_CntCoes_Spe = (double***)malloc(sizeof(double**)*(SpeciesNum+1));
    for (i=0; i<=SpeciesNum; i++){
      My_CntCoes_Spe[i] = (double**)malloc(sizeof(double*)*List_YOUSO[7]);
      for (j=0; j<List_YOUSO[7]; j++){
        My_CntCoes_Spe[i][j] = (double*)malloc(sizeof(double)*List_YOUSO[24]);
      }
    }

    CntCoes_Spe = (double***)malloc(sizeof(double**)*(SpeciesNum+1));
    for (i=0; i<=SpeciesNum; i++){
      CntCoes_Spe[i] = (double**)malloc(sizeof(double*)*List_YOUSO[7]);
      for (j=0; j<List_YOUSO[7]; j++){
        CntCoes_Spe[i][j] = (double*)malloc(sizeof(double)*List_YOUSO[24]);
      }
    }

    /* initialize */
    for (wan=0; wan<SpeciesNum; wan++){
      for (i=0; i<List_YOUSO[7]; i++){
        for (j=0; j<List_YOUSO[24]; j++){
          My_CntCoes_Spe[wan][i][j] = 0.0;
        }
      }
    }

    /* local sum in a processor */
    for (Mc_AN=1; Mc_AN<=Matomnum; Mc_AN++){
      Gc_AN = M2G[Mc_AN];
      wan = WhatSpecies[Gc_AN];
      for (al=0; al<Spe_Total_CNO[wan]; al++){
        for (p=0; p<Spe_Specified_Num[wan][al]; p++){
          My_CntCoes_Spe[wan][al][p] += CntCoes[Mc_AN][al][p];
        }
      }
    }

    /* global sum by MPI */
    for (wan=0; wan<SpeciesNum; wan++){
      for (al=0; al<Spe_Total_CNO[wan]; al++){
        for (p=0; p<Spe_Specified_Num[wan][al]; p++){
          MPI_Allreduce(&My_CntCoes_Spe[wan][al][p],
                        &CntCoes_Spe[wan][al][p],
                        1, MPI_DOUBLE, MPI_SUM, mpi_comm_level1);
        }
      }
    }

    /* copy CntCoes_Spe to CntCoes_Species */
    for (wan=0; wan<SpeciesNum; wan++){
      for (al=0; al<Spe_Total_CNO[wan]; al++){
        for (p=0; p<Spe_Specified_Num[wan][al]; p++){
          CntCoes_Species[wan][al][p] = CntCoes_Spe[wan][al][p];
        }
      }
    }

    /* CntCoes_Spe to CntCoes
       NOTE(review): the summed values are written back without dividing by
       the number of atoms per species; presumably the later normalization
       makes the scale irrelevant — confirm against the normalization step */
    for (Mc_AN=1; Mc_AN<=Matomnum; Mc_AN++){
      Gc_AN = M2G[Mc_AN];
      wan = WhatSpecies[Gc_AN];
      for (al=0; al<Spe_Total_CNO[wan]; al++){
        for (p=0; p<Spe_Specified_Num[wan][al]; p++){
          CntCoes[Mc_AN][al][p] = CntCoes_Spe[wan][al][p];
        }
      }
    }

    /* free */
    for (i=0; i<=SpeciesNum; i++){
      for (j=0; j<List_YOUSO[7]; j++){
        free(My_CntCoes_Spe[i][j]);
      }
      free(My_CntCoes_Spe[i]);
    }
    free(My_CntCoes_Spe);

    for (i=0; i<=SpeciesNum; i++){
      for (j=0; j<List_YOUSO[7]; j++){
        free(CntCoes_Spe[i][j]);
      }
      free(CntCoes_Spe[i]);
    }
    free(CntCoes_Spe);
  }

  /**********************************************
    transformation of optimized orbitals by
    an extended Gauss elimination and
    the Gram-Schmidt orthogonalization
  ***********************************************/

  for (Mc_AN=1; Mc_AN<=Matomnum; Mc_AN++){

    Gc_AN = M2G[Mc_AN];
    wan = WhatSpecies[Gc_AN];

    al = -1;
    for (L0=0; L0<=Spe_MaxL_Basis[wan]; L0++){
      for (Mul0=0; Mul0<Spe_Num_CBasis[wan][L0]; Mul0++){
        for (M0=0; M0<=2*L0; M0++){
          al++;
          tmp_index2[L0][Mul0][M0] = al;
        }
      }
    }

    for (L0=0; L0<=Spe_MaxL_Basis[wan]; L0++){
      for (M0=0; M0<=2*L0; M0++){

        /**********************************************
               extended Gauss elimination: zero out
               the Mul0-th component of all other
               contracted orbitals in this (L0,M0)
        ***********************************************/

        for (Mul0=0; Mul0<Spe_Num_CBasis[wan][L0]; Mul0++){
          al0 = tmp_index2[L0][Mul0][M0];
          for (Mul1=0; Mul1<Spe_Num_CBasis[wan][L0]; Mul1++){
            al1 = tmp_index2[L0][Mul1][M0];
            if (Mul1!=Mul0){
              tmp0 = CntCoes[Mc_AN][al0][Mul0];
              tmp1 = CntCoes[Mc_AN][al1][Mul0];
              for (p=0; p<Spe_Specified_Num[wan][al0]; p++){
                CntCoes[Mc_AN][al1][p] -= CntCoes[Mc_AN][al0][p]/tmp0*tmp1;
              }
            }
          }
        }

        /**********************************************
           orthonormalization of initial contraction
           coefficients (classical Gram-Schmidt)
        ***********************************************/

        for (Mul0=0; Mul0<Spe_Num_CBasis[wan][L0]; Mul0++){
          al0 = tmp_index2[L0][Mul0][M0];

          /* x - sum_i <x|e_i>e_i */

          for (p=0; p<Spe_Specified_Num[wan][al0]; p++){
            Tmp_CntCoes[p] = 0.0;
          }

          for (Mul1=0; Mul1<Mul0; Mul1++){
            al1 = tmp_index2[L0][Mul1][M0];
            sum = 0.0;
            for (p=0; p<Spe_Specified_Num[wan][al0]; p++){
              sum = sum + CntCoes[Mc_AN][al0][p]*CntCoes[Mc_AN][al1][p];
            }
            for (p=0; p<Spe_Specified_Num[wan][al0]; p++){
              Tmp_CntCoes[p] = Tmp_CntCoes[p] + sum*CntCoes[Mc_AN][al1][p];
            }
          }

          for (p=0; p<Spe_Specified_Num[wan][al0]; p++){
            CntCoes[Mc_AN][al0][p] = CntCoes[Mc_AN][al0][p] - Tmp_CntCoes[p];
          }

          /* Normalize, and fix the sign by the largest component */

          sum = 0.0;
          Max0 = -100.0;
          pmax = 0;
          for (p=0; p<Spe_Specified_Num[wan][al0]; p++){
            sum = sum + CntCoes[Mc_AN][al0][p]*CntCoes[Mc_AN][al0][p];
            if (Max0<fabs(CntCoes[Mc_AN][al0][p])){
              Max0 = fabs(CntCoes[Mc_AN][al0][p]);
              pmax = p;
            }
          }

          /* guard against a numerically-zero vector */
          if (fabs(sum)<1.0e-11)
            tmp0 = 0.0;
          else
            tmp0 = 1.0/sqrt(sum);

          tmp1 = sgn(CntCoes[Mc_AN][al0][pmax]);

          for (p=0; p<Spe_Specified_Num[wan][al0]; p++){
            CntCoes[Mc_AN][al0][p] = tmp0*tmp1*CntCoes[Mc_AN][al0][p];
          }
        }
      }
    }
  } /* Mc_AN */

  /****************************************************
     Normalization with respect to the on-site
     overlap matrix OLP[0][Mc_AN][0]
  ****************************************************/

  for (Mc_AN=1; Mc_AN<=Matomnum; Mc_AN++){

    Gc_AN = M2G[Mc_AN];
    wan = WhatSpecies[Gc_AN];

    for (al=0; al<Spe_Total_CNO[wan]; al++){
      sum = 0.0;
      for (p=0; p<Spe_Specified_Num[wan][al]; p++){
        p0 = Spe_Trans_Orbital[wan][al][p];
        for (q=0; q<Spe_Specified_Num[wan][al]; q++){
          q0 = Spe_Trans_Orbital[wan][al][q];
          tmp0 = CntCoes[Mc_AN][al][p]*CntCoes[Mc_AN][al][q];
          sum = sum + tmp0*OLP[0][Mc_AN][0][p0][q0];
        }
      }
      tmp0 = 1.0/sqrt(sum);
      for (p=0; p<Spe_Specified_Num[wan][al]; p++){
        CntCoes[Mc_AN][al][p] = CntCoes[Mc_AN][al][p]*tmp0;
      }
    }

    if (3<=level_stdout){
      for (al=0; al<Spe_Total_CNO[wan]; al++){
        for (p=0; p<Spe_Specified_Num[wan][al]; p++){
          printf("B Init_CntCoes Mc_AN=%2d Gc_AN=%2d al=%2d p=%2d %15.12f\n",
                 Mc_AN,Gc_AN,al,p,CntCoes[Mc_AN][al][p]);
        }
      }
    }
  } /* Mc_AN */

 Simple_Init:

  /****************************************************
    MPI: CntCoes_Species

    For each species, broadcast the coefficients of the
    first atom of that species from its owning rank.
  ****************************************************/

  for (wan=0; wan<SpeciesNum; wan++){
    Gc_AN = 1;
    po = 0;
    do {
      wanA = WhatSpecies[Gc_AN];
      if (wan==wanA){
        ID = G2ID[Gc_AN];
        Mc_AN = F_G2M[Gc_AN];
        for (al=0; al<Spe_Total_CNO[wan]; al++){
          for (p=0; p<Spe_Specified_Num[wan][al]; p++){
            if (ID==myid) tmp0 = CntCoes[Mc_AN][al][p];
            MPI_Bcast(&tmp0, 1, MPI_DOUBLE, ID, mpi_comm_level1);
            CntCoes_Species[wan][al][p] = tmp0;
          }
        }
        po = 1;
      }
      Gc_AN++;
    } while (po==0 && Gc_AN<=atomnum);
  }

  /****************************************************
    MPI: CntCoes — distribute the finished coefficients
    to the ranks that need them (same ring pattern)
  ****************************************************/

  /***********************************
             set data size
  ************************************/

  for (ID=0; ID<numprocs; ID++){

    IDS = (myid + ID) % numprocs;
    IDR = (myid - ID + numprocs) % numprocs;

    if (ID!=0){
      tag = 999;

      /* find data size to send block data */
      if (F_Snd_Num[IDS]!=0){
        size1 = 0;
        for (n=0; n<F_Snd_Num[IDS]; n++){
          Mc_AN = Snd_MAN[IDS][n];
          Gc_AN = Snd_GAN[IDS][n];
          wan = WhatSpecies[Gc_AN];
          for (al=0; al<Spe_Total_CNO[wan]; al++){
            size1 += Spe_Specified_Num[wan][al];
          }
        }
        Snd_CntCoes_Size[IDS] = size1;
        MPI_Isend(&size1, 1, MPI_INT, IDS, tag, mpi_comm_level1, &request);
      }
      else{
        Snd_CntCoes_Size[IDS] = 0;
      }

      /* receiving of size of data */
      if (F_Rcv_Num[IDR]!=0){
        MPI_Recv(&size2, 1, MPI_INT, IDR, tag, mpi_comm_level1, &stat);
        Rcv_CntCoes_Size[IDR] = size2;
      }
      else{
        Rcv_CntCoes_Size[IDR] = 0;
      }

      if (F_Snd_Num[IDS]!=0) MPI_Wait(&request,&stat);
    }
    else {
      /* ID==0 implies IDS==IDR==myid */
      Snd_CntCoes_Size[myid] = 0;
      Rcv_CntCoes_Size[myid] = 0;
    }
  }

  /***********************************
             data transfer
  ************************************/

  tag = 999;
  for (ID=0; ID<numprocs; ID++){

    IDS = (myid + ID) % numprocs;
    IDR = (myid - ID + numprocs) % numprocs;

    if (ID!=0){

      /*****************************
              sending of data
      *****************************/

      if (F_Snd_Num[IDS]!=0){

        size1 = Snd_CntCoes_Size[IDS];

        /* allocation of array */
        tmp_array = (double*)malloc(sizeof(double)*size1);

        /* multidimensional array to vector array */
        num = 0;
        for (n=0; n<F_Snd_Num[IDS]; n++){
          Mc_AN = Snd_MAN[IDS][n];
          Gc_AN = Snd_GAN[IDS][n];
          wan = WhatSpecies[Gc_AN];
          for (al=0; al<Spe_Total_CNO[wan]; al++){
            for (p=0; p<Spe_Specified_Num[wan][al]; p++){
              tmp_array[num] = CntCoes[Mc_AN][al][p];
              num++;
            }
          }
        }

        MPI_Isend(&tmp_array[0], size1, MPI_DOUBLE, IDS, tag, mpi_comm_level1, &request);
      }

      /*****************************
         receiving of block data
      *****************************/

      if (F_Rcv_Num[IDR]!=0){

        size2 = Rcv_CntCoes_Size[IDR];

        /* allocation of array */
        tmp_array2 = (double*)malloc(sizeof(double)*size2);

        MPI_Recv(&tmp_array2[0], size2, MPI_DOUBLE, IDR, tag, mpi_comm_level1, &stat);

        num = 0;
        Mc_AN = F_TopMAN[IDR] - 1;   /* F_TopMAN here (vs. S_TopMAN above) */
        for (n=0; n<F_Rcv_Num[IDR]; n++){
          Mc_AN++;
          Gc_AN = Rcv_GAN[IDR][n];
          wan = WhatSpecies[Gc_AN];
          for (al=0; al<Spe_Total_CNO[wan]; al++){
            for (p=0; p<Spe_Specified_Num[wan][al]; p++){
              CntCoes[Mc_AN][al][p] = tmp_array2[num];
              num++;
            }
          }
        }

        /* freeing of array */
        free(tmp_array2);
      }

      if (F_Snd_Num[IDS]!=0){
        MPI_Wait(&request,&stat);
        free(tmp_array); /* freeing of array */
      }
    }
  }

  /****************************************************
    freeing of arrays:

    int MP[List_YOUSO[8]];
    int tmp_index[List_YOUSO[25]+1]
                 [2*(List_YOUSO[25]+1)+1];
    int tmp_index1[List_YOUSO[25]+1]
                  [List_YOUSO[24]]
                  [2*(List_YOUSO[25]+1)+1];
    int tmp_index2[List_YOUSO[25]+1]
                  [List_YOUSO[24]]
                  [2*(List_YOUSO[25]+1)+1];
    double Tmp_CntCoes[List_YOUSO[24]]
    double Check_ko[List_YOUSO[25]+1]
                   [2*(List_YOUSO[25]+1)+1];
    double Weight_ko[List_YOUSO[7]];
  ****************************************************/

  free(MP);

  for (i=0; i<(List_YOUSO[25]+1); i++){
    free(tmp_index[i]);
  }
  free(tmp_index);

  for (i=0; i<(List_YOUSO[25]+1); i++){
    for (j=0; j<List_YOUSO[24]; j++){
      free(tmp_index1[i][j]);
    }
    free(tmp_index1[i]);
  }
  free(tmp_index1);

  for (i=0; i<(List_YOUSO[25]+1); i++){
    for (j=0; j<List_YOUSO[24]; j++){
      free(tmp_index2[i][j]);
    }
    free(tmp_index2[i]);
  }
  free(tmp_index2);

  free(Tmp_CntCoes);

  for (i=0; i<(List_YOUSO[25]+1); i++){
    free(Check_ko[i]);
  }
  free(Check_ko);

  free(Weight_ko);

  free(Snd_CntCoes_Size);
  free(Rcv_CntCoes_Size);
  free(Snd_H_Size);
  free(Rcv_H_Size);
  free(Snd_S_Size);
  free(Rcv_S_Size);

  /* for elapsed time */
  dtime(&TEtime);
  time0 = TEtime - TStime;
}
void interface_flux(double *qq, double *fstar, double *ib, double speed) { int ii, ee; double qb[2 * ne]; for (ii = 0; ii < 2 * ne; ii++) { fstar[ii] = 0; } // fstar[0:ne-1] stores numerical flux of the left edge on each elements // fstar[ne:2*ne-1] stores numerical flux of the right edge on each elements for (ii = 0; ii < ne; ii++) { qb[ii] = dot_product(ib, qq + ii * np, np); // left edge interpolated value of qq at element ii qb[ne + ii] = dot_product(ib + np, qq + ii * np, np); // right edge interpolated value of qq at element ii } double qb_temp; if (rank == 0) { MPI_Isend(qb + 2 * ne - 1, 1, MPI_DOUBLE, rank + 1, 101, MPI_COMM_WORLD, &ser1); MPI_Irecv(&qb_temp, 1, MPI_DOUBLE, nprocs - 1, 101, MPI_COMM_WORLD, &rer1); } else if (rank == nprocs - 1) { MPI_Isend(qb + 2 * ne - 1, 1, MPI_DOUBLE, 0, 101, MPI_COMM_WORLD, &ser1); MPI_Irecv(&qb_temp, 1, MPI_DOUBLE, rank - 1, 101, MPI_COMM_WORLD, &rer1); } else { MPI_Isend(qb + 2 * ne - 1, 1, MPI_DOUBLE, rank + 1, 101, MPI_COMM_WORLD, &ser1); MPI_Irecv(&qb_temp, 1, MPI_DOUBLE, rank - 1, 101, MPI_COMM_WORLD, &rer1); } MPI_Wait(&ser1, &st); MPI_Wait(&rer1, &st); double f_temp; ii = 0; // calculating numerical flux (fstar) with periodic boundary condition fstar[ii] = -((qb_temp + qb[ii]) / 2 * speed + fabs(speed) * (qb_temp - qb[ii]) / 2); if (rank == 0){ MPI_Isend(fstar, 1, MPI_DOUBLE, nprocs - 1, 202, MPI_COMM_WORLD, &ser2); MPI_Irecv(fstar + 2 * ne - 1, 1, MPI_DOUBLE, rank + 1, 202, MPI_COMM_WORLD, &rer2); } else if (rank == nprocs - 1) { MPI_Isend(fstar, 1, MPI_DOUBLE, rank - 1, 202, MPI_COMM_WORLD, &ser2); MPI_Irecv(fstar + 2 * ne - 1, 1, MPI_DOUBLE, 0, 202, MPI_COMM_WORLD, &rer2); } else { MPI_Isend(fstar, 1, MPI_DOUBLE, rank - 1, 202, MPI_COMM_WORLD, &ser2); MPI_Irecv(fstar + 2 * ne - 1, 1, MPI_DOUBLE, rank + 1, 202, MPI_COMM_WORLD, &rer2); } MPI_Wait(&ser2, &st); MPI_Wait(&rer2, &st); for (ii = 1; ii < ne; ii++) { fstar[ii] = -((qb[ne + ii - 1] + qb[ii]) / 2 * speed + fabs(speed) * (qb[ne + ii - 1] - qb[ii]) / 2); 
fstar[ne + ii - 1] = -fstar[ii]; } fstar[ne + ne - 1] *= -1; }
/* MPI Gaussian elimination driver.
 *
 * Distributes the columns of an (n x colnum) augmented matrix over the ranks
 * of a 1-D (ring) Cartesian communicator and performs forward elimination
 * followed by either back-substitution-style elimination or full inverse
 * computation (__QuestionExtra__). Communication strategy is selected at
 * compile time: MPI_Bcast (USE_BROADCAST), MPI_Ibcast (USE_BROADCAST_ASYNC),
 * or a point-to-point relay around the ring (default).
 *
 * Relies on helpers defined elsewhere in this file: createMatrix,
 * printMatrix, threadForCollumn, globColToPartCol, partColToGlobCol,
 * and the constants kappa and COL_TAG.
 */
int main(int argc, char **argv) {
    MPI_Comm cartComm; //Cartesian communicator
    int tid; //Thread id (MPI rank within cartComm)
    int nthreads; //Number of threads (MPI ranks)
    double time_initial; //Start time
    double time_end; //End time
    int n; //N is the size of the matrix
    //Wrap around (1 => the Cartesian topology is a ring)
    int wrapAround=1;
#if defined(USE_BROADCAST_ASYNC)
    //If asynchronus broadcast is enabled, keep a request for testing if the data can be safely modified after being sent
    MPI_Request bcastRequest;
#endif
    //Initialize the MPI environment
    if(MPI_Init(NULL,NULL)!=MPI_SUCCESS)
    {
        //cerr<<"ERROR"<<endl;
    }
    //Get number of threads
    if(MPI_Comm_size(MPI_COMM_WORLD, &nthreads)!=MPI_SUCCESS)
    {
        //cerr<<"ERROR"<<endl;
    }
    //Create one dimensional cartesian grouping that is NOT ring-shaped (does not wrap around)
    //??? Make it wrap around and add a note to the message that tells it to stop when it has reached the start?
    //NOTE(review): wrapAround is 1 here, so the topology actually DOES wrap; the comment above looks stale.
    if(MPI_Cart_create(MPI_COMM_WORLD,1,&nthreads,&wrapAround,1,&cartComm)!=MPI_SUCCESS)
    {
        //cerr<<"ERROR"<<endl;
    }
    //Get number of threads
    if(MPI_Comm_size(cartComm, &nthreads)!=MPI_SUCCESS)
    {
        //cerr<<"ERROR"<<endl;
    }
    //Get thread id
    if(MPI_Comm_rank(cartComm, &tid)!=MPI_SUCCESS)
    {
        //cerr<<"ERROR"<<endl;
    }
    //Ring neighbors: destinationP = previous rank, destinationN = next rank
    int destinationN;
    int destinationP;
    MPI_Cart_shift(cartComm,0,1,&destinationP,&destinationN);
    //Set the size of the matrix
    n=kappa*nthreads;
    //Initialize rand
    srand(time(NULL)+tid);
    //Create the matrix and split it amongs the threads
    double ** matrPart; //Holds this thread's part of the matrix (column-major: matrPart[partCol][row])
    int partSize;
    createMatrix(cartComm,tid,nthreads,n,&matrPart,&partSize);
#ifdef __DEBUG__MODE_EX1__
    if(tid==0)
    {
        ////cout<<"------------------------------------------"<<endl;
    }
    printMatrix(cartComm,tid,nthreads,n,&matrPart,partSize);
    if(tid==0)
    {
        //cout<<"------------------------------------------"<<endl;
    }
#endif
    //Create a cache for optimization
    //This ensures that there is no difference due to how expensive the functions used to find the right collumn or processor are
#ifdef __QuestionExtra__
    int colnum = n*2; //matrix augmented with the identity (inverse computation)
#else
    int colnum = n+1; //matrix augmented with the right-hand side column
#endif
    int* thrForCol=malloc(sizeof(int)*colnum); //Tells us which thread each collumn belongs to
    for (int i=0;i<colnum;++i)
    {
        thrForCol[i]=threadForCollumn(nthreads,n,i);
    }
    bool* colValidForThr=malloc(sizeof(bool)*colnum); //Tells us if the coolumn selected is valid for the current thread
    for (int i=0;i<colnum;++i)
    {
        colValidForThr[i]=(thrForCol[i]==tid);
    }
    int* glColToPartCol=malloc(sizeof(int)*colnum);//Holds the part collumn for the global collumn given (-1 if invalid)
    for (int i=0;i<colnum;++i)
    {
        if(colValidForThr[i])
        {
            glColToPartCol[i]=globColToPartCol(tid,nthreads,n,i);
        }
        else
        {
            glColToPartCol[i]=-1;
        }
    }
    int* ptColToGlobCol=malloc(sizeof(int)*partSize);//Holds the global collumn for the part column given
    for (int i=0;i<partSize;++i)
    {
        ptColToGlobCol[i]=partColToGlobCol(tid,nthreads,n,i);
    }
    //If this is computing the inverse matrix
#ifdef __QuestionExtra__
    bool* inInverseMatrix=malloc(sizeof(bool)*partSize); //True if in the inverse matrix
    for (int i=0;i<partSize;++i)
    {
        inInverseMatrix[i]=(ptColToGlobCol[i]>=n);
    }
#endif
    //Set the active diagonal to 0
    int k=0;
    int kapOwner; //rank that owns the pivot column k
    if(tid==0)
    {
        //Get the start time
        time_initial = MPI_Wtime();
    }
    //Start solving: forward elimination over pivots k = 0..n-1
    while(k<n)
    {
        kapOwner=thrForCol[k];
        //If this is the owner of kappa
        if(tid==kapOwner)
        {
            //Get the collumn you need
            int curCol=glColToPartCol[k];
            //For row k, divide it so that it becomes 1 and send what you divided it with to the other rows
            //First send what we need to do to it to the other threads (which is [k,k])
            //(Data sent is number to divide with (the other threads should have the correct k and sender))
#ifndef __SingleProc__
#ifdef USE_BROADCAST
            MPI_Bcast(&(matrPart[curCol][k]),1,MPI_DOUBLE,kapOwner,cartComm);
#elif defined(USE_BROADCAST_ASYNC)
            MPI_Ibcast(&(matrPart[curCol][k]),1,MPI_DOUBLE,kapOwner,cartComm, &bcastRequest);
#else //if not defined USE_BROADCAST
            MPI_Send(&(matrPart[curCol][k]),1,MPI_DOUBLE,destinationN,COL_TAG,cartComm);
#endif
#endif
            //Then divide with that number
            for(int jj=curCol+1;jj<partSize;++jj)
            {
                matrPart[jj][k]=matrPart[jj][k]/matrPart[curCol][k];
            }
#if !defined(__SingleProc__) && defined(USE_BROADCAST_ASYNC)
            //Wait for the buffer to be read if sending asynchronously, to avoid race conditions
            MPI_Wait(&bcastRequest, MPI_STATUS_IGNORE);
#endif
            matrPart[curCol][k]=1; //No need to do a real division for the first element
            //Then for all rows, subtract and send what we are multiplying to subtract to the other threads
            for(int i=k+1;i<n;++i)
            {
                //First send
#ifndef __SingleProc__
#ifdef USE_BROADCAST
                MPI_Bcast(&(matrPart[curCol][i]),1,MPI_DOUBLE,kapOwner,cartComm);
#elif defined(USE_BROADCAST_ASYNC)
                MPI_Ibcast(&(matrPart[curCol][i]),1,MPI_DOUBLE,kapOwner,cartComm, &bcastRequest);
#else //if not defined USE_BROADCAST
                MPI_Send(&(matrPart[curCol][i]),1,MPI_DOUBLE,destinationN,COL_TAG,cartComm);
#endif
#endif
                //For all partcollumns, check to see if we can subtract anything
                //(their global col must be greater than k and current collumn)
                for(int jj=curCol+1;jj<partSize;++jj)
                {
                    matrPart[jj][i]=matrPart[jj][i]-matrPart[jj][k]*matrPart[curCol][i];
                }
#if !defined(__SingleProc__) && defined(USE_BROADCAST_ASYNC)
                //Wait for the buffer to be read if sending asynchronously, to avoid race conditions
                MPI_Wait(&bcastRequest, MPI_STATUS_IGNORE);
#endif
                //Then subtract
                matrPart[curCol][i]=0; //NO need to do real subtraction for the first element
            }
        }
        //Else, if this is not the owner of kappa
        else
        {
            //Used for optimisation
            bool isValid=false;
            //NOTE(review): VLA sized by partSize — assumes partSize is modest; confirm stack budget
            bool isValidArr[partSize];
            for(int j=0;j<partSize;++j)
            {
                if(ptColToGlobCol[j]>k)
                {
                    isValid=true;
                    isValidArr[j]=true;
                }
                else
                {
                    isValidArr[j]=false;
                }
            }
            //First receive the number you need to divide k row with and send it to the next one
            //(unless next one is sender)
            double recD;
#ifdef USE_BROADCAST
            MPI_Bcast(&recD,1,MPI_DOUBLE,kapOwner,cartComm);
#elif defined(USE_BROADCAST_ASYNC)
            MPI_Ibcast(&recD,1,MPI_DOUBLE,kapOwner,cartComm, &bcastRequest);
            MPI_Wait(&bcastRequest, MPI_STATUS_IGNORE);
#else //if not defined USE_BROADCAST
            MPI_Recv(&recD,1,MPI_DOUBLE,destinationP,MPI_ANY_TAG,cartComm,MPI_STATUS_IGNORE);
            if(destinationN!=kapOwner)
            {
                MPI_Send(&recD,1,MPI_DOUBLE,destinationN,COL_TAG,cartComm);
            }
#endif
            //Then divide k row if necessary
            if(isValid)
            {
                for(int j=0;j<partSize;++j)
                {
                    if(isValidArr[j])
                    {
                        matrPart[j][k]=matrPart[j][k]/recD;
                    }
                }
            }
            //Then for all rows below k row, receive what we need to multiply the subtraction with
            //and do that if necessary
            for(int i=k+1;i<n;++i)
            {
#ifdef USE_BROADCAST
                MPI_Bcast(&recD,1,MPI_DOUBLE,kapOwner,cartComm);
#elif defined(USE_BROADCAST_ASYNC)
                MPI_Ibcast(&recD,1,MPI_DOUBLE,kapOwner,cartComm, &bcastRequest);
                MPI_Wait(&bcastRequest, MPI_STATUS_IGNORE);
#else //if not defined USE_BROADCAST
                MPI_Recv(&recD,1,MPI_DOUBLE,destinationP,MPI_ANY_TAG,cartComm,MPI_STATUS_IGNORE);
                if(destinationN!=kapOwner)
                {
                    MPI_Send(&recD,1,MPI_DOUBLE,destinationN,COL_TAG,cartComm);
                }
#endif
                if(isValid)
                {
                    for(int j=0;j<partSize;++j)
                    {
                        if(isValidArr[j])
                        {
                            matrPart[j][i]=matrPart[j][i]-recD*matrPart[j][k];
                        }
                    }
                }
            }
        }
        //Finally, increment k
        ++k;
#ifdef __DEBUG__MODE_EX1__
        printMatrix(cartComm,tid,nthreads,n,&matrPart,partSize);
        if(tid==0)
        {
            //cout<<"------------------------------------------"<<endl;
        }
#endif
    }
    //Backward phase: eliminate above the diagonal, from the last pivot upwards
    k=n-1;
#ifdef __QuestionExtra__
    //IF THIS IS COMPUTING THE INVERSE MATRIX
    while(k>0)
    {
        kapOwner=thrForCol[k];
        //If this is the owner of kappa
        if(tid==kapOwner)
        {
            //Get the collumn you need
            int curCol=glColToPartCol[k];
            for(int i=k-1;i>=0;--i)
            {
#ifndef __SingleProc__
#ifdef USE_BROADCAST
                MPI_Bcast(&(matrPart[curCol][i]),1,MPI_DOUBLE,kapOwner,cartComm);
#elif defined(USE_BROADCAST_ASYNC)
                MPI_Ibcast(&(matrPart[curCol][i]),1,MPI_DOUBLE,kapOwner,cartComm, &bcastRequest);
#else //if not defined USE_BROADCAST
                MPI_Send(&(matrPart[curCol][i]),1,MPI_DOUBLE,destinationN,COL_TAG,cartComm);
#endif
#endif
                for(int j=curCol+1;j<partSize;++j)
                {
                    //If this is in the inverse matrix
                    if(inInverseMatrix[j])
                    {
                        matrPart[j][i]=matrPart[j][i]-matrPart[j][k]*matrPart[curCol][i];
                    }
                }
#if !defined(__SingleProc__) && defined(USE_BROADCAST_ASYNC)
                //Wait for the buffer to be read if sending asynchronously, to avoid race conditions
                MPI_Wait(&bcastRequest, MPI_STATUS_IGNORE);
#endif
                matrPart[curCol][i]=0; //No need to do real subtraction.
            }
        }
        //Else, if this is not the owner of kappa
        else
        {
            //for all rows above k row, receive what we need to multiply the subtraction with
            //and do that if necessary
            double recD;
            for(int i=k-1;i>=0;--i)
            {
#ifdef USE_BROADCAST
                MPI_Bcast(&recD,1,MPI_DOUBLE,kapOwner,cartComm);
#elif defined(USE_BROADCAST_ASYNC)
                MPI_Ibcast(&recD,1,MPI_DOUBLE,kapOwner,cartComm, &bcastRequest);
                MPI_Wait(&bcastRequest, MPI_STATUS_IGNORE);
#else //if not defined USE_BROADCAST
                MPI_Recv(&recD,1,MPI_DOUBLE,destinationP,MPI_ANY_TAG,cartComm,MPI_STATUS_IGNORE);
                if(destinationN!=kapOwner)
                {
                    //Pass it along to the next thread
                    MPI_Send(&recD,1,MPI_DOUBLE,destinationN,COL_TAG,cartComm);
                }
#endif
                //For all collumns
                for(int j=0;j<partSize;++j)
                {
                    //If this is in the inverse matrix
                    if(inInverseMatrix[j])
                    {
                        matrPart[j][i]=matrPart[j][i]-recD*matrPart[j][k];
                    }
                }
            }
        }
        //Finally, decrement kappa
        --k;
#ifdef __DEBUG__MODE_EX1__
        printMatrix(cartComm,tid,nthreads,n,&matrPart,partSize);
        if(tid==0)
        {
            //cout<<"------------------------------------------"<<endl;
        }
#endif
    }
#else
    //If this is not computing the inverse matrix but doing elimination
    while(k>0)
    {
        //Used for optimisation
        int endCol;
        //Global column n is the right-hand-side column (colnum == n+1 here)
        bool isValid=colValidForThr[n];
        if(isValid)
        {
            endCol=glColToPartCol[n];
        }
        kapOwner=thrForCol[k];
        //If this is the owner of kappa
        if(tid==kapOwner)
        {
            //Get the collumn you need
            int curCol=glColToPartCol[k];
            for(int i=k-1;i>=0;--i)
            {
#ifndef __SingleProc__
#ifdef USE_BROADCAST
                MPI_Bcast(&(matrPart[curCol][i]),1,MPI_DOUBLE,kapOwner,cartComm);
#elif defined(USE_BROADCAST_ASYNC)
                MPI_Ibcast(&(matrPart[curCol][i]),1,MPI_DOUBLE,kapOwner,cartComm, &bcastRequest);
#else //if not defined USE_BROADCAST
                MPI_Send(&(matrPart[curCol][i]),1,MPI_DOUBLE,destinationN,COL_TAG,cartComm);
#endif
#endif
                if(isValid)
                {
                    matrPart[endCol][i]=matrPart[endCol][i]-matrPart[endCol][k]*matrPart[curCol][i];
                }
#if !defined(__SingleProc__) && defined(USE_BROADCAST_ASYNC)
                //Wait for the buffer to be read if sending asynchronously, to avoid race conditions
                MPI_Wait(&bcastRequest, MPI_STATUS_IGNORE);
#endif
                matrPart[curCol][i]=0; //No need to do real subtraction.
            }
        }
        //Else, if this is not the owner of kappa
        else
        {
            //for all rows above k row, receive what we need to multiply the subtraction with
            //and do that if necessary
            double recD;
            for(int i=k-1;i>=0;--i)
            {
#ifdef USE_BROADCAST
                MPI_Bcast(&recD,1,MPI_DOUBLE,kapOwner,cartComm);
#elif defined(USE_BROADCAST_ASYNC)
                MPI_Ibcast(&recD,1,MPI_DOUBLE,kapOwner,cartComm, &bcastRequest);
                MPI_Wait(&bcastRequest, MPI_STATUS_IGNORE);
#else //if not defined USE_BROADCAST
                MPI_Recv(&recD,1,MPI_DOUBLE,destinationP,MPI_ANY_TAG,cartComm,MPI_STATUS_IGNORE);
                if(destinationN!=kapOwner)
                {
                    MPI_Send(&recD,1,MPI_DOUBLE,destinationN,COL_TAG,cartComm);
                }
#endif
                if(isValid)
                {
                    matrPart[endCol][i]=matrPart[endCol][i]-recD*matrPart[endCol][k];
                }
            }
        }
        //Finally, decrement kappa
        --k;
#ifdef __DEBUG__MODE_EX1__
        printMatrix(cartComm,tid,nthreads,n,&matrPart,partSize);
        if(tid==0)
        {
            //cout<<"------------------------------------------"<<endl;
        }
#endif
    }
#endif
    if(tid==0)
    {
        //Get the end time
        time_end = MPI_Wtime();
    }
#ifdef __DEBUG__MODE_EX1__
    //Print the solution
    printMatrix(cartComm,tid,nthreads,n,&matrPart,partSize);
#endif
    if(tid==0)
    {
#ifdef __DEBUG__MODE_EX1__
        //Write some info
        //cout<<"Solved in "<<(time_end-time_initial)<<" seconds in "<<nthreads<<" threads using configuration ";
#ifdef __Question1__
        //cout<<"1:\"serial\""<<endl;
#endif
#ifdef __Question2__
        //cout<<"2:\"shuffle\""<<endl;
#endif
#else
        /*if(isnan(matrPart[0][0])) { //cout<<"INVALID MATRIX: NAN"<<endl; } else {*/
        printf("%.20f",(time_end-time_initial));
        ////cout<<fixed<<setprecision(20)<<(time_end-time_initial)<<endl;
        //}
#endif
    }
    //Delete data
    for(int j=0;j<partSize;++j)
    {
        free(matrPart[j]);
    }
    free(matrPart);
    //Delete cache
    free(thrForCol);
    free(colValidForThr);
    free(glColToPartCol);
    free(ptColToGlobCol);
#ifdef __QuestionExtra__
    free(inInverseMatrix);
#endif
    //Finalize the MPI environment
    if(MPI_Finalize()!=MPI_SUCCESS)
    {
        ////cerr<<tid<<" ERROR"<<endl;
    }
    //Exit
    return EXIT_SUCCESS;
}
/* Provided global variables are MAXN, N, procs, A[][], B[], and X[], * defined in the beginning of this code. X[] is initialized to zeros. */ void gauss() { MPI_Status status; MPI_Request request; int norm, row, col, i; /* Normalization row, and zeroing element row and col */ float multiplier; /*Time Variables*/ double startwtime = 0.0, endwtime; MPI_Barrier(MPI_COMM_WORLD); if (myid == 0) { printf("\nComputing Parallely Using MPI.\n"); startwtime = MPI_Wtime(); } /* Gaussian elimination */ for (norm = 0; norm < N - 1; norm++) { /* Broadcast A[norm] row and B[norm]*/ MPI_Bcast(&A[norm][0], N, MPI_FLOAT, 0, MPI_COMM_WORLD); MPI_Bcast(&B[norm], 1, MPI_FLOAT, 0, MPI_COMM_WORLD); /*Send data from process 0 to other processes*/ if (myid == 0) { for (i = 1; i < procs; i++) { /*Send data to corresponding process using static interleaved scheduling*/ for (row = norm + 1 + i; row < N; row += procs) { MPI_Isend(&A[row], N, MPI_FLOAT, i, 0, MPI_COMM_WORLD, &request); MPI_Wait(&request, &status); MPI_Isend(&B[row], 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, &request); MPI_Wait(&request, &status); } } /*Gaussian elimination*/ for (row = norm + 1; row < N; row += procs) { multiplier = A[row][norm] / A[norm][norm]; for (col = norm; col < N; col++) { A[row][col] -= A[norm][col] * multiplier; } B[row] -= B[norm] * multiplier; } /*Receive the updated data from other processes*/ for (i = 1; i < procs; i++) { for (row = norm + 1 + i; row < N; row += procs) { MPI_Recv(&A[row], N, MPI_FLOAT, i, 1, MPI_COMM_WORLD, &status); MPI_Recv(&B[row], 1, MPI_FLOAT, i, 1, MPI_COMM_WORLD, &status); } } if (norm == N - 2) { endwtime = MPI_Wtime(); printf("elapsed time = %f\n", endwtime - startwtime); } } /*Receive data from process 0*/ else { for (row = norm + 1 + myid; row < N; row += procs) { MPI_Recv(&A[row], N, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &status); MPI_Recv(&B[row], 1, MPI_FLOAT, 0, 0, MPI_COMM_WORLD, &status); /*Gaussian elimination*/ multiplier = A[row][norm] / A[norm][norm]; for (col = norm; 
col < N; col++) { A[row][col] -= A[norm][col] * multiplier; } B[row] -= B[norm] * multiplier; /*Send back the results*/ MPI_Isend(&A[row], N, MPI_FLOAT, 0, 1, MPI_COMM_WORLD, &request); MPI_Wait(&request, &status); MPI_Isend(&B[row], 1, MPI_FLOAT, 0, 1, MPI_COMM_WORLD, &request); MPI_Wait(&request, &status); } } /*Barrier syncs all processes*/ MPI_Barrier(MPI_COMM_WORLD); } }
/** * \brief Espera pela conclusão do init_update. * * O método invoca a primitiva MPI_Wait a fim de bloquear o processo e garantir * a conclusão do MPI_Irecv, disparado pelo init_update. */ void Halo_Left::sync(void) { // Ignora Halos não sobrepostos. if (this->rank_neighbor == MPI_PROC_NULL) { return; } int pos = 0; if (this->transfer()) { // Simula latência de rede //usleep(500); ////////////////////////// MPI_Wait(&(this->request), MPI_STATUS_IGNORE); for (int i = this->start_row; i <= this->end_row; i++) { MPI_Unpack(this->buff[0], // inbuf this->num_halos * this->halo_size * sizeof (double), // insize &pos, // position this->data_source[i] + HALO_THICKNESS, // outbuff this->halo_size, // outcount (number of items to be unpacked) MPI_DOUBLE, // datatype this->comm); // comm } if (this->num_halos > 1) { memcpy(this->data_dest[0], this->data_source[0], this->num_rows * this->num_cols * sizeof (double)); this->num_iterators_ctrl = 0; if (this->corner_top.buff != NULL) { // Copia corner top for (int i = this->corner_top.start_row, k = 0; i <= this->corner_top.end_row; i++, k++) { for (int j = this->corner_top.start_col, l = this->start_row; j <= this->corner_top.end_col; j++, l++) { this->data_source[l][k] = this->data_dest[l][k] = corner_top.buff[i][j]; } } } // Copia corner bottom if (this->corner_bottom.buff != NULL) { for (int i = this->corner_bottom.start_row, k = this->end_col + 1; i <= this->corner_bottom.end_row; i++, k++) { for (int j = this->corner_bottom.start_col, l = this->start_row; j <= this->corner_bottom.end_col; j++, l++) { this->data_source[l][k] = this->data_dest[l][k] = corner_bottom.buff[i][j]; } } } } } else { pthread_join(this->thread_id, NULL); this->num_iterators_ctrl++; } return; }
static int scr_swap_files_move( int have_outgoing, const char* file_send, scr_meta* meta_send, int rank_send, uLong* crc32_send, int have_incoming, const char* file_recv, scr_meta* meta_recv, int rank_recv, uLong* crc32_recv, MPI_Comm comm) { int rc = SCR_SUCCESS; MPI_Request request[2]; MPI_Status status[2]; /* allocate MPI send buffer */ char *buf_send = NULL; if (have_outgoing) { buf_send = (char*) scr_align_malloc(scr_mpi_buf_size, scr_page_size); if (buf_send == NULL) { scr_abort(-1, "Allocating memory: malloc(%ld) errno=%d %s @ %s:%d", scr_mpi_buf_size, errno, strerror(errno), __FILE__, __LINE__ ); return SCR_FAILURE; } } /* allocate MPI recv buffer */ char *buf_recv = NULL; if (have_incoming) { buf_recv = (char*) scr_align_malloc(scr_mpi_buf_size, scr_page_size); if (buf_recv == NULL) { scr_abort(-1, "Allocating memory: malloc(%ld) errno=%d %s @ %s:%d", scr_mpi_buf_size, errno, strerror(errno), __FILE__, __LINE__ ); return SCR_FAILURE; } } /* since we'll overwrite our send file in place with the recv file, * which may be larger, we need to keep track of how many bytes we've * sent and whether we've sent them all */ unsigned long filesize_send = 0; /* open our file */ int fd = -1; if (have_outgoing) { /* we'll overwrite our send file (or just read it if there is no incoming) */ filesize_send = scr_file_size(file_send); fd = scr_open(file_send, O_RDWR); if (fd < 0) { /* TODO: skip writes and return error? */ scr_abort(-1, "Opening file for send/recv: scr_open(%s, O_RDWR) errno=%d %s @ %s:%d", file_send, errno, strerror(errno), __FILE__, __LINE__ ); } } else if (have_incoming) { /* if we're in this branch, then we only have an incoming file, * so we'll write our recv file from scratch */ mode_t mode_file = scr_getmode(1, 1, 0); fd = scr_open(file_recv, O_WRONLY | O_CREAT | O_TRUNC, mode_file); if (fd < 0) { /* TODO: skip writes and return error? */ scr_abort(-1, "Opening file for recv: scr_open(%s, O_WRONLY | O_CREAT | O_TRUNC, ...) 
errno=%d %s @ %s:%d", file_recv, errno, strerror(errno), __FILE__, __LINE__ ); } } /* exchange file chunks */ int sending = 0; if (have_outgoing) { sending = 1; } int receiving = 0; if (have_incoming) { receiving = 1; } int nread, nwrite; off_t read_pos = 0, write_pos = 0; while (sending || receiving) { if (receiving) { /* prepare a buffer to receive up to scr_mpi_buf_size bytes */ MPI_Irecv(buf_recv, scr_mpi_buf_size, MPI_BYTE, rank_recv, 0, comm, &request[0]); } if (sending) { /* compute number of bytes to read */ unsigned long count = filesize_send - read_pos; if (count > scr_mpi_buf_size) { count = scr_mpi_buf_size; } /* read a chunk of up to scr_mpi_buf_size bytes into buf_send */ lseek(fd, read_pos, SEEK_SET); /* seek to read position */ nread = scr_read(file_send, fd, buf_send, count); if (scr_crc_on_copy && nread > 0) { *crc32_send = crc32(*crc32_send, (const Bytef*) buf_send, (uInt) nread); } if (nread < 0) { nread = 0; } read_pos += (off_t) nread; /* update read pointer */ /* send chunk (if nread is smaller than scr_mpi_buf_size, * then we've read the whole file) */ MPI_Isend(buf_send, nread, MPI_BYTE, rank_send, 0, comm, &request[1]); MPI_Wait(&request[1], &status[1]); /* check whether we've read the whole file */ if (filesize_send == read_pos && count < scr_mpi_buf_size) { sending = 0; } } if (receiving) { /* count the number of bytes received */ MPI_Wait(&request[0], &status[0]); MPI_Get_count(&status[0], MPI_BYTE, &nwrite); if (scr_crc_on_copy && nwrite > 0) { *crc32_recv = crc32(*crc32_recv, (const Bytef*) buf_recv, (uInt) nwrite); } /* write those bytes to file (if nwrite is smaller than scr_mpi_buf_size, * then we've received the whole file) */ lseek(fd, write_pos, SEEK_SET); /* seek to write position */ scr_write(file_recv, fd, buf_recv, nwrite); write_pos += (off_t) nwrite; /* update write pointer */ /* if nwrite is smaller than scr_mpi_buf_size, * then assume we've received the whole file */ if (nwrite < scr_mpi_buf_size) { receiving = 0; } } } 
/* close file and cleanup */ if (have_outgoing && have_incoming) { /* sent and received a file; close it, truncate it to corect size, rename it */ scr_close(file_send, fd); truncate(file_send, write_pos); rename(file_send, file_recv); } else if (have_outgoing) { /* only sent a file; close it, delete it, and remove its completion marker */ scr_close(file_send, fd); scr_file_unlink(file_send); } else if (have_incoming) { /* only received a file; just need to close it */ scr_close(file_recv, fd); } if (scr_crc_on_copy && have_outgoing) { uLong meta_send_crc; if (scr_meta_get_crc32(meta_send, &meta_send_crc) != SCR_SUCCESS) { /* we transfer this meta data across below, * so may as well update these fields so we can use them */ scr_meta_set_crc32(meta_send, *crc32_send); /* do not complete file send, we just deleted it above */ } else { /* TODO: we could check that the crc on the sent file matches and take some action if not */ } } /* free the MPI buffers */ scr_align_free(&buf_recv); scr_align_free(&buf_send); return rc; }