/* This is the only function that is common to the client and server parts */
void armci_mpi_strided2(int op, void *ptr, int stride_levels, int stride_arr[],
                        int count[], int proc, MPI_Comm comm)
{
    int i, stride = 1;
    MPI_Status status;
    MPI_Datatype type[MAX_STRIDE_LEVEL];

    if (stride_levels == 0)
    {
        armci_mpi_strided(op, ptr, stride_levels, stride_arr, count, proc, comm);
        return;
    }

    /* convert the strided data description into an MPI derived datatype */
    type[0] = MPI_BYTE;
    for (i = 1; i <= stride_levels; i++)
    {
        stride *= stride_arr[i-1];
        MPI_Check( MPI_Type_hvector(count[i], count[i-1], stride, type[i-1],
                                    &type[i]) );
    }
    MPI_Check( MPI_Type_commit(&type[stride_levels]) );

    if (op == SEND)
    {
        MPI_Check( MPI_Send(ptr, 1, type[stride_levels], proc,
                            ARMCI_MPI_SPAWN_VDATA_TAG, comm) );
    }
    else /* op == RECV */
    {
        MPI_Check( MPI_Recv(ptr, 1, type[stride_levels], proc,
                            ARMCI_MPI_SPAWN_VDATA_TAG, comm, &status) );
    }
}
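/*
 * The nested MPI_Type_hvector construction above is easier to see in
 * isolation.  The following is a minimal standalone sketch (not ARMCI code;
 * the buffer sizes and variable names are invented for illustration) showing
 * how a single hvector level describes a 2-D strided block so that one
 * send/receive moves the whole block.  It uses the non-deprecated name
 * MPI_Type_create_hvector; the ARMCI code above uses the older MPI_Type_hvector.
 */
#include <mpi.h>
#include <stdio.h>
#include <string.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int me;
    MPI_Comm_rank(MPI_COMM_WORLD, &me);

    /* a 6x10 byte array; transfer the 4x5 sub-block starting at [0][0] */
    char src[6][10], dst[4][5];
    memset(src, 'a', sizeof(src));
    memset(dst,  0,  sizeof(dst));

    int count[2]      = {5, 4};   /* 5 contiguous bytes per segment, 4 segments */
    int stride_arr[1] = {10};     /* byte distance between segment starts */

    /* count[1] blocks of count[0] bytes each, stride_arr[0] bytes apart */
    MPI_Datatype block;
    MPI_Type_create_hvector(count[1], count[0], (MPI_Aint) stride_arr[0],
                            MPI_BYTE, &block);
    MPI_Type_commit(&block);

    /* send the strided block to ourselves and receive it contiguously */
    MPI_Sendrecv(&src[0][0], 1, block, me, 0,
                 &dst[0][0], count[0]*count[1], MPI_BYTE, me, 0,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);

    printf("rank %d received %d bytes of the sub-block\n", me, count[0]*count[1]);

    MPI_Type_free(&block);
    MPI_Finalize();
    return 0;
}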
template<typename T>
static MPI_Offset writeToMPI(const std::vector<T>& data, MPI_File f, MPI_Offset base, MPI_Comm comm)
{
    MPI_Offset offset = 0, nbytes = data.size() * sizeof(T);

    /* exclusive prefix sum of the byte counts gives this rank's offset in the
       shared file; MPI_Exscan leaves the result undefined on rank 0, hence the
       initialization of offset to 0 */
    MPI_Check( MPI_Exscan(&nbytes, &offset, 1, MPI_OFFSET, MPI_SUM, comm) );

    MPI_Check( MPI_File_write_at_all(f, base + offset, data.data(), nbytes,
                                     MPI_CHAR, MPI_STATUS_IGNORE) );

    MPI_Offset ntotal = 0;
    MPI_Check( MPI_Allreduce(&nbytes, &ntotal, 1, MPI_OFFSET, MPI_SUM, comm) );

    return ntotal;
}
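/*
 * A standalone sketch of the same MPI_Exscan + MPI_File_write_at_all pattern
 * used by writeToMPI (not part of the code above; the file name and payload
 * are made up): every rank writes its bytes at the prefix sum of the byte
 * counts of the lower ranks, so the per-rank chunks land back to back in one
 * shared file.
 */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    char payload[64];
    MPI_Offset nbytes = snprintf(payload, sizeof(payload), "rank %d\n", rank);

    /* exclusive prefix sum of byte counts; rank 0's result stays 0 */
    MPI_Offset offset = 0;
    MPI_Exscan(&nbytes, &offset, 1, MPI_OFFSET, MPI_SUM, MPI_COMM_WORLD);

    MPI_File f;
    MPI_File_open(MPI_COMM_WORLD, "exscan_demo.txt",
                  MPI_MODE_CREATE | MPI_MODE_WRONLY, MPI_INFO_NULL, &f);
    MPI_File_write_at_all(f, offset, payload, (int) nbytes, MPI_CHAR,
                          MPI_STATUS_IGNORE);
    MPI_File_close(&f);

    MPI_Finalize();
    return 0;
}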
static void armci_gather_hostnames(char **hostname_arr)
{
    int i, j, k, namelen, is_master;
    char hostname[MPI_MAX_PROCESSOR_NAME], *hostnames = NULL;
    int *master_arr = NULL;

    master_arr = (int*)  malloc(armci_nproc * sizeof(int));
    hostnames  = (char*) malloc(armci_nproc * MPI_MAX_PROCESSOR_NAME * sizeof(char));

    if (hostnames == NULL || master_arr == NULL)
    {
        armci_die("armci_gather_hostnames: malloc failed.", 0);
    }

    MPI_Get_processor_name(hostname, &namelen);
    MPI_Check( MPI_Allgather(hostname, MPI_MAX_PROCESSOR_NAME, MPI_CHAR,
                             hostnames, MPI_MAX_PROCESSOR_NAME, MPI_CHAR,
                             MPI_COMM_WORLD) );

    if (armci_me == armci_master)
    {
        is_master = 1;
    }
    else
    {
        is_master = 0;
    }

    MPI_Check( MPI_Allgather(&is_master, 1, MPI_INT, master_arr, 1, MPI_INT,
                             MPI_COMM_WORLD) );

    /* keep only the hostnames of the ARMCI master processes */
    for (i = 0, j = 0, k = 0; i < armci_nproc; i++)
    {
        if (master_arr[i] == 1)
        {
            if (j >= armci_nserver)
                armci_die("armci_gather_hostnames: Invalid masters.", 0);
            strncpy(hostname_arr[j++], &hostnames[k], MPI_MAX_PROCESSOR_NAME);
        }
        k += MPI_MAX_PROCESSOR_NAME;
    }

    free(hostnames);
    free(master_arr);
}
/* NOTE: armci_mpi_strided and armci_mpi_strided2 are the only 2 functions
 * that are common to the client and server parts */
void armci_mpi_strided_c2s(int op, void *ptr, int stride_levels, int stride_arr[],
                           int count[], int proc, MPI_Comm comm)
{
    int i, j;
    long idx;    /* index offset of the current block position relative to ptr */
    int n1dim;   /* number of contiguous (first-dimension) segments to transfer */
    int bvalue[MAX_STRIDE_LEVEL], bunit[MAX_STRIDE_LEVEL];
    MPI_Status status;

    /* total number of contiguous segments */
    n1dim = 1;
    for (i = 1; i <= stride_levels; i++)
        n1dim *= count[i];

    /* calculate the destination indices */
    bvalue[0] = 0; bvalue[1] = 0;
    bunit[0]  = 1; bunit[1]  = 1;
    for (i = 2; i <= stride_levels; i++)
    {
        bvalue[i] = 0;
        bunit[i]  = bunit[i-1] * count[i-1];
    }

    for (i = 0; i < n1dim; i++)
    {
        idx = 0;
        for (j = 1; j <= stride_levels; j++)
        {
            idx += bvalue[j] * stride_arr[j-1];
            if ((i+1) % bunit[j] == 0) bvalue[j]++;
            if (bvalue[j] > (count[j]-1)) bvalue[j] = 0;
        }

        if (op == SEND)
        {
            MPI_Check( MPI_Send(((char*)ptr)+idx, count[0], MPI_BYTE, proc,
                                ARMCI_MPI_CLIENT2SERVER_TAG, comm) );
        }
        else /* op == RECV */
        {
            MPI_Check( MPI_Recv(((char*)ptr)+idx, count[0], MPI_BYTE, proc,
                                ARMCI_MPI_SERVER2CLIENT_TAG, comm, &status) );
        }
    }
}
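/*
 * Standalone sketch (not ARMCI code) of the index generation used in
 * armci_mpi_strided_c2s for a 3-D (stride_levels == 2) block: the
 * bvalue/bunit bookkeeping enumerates the same byte offsets as explicit
 * nested loops, with the innermost stride dimension varying fastest.
 * The counts and byte strides below are made up.
 */
#include <stdio.h>

#define MAX_STRIDE_LEVEL 8

int main(void)
{
    int stride_levels = 2;
    int count[3]      = {16, 2, 3};   /* 16 contiguous bytes, 2 x 3 segments */
    int stride_arr[2] = {32, 128};    /* byte strides of dimensions 1 and 2 */

    int bvalue[MAX_STRIDE_LEVEL], bunit[MAX_STRIDE_LEVEL];
    int i, j, n1dim = 1;

    for (i = 1; i <= stride_levels; i++) n1dim *= count[i];

    bvalue[0] = bvalue[1] = 0;
    bunit[0]  = bunit[1]  = 1;
    for (i = 2; i <= stride_levels; i++)
    {
        bvalue[i] = 0;
        bunit[i]  = bunit[i-1] * count[i-1];
    }

    for (i = 0; i < n1dim; i++)
    {
        long idx = 0;
        for (j = 1; j <= stride_levels; j++)
        {
            idx += (long) bvalue[j] * stride_arr[j-1];
            if ((i+1) % bunit[j] == 0) bvalue[j]++;
            if (bvalue[j] > count[j]-1) bvalue[j] = 0;
        }

        /* the same offset written explicitly: dimension 1 varies fastest */
        long ref = (long)(i % count[1]) * stride_arr[0]
                 + (long)(i / count[1]) * stride_arr[1];
        printf("segment %d: idx=%ld ref=%ld %s\n", i, idx, ref,
               idx == ref ? "ok" : "MISMATCH");
    }
    return 0;
}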
/*\ client receives data from server
\*/
char *armci_ReadFromDirect (int proc, request_header_t *msginfo, int len)
{
    int server;
    int clus_id = armci_clus_id(proc);
    MPI_Status status;

    server = armci_clus_info[clus_id].master;

    armci_mpi2_debug(armci_me, "armci_ReadFromDirect: proc=%d, server=%d, "
                     "msginfo=%p, bytes=%d (op=%d)\n", proc, server, msginfo,
                     len, msginfo->operation);

    MPI_Check( MPI_Recv(msginfo + 1, len, MPI_BYTE, server,
                        ARMCI_MPI_SERVER2CLIENT_TAG, ARMCI_COMM_WORLD, &status) );

    armci_mpi2_debug(armci_me, "recv msg from server(%d), fwd by client %d\n",
                     server, proc);

    {
        int count;
        MPI_Get_count(&status, MPI_BYTE, &count);
        if (count != len)
        {
            armci_mpi2_debug(armci_me, "armci_ReadFromDirect: got %d bytes, "
                             "expected %d bytes\n", count, len);
            armci_die("armci_ReadFromDirect: MPI_Recv failed.", count);
        }
    }

    return (char *) (msginfo + 1);
}
/**
 * Create server processes. This is called in armci_start_server.
 * Must be called after armci_init_clusinfo().
 */
void armci_create_server_MPIprocess()
{
    int rank, size, flag, i;

    MPI_Check(MPI_Initialized(&flag));
    if (flag == 0)
        armci_die("ARMCI error: MPI_Init must be called before PARMCI_Init()", 0);

    MPI_Check(MPI_Comm_rank(ARMCI_COMM_WORLD, &rank));
    MPI_Check(MPI_Comm_size(ARMCI_COMM_WORLD, &size));

    armci_nserver = armci_nclus;

    /* make sure all processes sync here. CHECK: does it ensure a global sync? */
    MPI_Check(MPI_Barrier(ARMCI_COMM_WORLD));

    armci_mpi2_debug(0, "armci_create_server_MPIprocess: Servers spawned!\n");
}
/* Create connections between clients and servers */
void armci_init_connections()
{
    armci_mpi2_debug(0, "armci_init_connections\n");

    _armci_buf_init();    /* CHECK: Is this correct? */

    MPI_Check(MPI_Barrier(ARMCI_COMM_WORLD));

    /* Abhinav Vishnu */
    armci_create_server_MPIprocess();

    armci_mpi2_debug(0, "armci_init_connections completed\n");
}
/*\ client sends request message to server
\*/
int armci_send_req_msg (int proc, void *buf, int bytes)
{
    int server = armci_clus_id(proc);

    armci_mpi2_debug(armci_me, "armci_send_req_msg(): proc=%d, server=%d, "
                     "buf=%p, bytes=%d\n", proc, server, buf, bytes);

    if ( !(server >= 0 && server < armci_nserver) )
        armci_die("armci_send_req_msg: Invalid server.", 0);

#ifdef MULTIPLE_BUFS
    /**
     * Sequentially ordered tags to ensure flow control at the server side.
     * For example, a put followed by a get from a client should be processed
     * IN ORDER at the server side. Without this flow control, the server
     * might process the get request before the put (thus violating ARMCI's
     * ordering semantics).
     */
    ((request_header_t*) buf)->tag = _armci_mpi_tag[server];
    MPI_Check( MPI_Send(buf, bytes, MPI_BYTE, server, ARMCI_MPI_SPAWN_TAG,
                        MPI_COMM_CLIENT2SERVER) );

    _armci_mpi_tag[server]++;
    if (_armci_mpi_tag[server] > ARMCI_MPI_SPAWN_TAG_END)
        _armci_mpi_tag[server] = ARMCI_MPI_SPAWN_TAG_BEGIN;
#else
    MPI_Check( MPI_Send(buf, bytes, MPI_BYTE, server, ARMCI_MPI_SPAWN_TAG,
                        MPI_COMM_CLIENT2SERVER) );
#endif

    armci_mpi2_debug(armci_me, "armci_send_req_msg(): sent msg to server(%d), "
                     "to fwd to client %d\n", server, proc);

    return 0;
}
/* server sends data back to client */
void armci_WriteToDirect (int to, request_header_t *msginfo, void *data)
{
    armci_mpi2_server_debug(armci_server_me, "armci_WriteToDirect: "
                            "to=%d, msginfo=%p, data=%p, bytes=%d\n",
                            to, msginfo, data, msginfo->datalen);

    if ( !(to >= 0 && to < armci_nproc) )
        armci_die("armci_WriteToDirect: send request to invalid client", to);

    MPI_Check( MPI_Send(data, msginfo->datalen, MPI_BYTE, to,
                        ARMCI_MPI_SPAWN_TAG, MPI_COMM_SERVER2CLIENT) );
}
void YmrObject::createCheckpointSymlink(MPI_Comm comm, std::string path,
                                        std::string identifier, std::string extension) const
{
    int rank;
    MPI_Check( MPI_Comm_rank(comm, &rank) );

    if (rank == 0)
    {
        std::string lnname = createCheckpointName      (path, identifier, extension);
        std::string fname  = createCheckpointNameWithId(path, identifier, extension);

        // note: "ln -f" creates a hard link; despite the function name,
        // this is not a symbolic link ("ln -sf" would be)
        std::string command = "ln -f " + fname + " " + lnname;

        if ( system(command.c_str()) != 0 )
            error("Could not create symlink '%s' for checkpoint file '%s'",
                  lnname.c_str(), fname.c_str());
    }
}
/*\ client sends request message to server
\*/
int armci_send_req_msg (int proc, void *buf, int bytes)
{
    int clus_id = armci_clus_id(proc);
    int server;

    /* Abhinav Vishnu */
    server = armci_clus_info[clus_id].master;

    armci_mpi2_debug(armci_me, "armci_send_req_msg(): proc=%d, server=%d, "
                     "buf=%p, bytes=%d\n", proc, server, buf, bytes);

    MPI_Check( MPI_Send(buf, bytes, MPI_BYTE, server,
                        ARMCI_MPI_CLIENT2SERVER_TAG, ARMCI_COMM_WORLD) );

    armci_mpi2_debug(armci_me, "armci_send_req_msg(): sent msg to server(%d), "
                     "to fwd to client %d\n", server, proc);

    return 0;
}
void ParticleDumperPlugin::handshake()
{
    auto req = waitData();
    MPI_Check( MPI_Wait(&req, MPI_STATUS_IGNORE) );
    recv();

    std::vector<int> sizes;
    std::vector<std::string> names;
    SimpleSerializer::deserialize(data, sizes, names);

    auto init_channel = [] (XDMF::Channel::DataForm dataForm, int sz, const std::string& str,
                            XDMF::Channel::NumberType numberType = XDMF::Channel::NumberType::Float,
                            TypeDescriptor datatype = DataTypeWrapper<float>())
    {
        return XDMF::Channel(str, nullptr, dataForm, numberType, datatype);
    };

    // Velocity and id are special channels which are always present
    std::string allNames = "velocity, id";
    channels.push_back(init_channel(XDMF::Channel::DataForm::Vector, 3, "velocity",
                                    XDMF::Channel::NumberType::Float, DataTypeWrapper<float>()));
    channels.push_back(init_channel(XDMF::Channel::DataForm::Scalar, 1, "id",
                                    XDMF::Channel::NumberType::Int64, DataTypeWrapper<int64_t>()));

    for (int i = 0; i < sizes.size(); i++)
    {
        allNames += ", " + names[i];

        switch (sizes[i])
        {
            case 1: channels.push_back(init_channel(XDMF::Channel::DataForm::Scalar,  sizes[i], names[i])); break;
            case 3: channels.push_back(init_channel(XDMF::Channel::DataForm::Vector,  sizes[i], names[i])); break;
            case 6: channels.push_back(init_channel(XDMF::Channel::DataForm::Tensor6, sizes[i], names[i])); break;
            default:
                die("Plugin '%s' got %d as a channel '%s' size, expected 1, 3 or 6",
                    name.c_str(), sizes[i], names[i].c_str());
        }
    }

    // Create the required folder
    createFoldersCollective(comm, parentPath(path));

    debug2("Plugin '%s' was set up to dump channels %s. Path is %s",
           name.c_str(), allNames.c_str(), path.c_str());
}
/*\ client receives data from server
\*/
char *armci_ReadFromDirect (int proc, request_header_t *msginfo, int len)
{
    int server = armci_clus_id(proc);
    MPI_Status status;

    armci_mpi2_debug(armci_me, "armci_ReadFromDirect: proc=%d, server=%d, "
                     "msginfo=%p, bytes=%d (op=%d)\n", proc, server, msginfo,
                     len, msginfo->operation);

    if ( !(server >= 0 && server < armci_nserver) )
        armci_die("armci_ReadFromDirect: Invalid server.", 0);

    MPI_Check( MPI_Recv(msginfo + 1, len, MPI_BYTE, server,
                        ARMCI_MPI_SPAWN_TAG, MPI_COMM_CLIENT2SERVER, &status) );

    armci_mpi2_debug(armci_me, "recv msg from server(%d), fwd by client %d\n",
                     server, proc);

#if MPI_SPAWN_DEBUG
    {
        int count;
        MPI_Get_count(&status, MPI_BYTE, &count);
        if (count != len)
        {
            armci_mpi2_debug(armci_me, "armci_ReadFromDirect: got %d bytes, "
                             "expected %d bytes\n", count, len);
            armci_die("armci_ReadFromDirect: MPI_Recv failed.", count);
        }
    }
#endif

    return (char *) (msginfo + 1);
}
static void writePLY(
        MPI_Comm comm, std::string fname,
        int nvertices, int nverticesPerObject,
        int ntriangles, int ntrianglesPerObject,
        int nObjects,
        const std::vector<int3>& mesh,
        const std::vector<float3>& vertices)
{
    int rank;
    MPI_Check( MPI_Comm_rank(comm, &rank) );

    int totalVerts = 0;
    MPI_Check( MPI_Reduce(&nvertices, &totalVerts, 1, MPI_INT, MPI_SUM, 0, comm) );

    int totalTriangles = 0;
    MPI_Check( MPI_Reduce(&ntriangles, &totalTriangles, 1, MPI_INT, MPI_SUM, 0, comm) );

    MPI_File f;
    // open with DELETE_ON_CLOSE and close immediately to remove any stale file,
    // then reopen for the actual write
    MPI_Check( MPI_File_open(comm, fname.c_str(),
                             MPI_MODE_CREATE | MPI_MODE_DELETE_ON_CLOSE | MPI_MODE_WRONLY,
                             MPI_INFO_NULL, &f) );
    MPI_Check( MPI_File_close(&f) );
    MPI_Check( MPI_File_open(comm, fname.c_str(),
                             MPI_MODE_WRONLY | MPI_MODE_CREATE,
                             MPI_INFO_NULL, &f) );

    int headerSize = 0;
    MPI_Offset fileOffset = 0;

    if (rank == 0)
    {
        std::stringstream ss;

        ss << "ply\n";
        ss << "format binary_little_endian 1.0\n";
        ss << "element vertex " << totalVerts << "\n";
        ss << "property float x\nproperty float y\nproperty float z\n";
        //ss << "property float xnormal\nproperty float ynormal\nproperty float znormal\n";
        ss << "element face " << totalTriangles << "\n";
        ss << "property list int int vertex_index\n";
        ss << "end_header\n";

        std::string content = ss.str();
        headerSize = content.length();
        MPI_Check( MPI_File_write_at(f, fileOffset, content.c_str(), headerSize,
                                     MPI_CHAR, MPI_STATUS_IGNORE) );
    }

    MPI_Check( MPI_Bcast(&headerSize, 1, MPI_INT, 0, comm) );
    fileOffset += headerSize;

    fileOffset += writeToMPI(vertices, f, fileOffset, comm);

    int verticesOffset = 0;
    MPI_Check( MPI_Exscan(&nvertices, &verticesOffset, 1, MPI_INT, MPI_SUM, comm) );

    std::vector<int4> connectivity;
    for (int j = 0; j < nObjects; ++j)
        for (int i = 0; i < ntrianglesPerObject; ++i)
        {
            int3 vertIds = mesh[i] + nverticesPerObject * j + verticesOffset;
            connectivity.push_back({3, vertIds.x, vertIds.y, vertIds.z});
        }

    fileOffset += writeToMPI(connectivity, f, fileOffset, comm);

    MPI_Check( MPI_File_close(&f) );
}
static void armci_mpi2_spawn()
{
    int i;
    char server_program[100];
    char **command_arr = NULL, **hostname_arr = NULL, **nid_arr = NULL;
    int *size_arr = NULL;
    MPI_Info *info_arr;

    /* we need to start 1 data server process on each node, so a total of
     * "armci_nclus" data servers */
    armci_nserver = armci_nclus;
    select_server_program(server_program, armci_nserver);

    armci_mpi2_debug(0, "armci_mpi2_init(): Spawning %d data server processes "
                     "running %s\n", armci_nserver, server_program);

    /* allocate necessary data structures */
    {
        command_arr  = (char**)    malloc(armci_nserver * sizeof(char*));
        size_arr     = (int*)      malloc(armci_nserver * sizeof(int));
        info_arr     = (MPI_Info*) malloc(armci_nserver * sizeof(MPI_Info));
        hostname_arr = (char**)    malloc(armci_nserver * sizeof(char*));
#ifdef SPAWN_CRAY_XT
        nid_arr      = (char**)    malloc(armci_nserver * sizeof(char*));
#endif
        for (i = 0; i < armci_nserver; i++)
        {
            hostname_arr[i] = (char*) malloc(MPI_MAX_PROCESSOR_NAME * sizeof(char));
        }

        if (command_arr == NULL || size_arr == NULL || info_arr == NULL ||
            hostname_arr == NULL)
        {
            armci_die("armci_mpi2_spawn: malloc failed.", 0);
        }
    }

    /**
     * 1. The root process collects the hostnames (i.e. machine names) of where to
     * spawn the data servers. The ARMCI masters of the respective nodes return
     * their hostnames.
     */
    armci_gather_hostnames(hostname_arr);

    /** 2. initialize the MPI_Comm_spawn_multiple() arguments */
    {
        for (i = 0; i < armci_nserver; i++)
        {
            command_arr[i] = (*_armci_argv)[0];  /* CHECK: path needs fix */
            size_arr[i]    = 1;                  /* 1 data server on each node */
            MPI_Info_create(&info_arr[i]);
#ifdef SPAWN_CRAY_XT
            asprintf(&nid_arr[i], "%d", atoi(hostname_arr[i] + 3));
            MPI_Info_set(info_arr[i], "host", nid_arr[i]);      /* portability? */
#else
            MPI_Info_set(info_arr[i], "host", hostname_arr[i]); /* portability? */
#endif
        }
    }

    /**
     * 3. MPI_Comm_spawn_multiple(): This is a collective call.
     * The intercommunicator "ds_intercomm" contains only the new data server
     * processes.
     */
    MPI_Check( MPI_Comm_spawn_multiple(armci_nserver, command_arr, MPI_ARGVS_NULL,
                                       size_arr, info_arr, ARMCI_ROOT, MPI_COMM_WORLD,
                                       &MPI_COMM_CLIENT2SERVER, MPI_ERRCODES_IGNORE) );

    for (i = 0; i < armci_nserver; i++) free(hostname_arr[i]);
    free(command_arr);
    free(size_arr);
    free(info_arr);
    free(hostname_arr);
#ifdef SPAWN_CRAY_XT
    free(nid_arr);
#endif
}
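/*
 * For readers unfamiliar with MPI dynamic process management, the following is
 * a minimal standalone sketch (not ARMCI code) of the spawn pattern used
 * above: the parent ranks collectively launch extra processes and talk to them
 * over an intercommunicator, while the spawned processes recognize their role
 * through MPI_Comm_get_parent().  "./worker" is a hypothetical executable name
 * (it can be this same program, since spawned copies take the else branch).
 */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    MPI_Comm parent;
    MPI_Comm_get_parent(&parent);

    if (parent == MPI_COMM_NULL)
    {
        /* parent side: collectively spawn 2 worker processes */
        MPI_Comm intercomm;
        MPI_Comm_spawn("./worker", MPI_ARGV_NULL, 2, MPI_INFO_NULL, 0,
                       MPI_COMM_WORLD, &intercomm, MPI_ERRCODES_IGNORE);

        int rank;
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        if (rank == 0)
        {
            int msg = 42;
            /* destination 0 refers to rank 0 of the spawned (remote) group */
            MPI_Send(&msg, 1, MPI_INT, 0, 0, intercomm);
        }
    }
    else
    {
        /* spawned side: ranks are numbered within the child group */
        int rank;
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        if (rank == 0)
        {
            int msg;
            MPI_Recv(&msg, 1, MPI_INT, 0, 0, parent, MPI_STATUS_IGNORE);
            printf("spawned rank 0 got %d from the parent\n", msg);
        }
    }

    MPI_Finalize();
    return 0;
}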
void armci_call_data_server()
{
    int p = -1;
    MPI_Status status;

    armci_mpi2_server_debug(0, "armci_call_data_server(): Server main loop\n");

#if !defined(MULTIPLE_BUFS)
    /* server main loop: wait for and service requests until QUIT is requested */
    for (;;)
    {
        MPI_Check( MPI_Probe(MPI_ANY_SOURCE, ARMCI_MPI_SPAWN_TAG,
                             MPI_COMM_SERVER2CLIENT, &status) );

        p = status.MPI_SOURCE;
        armci_mpi2_server_debug(armci_server_me,
                                "Processing message from client %d\n", p);

        armci_data_server(&p);
    }
#else
    int i, tag, reqid, do_waitlist = 0;

    /* server multiple-buffer setup */
    _req_waitlist_head = NULL;
    _req_waitlist_tail = NULL;

    /* Initialize the "next tag" array, which manages flow control */
    if ( (_next_tag = (int*) malloc(armci_nproc * sizeof(int))) == NULL)
        armci_die("mpi2_server: _next_tag malloc failed", 0);
    for (i = 0; i < armci_nproc; i++)
        _next_tag[i] = ARMCI_MPI_SPAWN_TAG_BEGIN;

    /* the server posts multiple receive buffers in advance */
    for (i = 0; i < MPI2_MAX_BUFS; i++)
    {
        MPI_Check( MPI_Irecv(_mpi2_rcv_buf[i], MSG_BUFLEN, MPI_BYTE,
                             MPI_ANY_SOURCE, ARMCI_MPI_SPAWN_TAG,
                             MPI_COMM_SERVER2CLIENT, &_mpi_request[i]) );
    }

    for (;;)
    {
        /* process wait-listed requests, if any */
        do_waitlist = 0;
        if (_req_waitlist_head != NULL)
        {
            do_waitlist = wlist_get_req(&p, &tag, &reqid);
        }

        if (!do_waitlist)
        {
            /* process the first completed incoming request */
            MPI_Check( MPI_Waitany(MPI2_MAX_BUFS, _mpi_request, &reqid, &status) );
            p = status.MPI_SOURCE;
            /* tag = status.MPI_TAG; */
            tag = ((request_header_t*) _mpi2_rcv_buf[reqid])->tag;

            /* check whether the request arrived in or out of order */
            if (tag == _next_tag[p])
            {
                INCR_TAG(p);
            }
            else
            {
                /* out-of-order request: enforce ordering by wait-listing it */
                wlist_add_req(reqid, p, tag);
                continue;
            }
        }

        /* mark the request id that is ready to be processed */
        _reqid_ready = reqid;

        /* the server processes the incoming (or wait-listed) request */
        armci_data_server(&p);

        /* After completing the request (which also frees a buffer), the server
         * posts a receive using this buffer */
        MPI_Check( MPI_Irecv(_mpi2_rcv_buf[reqid], MSG_BUFLEN, MPI_BYTE,
                             MPI_ANY_SOURCE, ARMCI_MPI_SPAWN_TAG,
                             MPI_COMM_SERVER2CLIENT, &_mpi_request[reqid]) );
    }
#endif
}
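/*
 * Standalone sketch (not ARMCI code) of the MULTIPLE_BUFS idea in
 * armci_call_data_server: the server pre-posts several MPI_Irecv buffers and
 * services whichever completes first with MPI_Waitany, re-posting the buffer
 * afterwards.  Run with 2 or more ranks; the message counts and tag value are
 * made up, and there is no tag-based ordering or wait-list here.
 */
#include <mpi.h>
#include <stdio.h>

#define NBUF 4
#define MSGS_PER_CLIENT 8
#define TAG 99

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    if (rank == 0)                        /* the "data server" */
    {
        int bufs[NBUF];
        MPI_Request req[NBUF];
        int i, idx, expected = (size - 1) * MSGS_PER_CLIENT;

        for (i = 0; i < NBUF; i++)        /* pre-post the receive buffers */
            MPI_Irecv(&bufs[i], 1, MPI_INT, MPI_ANY_SOURCE, TAG,
                      MPI_COMM_WORLD, &req[i]);

        for (i = 0; i < expected; i++)
        {
            MPI_Status status;
            MPI_Waitany(NBUF, req, &idx, &status);   /* first completed buffer */
            printf("server: request %d from client %d via buffer %d\n",
                   bufs[idx], status.MPI_SOURCE, idx);

            /* re-post the buffer that was just consumed */
            MPI_Irecv(&bufs[idx], 1, MPI_INT, MPI_ANY_SOURCE, TAG,
                      MPI_COMM_WORLD, &req[idx]);
        }

        /* cancel and complete the receives still pending before shutting down */
        for (i = 0; i < NBUF; i++)
        {
            MPI_Cancel(&req[i]);
            MPI_Wait(&req[i], MPI_STATUS_IGNORE);
        }
    }
    else                                  /* clients */
    {
        int i;
        for (i = 0; i < MSGS_PER_CLIENT; i++)
            MPI_Send(&i, 1, MPI_INT, 0, TAG, MPI_COMM_WORLD);
    }

    MPI_Finalize();
    return 0;
}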
/* server receives request */
void armci_rcv_req (void *mesg, void *phdr, void *pdescr, void *pdata, int *buflen)
{
    request_header_t *msginfo = NULL;
    int hdrlen = sizeof(request_header_t);
    int p = -1;
    int bytes;

#if !defined(MULTIPLE_BUFS)
    MPI_Status status;
    msginfo = (request_header_t*) MessageRcvBuffer;
    p = * (int *) mesg;

    MPI_Check( MPI_Recv(MessageRcvBuffer, MSG_BUFLEN, MPI_BYTE, p,
                        ARMCI_MPI_SPAWN_TAG, MPI_COMM_SERVER2CLIENT, &status) );
#else
    int reqid = _reqid_ready;  /* get the request id that is ready to be processed */
    msginfo = (request_header_t*) _mpi2_rcv_buf[reqid];
    p = * (int *) mesg;
    if (p != msginfo->from)
        armci_die("armci_rcv_req: invalid client", p);
#endif

    * (void **) phdr = msginfo;

    if ( !(p >= 0 && p < armci_nproc) )
        armci_die("armci_rcv_req: request from invalid client", p);

    armci_mpi2_server_debug(armci_server_me,
                            "armci_rcv_req: op=%d mesg=%p, phdr=%p "
                            "pdata=%p, buflen=%p, p=%d\n", msginfo->operation,
                            mesg, phdr, pdata, buflen, p, MSG_BUFLEN);

#ifdef MPI_SPAWN_ZEROCOPY
    if (msginfo->operation == PUT && msginfo->datalen == 0)
    {
        if (msginfo->format == STRIDED)
        {
            armci_mpi_rcv_strided_data(msginfo, pdescr, p);
        }
        if (msginfo->format == VECTOR)
        {
            armci_mpi_rcv_vector_data(msginfo, pdescr, p);
        }
        return;
    }
#endif

    *buflen = MSG_BUFLEN - hdrlen;
    if (msginfo->operation == GET)
    {
        bytes = msginfo->dscrlen;
    }
    else
    {
        bytes = msginfo->bytes;
        if (bytes > *buflen)
            armci_die2("armci_rcv_req: message overflowing rcv buf",
                       msginfo->bytes, *buflen);
    }

#if MPI_SPAWN_DEBUG && !defined(MPI_SPAWN_ZEROCOPY) && 0
    {
        int count;
        MPI_Get_count(&status, MPI_BYTE, &count);
        if (count != (bytes + hdrlen))
        {
            armci_mpi2_server_debug(armci_server_me, "armci_rcv_req: "
                                    "got %d bytes, expected %d bytes\n",
                                    count, bytes + hdrlen);
            printf("%d: armci_rcv_req: got %d bytes, expected %d bytes (%d)\n",
                   armci_me, count, bytes + hdrlen, msginfo->datalen);
            armci_die("armci_rcv_req: count check failed.\n", 0);
        }
    }
#endif

    if (msginfo->bytes)
    {
        /* the request buffer is laid out as [header | descriptor | data] */
        * (void **) pdescr = msginfo + 1;
        * (void **) pdata  = msginfo->dscrlen + (char *) (msginfo + 1);
        *buflen -= msginfo->dscrlen;

        if (msginfo->operation != GET && msginfo->datalen)
        {
            *buflen -= msginfo->datalen;
        }
    }
    else
    {
        * (void**) pdata  = msginfo + 1;
        * (void**) pdescr = NULL;
    }

    if (msginfo->datalen > 0 && msginfo->operation != GET)
    {
        if (msginfo->datalen > (MSG_BUFLEN - hdrlen - msginfo->dscrlen))
        {
            armci_die2("armci_rcv_req: data overflowing buffer",
                       msginfo->dscrlen, msginfo->datalen);
        }
        *buflen -= msginfo->datalen;
    }
}
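/*
 * Sketch (not ARMCI code) of the buffer layout that armci_rcv_req decodes: a
 * request arrives as one contiguous message laid out as
 * [ header | descriptor (dscrlen bytes) | data (datalen bytes) ], and the
 * descriptor/data pointers are recovered with pointer arithmetic.  The header
 * struct below is a simplified stand-in for the real request_header_t.
 */
#include <stdio.h>
#include <string.h>

typedef struct {
    int operation;
    int dscrlen;    /* bytes of descriptor following the header */
    int datalen;    /* bytes of payload following the descriptor */
} mini_header_t;

int main(void)
{
    char msgbuf[256];

    /* build a fake request: header + 8-byte descriptor + 11-byte payload */
    mini_header_t *hdr = (mini_header_t *) msgbuf;
    hdr->operation = 1;
    hdr->dscrlen   = 8;
    hdr->datalen   = 11;
    memcpy((char *)(hdr + 1), "DESCRIPT", 8);
    memcpy((char *)(hdr + 1) + 8, "hello world", 11);

    /* decode it the same way armci_rcv_req does */
    void *pdescr = hdr + 1;                            /* just past the header */
    void *pdata  = (char *)(hdr + 1) + hdr->dscrlen;   /* just past the descriptor */
    int   buflen = (int) sizeof(msgbuf) - (int) sizeof(mini_header_t)
                 - hdr->dscrlen - hdr->datalen;        /* space left in the buffer */

    printf("descriptor: %.8s, data: %.11s, remaining buffer: %d bytes\n",
           (char *) pdescr, (char *) pdata, buflen);
    return 0;
}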