/*\ client sends strided data + request to server \*/
int armci_send_req_msg_strided(int proc, request_header_t *msginfo, char *ptr,
                               int strides, int stride_arr[], int count[])
{
    /* route the request through the master process of the target cluster */
    int master = armci_clus_info[armci_clus_id(proc)].master;
    int hdr_bytes = sizeof(request_header_t) + msginfo->dscrlen;

    armci_mpi2_debug(armci_me, "armci_send_req_msg_strided: proc=%d server=%d "
                     "bytes=%d (op=%d)\n", proc, master, msginfo->datalen,
                     msginfo->operation);

    /* we write header + descriptor of strided data */
    armci_send_req_msg(proc, msginfo, hdr_bytes);

    /* for larger blocks write directly thus avoiding memcopy */
    armci_mpi_strided_c2s(SEND, ptr, strides, stride_arr, count, master,
                          ARMCI_COMM_WORLD);

    armci_mpi2_debug(armci_me, "armci_send_req_msg_strided(): send msg to "
                     "server(%d), to fwd to client %d\n", master, proc);

    return 0;
}
/*\ client receives data from server \*/
char *armci_ReadFromDirect (int proc, request_header_t *msginfo, int len)
{
    MPI_Status status;
    int recvd = 0;
    /* replies come from the master process of the target's cluster */
    int master = armci_clus_info[armci_clus_id(proc)].master;

    armci_mpi2_debug(armci_me, "armci_ReadFromDirect: proc=%d, server=%d, "
                     "msginfo=%p, bytes=%d (op=%d)\n", proc, master, msginfo,
                     len, msginfo->operation);

    /* payload is received directly behind the request header */
    MPI_Check(MPI_Recv(msginfo + 1, len, MPI_BYTE, master,
                       ARMCI_MPI_SERVER2CLIENT_TAG, ARMCI_COMM_WORLD,
                       &status));

    armci_mpi2_debug(armci_me, "recv msg from server(%d), fwd by client %d\n",
                     master, proc);

    /* sanity check: the server must deliver exactly the requested bytes */
    MPI_Get_count(&status, MPI_BYTE, &recvd);
    if (recvd != len) {
        armci_mpi2_debug(armci_me, "armci_ReadFromDirect: got %d bytes, "
                         "expected %d bytes\n", recvd, len);
        armci_die("armci_ReadFromDirect: MPI_Recv failed.", recvd);
    }

    return (char *) (msginfo + 1);
}
/* Create connections between clients and servers */
void armci_init_connections()
{
    armci_mpi2_debug(0, "armci_init_connections\n");

    _armci_buf_init();    /* CHECK: Is this correct ? */

    /* make sure every process reaches this point before the server
     * bootstrap starts */
    MPI_Check(MPI_Barrier(ARMCI_COMM_WORLD));

    armci_create_server_MPIprocess();

    armci_mpi2_debug(0, "armci_init_connections completed\n");
}
/*\ client receives strided data from server \*/
void armci_ReadStridedFromDirect(int proc, request_header_t* msginfo,
                                 void *ptr, int strides, int stride_arr[],
                                 int count[])
{
    /* in this transport the data-server rank equals the cluster id */
    int srv = armci_clus_id(proc);

    armci_mpi2_debug(armci_me, "armci_ReadStridedFromDirect: proc=%d "
                     "stride_levels=%d, server=%d bytes=%d (op=%d)\n", proc,
                     strides, srv, msginfo->datalen, msginfo->operation);

    if (srv < 0 || srv >= armci_nserver)
        armci_die("armci_ReadStridedFromDirect: Invalid server.", 0);

#ifdef MPI_USER_DEF_DATATYPE
    if (strides > 0) {
        armci_mpi_strided2(RECV, ptr, strides, stride_arr, count, srv,
                           MPI_COMM_CLIENT2SERVER);
        return;
    }
#endif
    armci_mpi_strided(RECV, ptr, strides, stride_arr, count, srv,
                      MPI_COMM_CLIENT2SERVER);
}
void armci_wait_for_server()
{
    armci_mpi2_debug(0, "armci_wait_for_server: wait for server to quit\n");

    /* only the cluster master signals the server to quit */
    if (armci_me != armci_master)
        return;

    armci_serv_quit();
}
/**
 * Validate an MPI return code; abort the run via armci_die() on any
 * status other than MPI_SUCCESS.
 *
 * Returns the status unchanged on success so callers may chain it.
 * (Fix: the original was declared to return int but fell off the end
 * without a return statement — undefined behavior if the value is used.)
 */
static inline int MPI_Check (int status)
{
    if (status != MPI_SUCCESS) {
        armci_mpi2_debug(armci_me, "MPI Check failed.\n");
        armci_die("MPI_Check failed.", 0);
    }
    return status;
}
/*\ client sends request message to server \*/
int armci_send_req_msg (int proc, void *buf, int bytes)
{
    /* requests go to the master process of the node owning 'proc' */
    int master = armci_clus_info[armci_clus_id(proc)].master;

    armci_mpi2_debug(armci_me, "armci_send_req_msg(): proc=%d, server=%d, "
                     "buf=%p, bytes=%d\n", proc, master, buf, bytes);

    MPI_Check(MPI_Send(buf, bytes, MPI_BYTE, master,
                       ARMCI_MPI_CLIENT2SERVER_TAG, ARMCI_COMM_WORLD));

    armci_mpi2_debug(armci_me, "armci_send_req_msg(): send msg to server(%d), to"
                     "fwd to client %d\n", master, proc);

    return 0;
}
/*\ client sends request message to server \*/
int armci_send_req_msg (int proc, void *buf, int bytes)
{
    /* spawn model: the data server's rank inside MPI_COMM_CLIENT2SERVER
     * equals the target process's cluster id */
    int server = armci_clus_id(proc);

    armci_mpi2_debug(armci_me, "armci_send_req_msg(): proc=%d, server=%d, "
                     "buf=%p, bytes=%d\n", proc, server, buf, bytes);

    if( !(server >= 0 && server < armci_nserver) )
       armci_die("armci_send_req_msg: Invalid server.", 0);

#ifdef MULTIPLE_BUFS
    /**
     * Sequentially ordered tags to ensure flow control at the server side.
     * For example, a put followed by get from a client should be processed in
     * ORDER at the server side. If we don't have the flow control, the server
     * might process the get request first instead of put (and thus violating
     * ARMCI's ordering semantics.
     */
    /* stamp the request header with this server's next sequence tag */
    ((request_header_t*)buf)->tag = _armci_mpi_tag[server];
    MPI_Check(
       MPI_Send(buf, bytes, MPI_BYTE, server, ARMCI_MPI_SPAWN_TAG,
                MPI_COMM_CLIENT2SERVER)
       );
    /* advance the per-server tag, wrapping within the reserved range */
    _armci_mpi_tag[server]++;
    if(_armci_mpi_tag[server] > ARMCI_MPI_SPAWN_TAG_END)
       _armci_mpi_tag[server] = ARMCI_MPI_SPAWN_TAG_BEGIN;
#else
    /* single-buffer build: no per-request sequence tags are needed */
    MPI_Check(
       MPI_Send(buf, bytes, MPI_BYTE, server, ARMCI_MPI_SPAWN_TAG,
                MPI_COMM_CLIENT2SERVER)
       );
#endif

    armci_mpi2_debug(armci_me, "armci_send_req_msg(): send msg to server(%d), to"
                     "fwd to client %d\n", server, proc);

    return 0;
}
/*\ client receives data from server \*/
char *armci_ReadFromDirect (int proc, request_header_t *msginfo, int len)
{
    /* spawn model: server rank == cluster id within MPI_COMM_CLIENT2SERVER */
    int server = armci_clus_id(proc);
    MPI_Status status;

    armci_mpi2_debug(armci_me, "armci_ReadFromDirect: proc=%d, server=%d, "
                     "msginfo=%p, bytes=%d (op=%d)\n", proc, server, msginfo,
                     len, msginfo->operation);

    if( !(server >= 0 && server < armci_nserver) )
       armci_die("armci_ReadFromDirect: Invalid server.", 0);

    /* the reply payload is received directly behind the request header */
    MPI_Check(
       MPI_Recv(msginfo + 1, len, MPI_BYTE, server, ARMCI_MPI_SPAWN_TAG,
                MPI_COMM_CLIENT2SERVER, &status)
       );

    armci_mpi2_debug(armci_me, "recv msg from server(%d), fwd by client %d\n",
                     server, proc);

#if MPI_SPAWN_DEBUG
    /* debug builds verify the server delivered exactly 'len' bytes */
    {
       int count;
       MPI_Get_count(&status, MPI_BYTE, &count);
       if (count != len) {
          armci_mpi2_debug(armci_me, "armci_ReadFromDirect: got %d bytes, "
                           "expected %d bytes\n", count, len);
          armci_die("armci_ReadFromDirect: MPI_Recv failed.", count);
       }
    }
#endif

    return (char *) (msginfo+1);
}
/*\ client sends strided data + request to server \*/
int armci_send_req_msg_strided(int proc, request_header_t *msginfo,char *ptr,
                               int strides, int stride_arr[], int count[])
{
    /* spawn model: server rank == cluster id of the target process */
    int server = armci_clus_id(proc);
    int bytes;

    armci_mpi2_debug(armci_me, "armci_send_req_msg_strided: proc=%d server=%d "
                     "bytes=%d (op=%d)\n", proc, server, msginfo->datalen,
                     msginfo->operation);

    /* serialize network access for multi-threaded clients */
    THREAD_LOCK(armci_user_threads.net_lock);

    /* we write header + descriptor of strided data */
    bytes = sizeof(request_header_t) + msginfo->dscrlen;
    armci_send_req_msg(proc, msginfo, bytes);

#ifdef MPI_USER_DEF_DATATYPE
    if(strides>0)
    {
       /* presumably ships the block via a user-defined MPI datatype
        * (see armci_mpi_strided2) — avoids the element-wise path below */
       armci_mpi_strided2(SEND, ptr, strides, stride_arr, count, server,
                          MPI_COMM_CLIENT2SERVER);
    }
    else
#endif
    {
       /* for larger blocks write directly thus avoiding memcopy */
       armci_mpi_strided(SEND, ptr, strides, stride_arr, count, server,
                         MPI_COMM_CLIENT2SERVER);
    }

    THREAD_UNLOCK(armci_user_threads.net_lock);

    armci_mpi2_debug(armci_me, "armci_send_req_msg_strided(): send msg to "
                     "server(%d), to fwd to client %d\n", server, proc);

    return 0;
}
/** * Create server processes. This is called in armci_start_server. * Must be called after armci_init_clusinfo(). */ void armci_create_server_MPIprocess () { int rank, size, flag, i; MPI_Check(MPI_Initialized(&flag)); if (flag == 0) armci_die("ARMCI error: MPI_Init must be called before PARMCI_Init()",0); MPI_Check(MPI_Comm_rank(ARMCI_COMM_WORLD, &rank)); MPI_Check(MPI_Comm_size(ARMCI_COMM_WORLD, &size)); armci_nserver = armci_nclus; /* makesure all processes sync here. CHECK: does it ensure global sync ? */ MPI_Check(MPI_Barrier(ARMCI_COMM_WORLD)); armci_mpi2_debug(0, "armci_create_server_MPIprocess: Servers spawned!\n"); }
/*\ client receives strided data from server \*/
void armci_ReadStridedFromDirect(int proc, request_header_t* msginfo,
                                 void *ptr, int strides, int stride_arr[],
                                 int count[])
{
    /* strided replies come from the master of the target's cluster */
    int master = armci_clus_info[armci_clus_id(proc)].master;

    armci_mpi2_debug(armci_me, "armci_ReadStridedFromDirect: proc=%d "
                     "stride_levels=%d, server=%d bytes=%d (op=%d)\n", proc,
                     strides, master, msginfo->datalen, msginfo->operation);

    armci_mpi_strided_c2s(RECV, ptr, strides, stride_arr, count, master,
                          ARMCI_COMM_WORLD);
}
void armci_client_connect_to_servers()
{
    /* no explicit connection setup is performed here; this function only
     * emits a debug trace */
    armci_mpi2_debug(0, "armci_client_connect_to_servers\n");
}
/* Create connections between clients and servers */
void armci_init_connections()
{
    armci_mpi2_debug(0, "armci_init_connections\n");

    /* set up the internal request buffers */
    _armci_buf_init();    /* CHECK: Is this correct ? */
}
/** * Create server processes. This is called in armci_start_server. * Must be called after armci_init_clusinfo(). */ void armci_create_server_MPIprocess () { int rank, size, flag, i; MPI_Initialized(&flag); if (flag == 0) armci_die("ARMCI error: MPI_Init must be called before PARMCI_Init()",0); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &size); /* spawn one data server process (i.e. additional MPI proc) on each node */ armci_mpi2_spawn(); /** * Armci masters send the following info to their corresponding server as * the server was not part of the initialization step in PARMCI_Init() * 1. cluster info ( i.e. armci_init_clusinfo() ) * 2. lock info ( i.e.armci_allocate_locks() ) */ if(armci_me == armci_master) { int msg[3]; long shm_info[3], shmoffset; int shmid; size_t shmsize; /** * 1. Cluster info */ msg[0] = ARMCI_MPI_SPAWN_INIT_TAG + armci_clus_me; /* for validation */ msg[1] = armci_me; msg[2] = armci_clus_info[armci_clus_me].nslave; MPI_Send(msg, 3, MPI_INT, armci_clus_me, ARMCI_MPI_SPAWN_INIT_TAG, MPI_COMM_CLIENT2SERVER); /* send the entire clus info to its data server */ MPI_Send(armci_clus_info, armci_nclus*sizeof(armci_clus_t), MPI_BYTE, armci_clus_me, ARMCI_MPI_SPAWN_INIT_TAG, MPI_COMM_CLIENT2SERVER); /** * 2. lock info */ armci_get_shmem_info((char*)_armci_int_mutexes, &shmid, &shmoffset, &shmsize); shm_info[0] = (long) shmid; shm_info[1] = (long) shmoffset; shm_info[2] = (long) shmsize; MPI_Send(shm_info, 3, MPI_LONG, armci_clus_me, ARMCI_MPI_SPAWN_INIT_TAG, MPI_COMM_CLIENT2SERVER); } /* initialize tags for flow control */ _armci_mpi_tag = (int*) malloc(armci_nserver*sizeof(int)); for(i=0; i<armci_nserver; i++) _armci_mpi_tag[i]=ARMCI_MPI_SPAWN_TAG_BEGIN; /* makesure all processes sync here. CHECK: does it ensure global sync ? */ MPI_Barrier(MPI_COMM_WORLD); armci_mpi2_debug(0, "armci_create_server_MPIprocess: Servers spawned!\n"); }
static void armci_mpi2_spawn() { int i; char server_program[100]; char **command_arr=NULL, **hostname_arr=NULL, **nid_arr=NULL; int *size_arr=NULL; MPI_Info *info_arr; /* we need to start 1 data server process on each node. So a total of "armci_nclus" data servers */ armci_nserver = armci_nclus; select_server_program(server_program, armci_nserver); armci_mpi2_debug(0, "armci_mpi2_init(): Spawning %d data server processes " "running %s\n", armci_nserver, server_program); /* allocate necessary data structures */ { command_arr = (char**) malloc(armci_nserver * sizeof(char*)); size_arr = (int*) malloc(armci_nserver * sizeof(int)); info_arr = (MPI_Info*) malloc(armci_nserver * sizeof(MPI_Info)); hostname_arr = (char**) malloc(armci_nserver * sizeof(char*)); #ifdef SPAWN_CRAY_XT nid_arr = (char**) malloc(armci_nserver * sizeof(char*));; #endif for(i=0; i<armci_nserver; i++) { hostname_arr[i] = (char*)malloc(MPI_MAX_PROCESSOR_NAME*sizeof(char)); } if(command_arr==NULL || size_arr==NULL || info_arr==NULL || hostname_arr==NULL) { armci_die("armci_mpi2_spawn: malloc failed.", 0); } } /** * 1. root process collects hostnames (i.e. machine names) of where to * spawn dataservers. ARMCI masters of respective node will return their * hostnames. */ armci_gather_hostnames(hostname_arr); /** 2. initialize MPI_Comm_spawn_multiple() arguments */ { for(i=0; i<armci_nserver; i++) { command_arr[i] = (*_armci_argv)[0]; /*CHECK: path needs fix */ size_arr[i] = 1; /* 1 data server in each node */ MPI_Info_create(&info_arr[i]); #ifdef SPAWN_CRAY_XT asprintf(&nid_arr[i], "%d", atoi((hostname_arr[i] + 3))); MPI_Info_set(info_arr[i], "host", nid_arr[i]); /*portability? */ #else MPI_Info_set(info_arr[i], "host", hostname_arr[i]); /*portability? */ #endif } } /** * 3. MPI_Comm_spawn_multiple(): This is a collective call. * Intercommunicator "ds_intercomm" contains only new dataserver processes. 
*/ MPI_Check( MPI_Comm_spawn_multiple(armci_nserver, command_arr, MPI_ARGVS_NULL, size_arr, info_arr, ARMCI_ROOT, MPI_COMM_WORLD, &MPI_COMM_CLIENT2SERVER, MPI_ERRCODES_IGNORE) ); { for(i=0; i<armci_nserver; i++) free(hostname_arr[i]); free(command_arr); free(size_arr); free(info_arr); free(hostname_arr); #ifdef SPAWN_CRAY_XT free(nid_arr); #endif } }