int mca_btl_tcp_component_open(void) { char* message; #ifdef __WINDOWS__ WSADATA win_sock_data; if( WSAStartup(MAKEWORD(2,2), &win_sock_data) != 0 ) { BTL_ERROR(("failed to initialise windows sockets:%d", WSAGetLastError())); return OMPI_ERROR; } #endif /* initialize state */ mca_btl_tcp_component.tcp_listen_sd = -1; #if OPAL_WANT_IPV6 mca_btl_tcp_component.tcp6_listen_sd = -1; #endif mca_btl_tcp_component.tcp_num_btls=0; mca_btl_tcp_component.tcp_addr_count = 0; mca_btl_tcp_component.tcp_btls=NULL; /* initialize objects */ OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_lock, opal_mutex_t); OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_procs, opal_hash_table_t); OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_events, opal_list_t); OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_frag_eager, ompi_free_list_t); OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_frag_max, ompi_free_list_t); OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_frag_user, ompi_free_list_t); opal_hash_table_init(&mca_btl_tcp_component.tcp_procs, 256); /* register TCP component parameters */ mca_btl_tcp_component.tcp_num_links = mca_btl_tcp_param_register_int("links", NULL, 1); mca_btl_tcp_component.tcp_if_include = mca_btl_tcp_param_register_string("if_include", "Comma-delimited list of devices or CIDR notation of networks to use for MPI communication (e.g., \"eth0,eth1\" or \"192.168.0.0/16,10.1.4.0/24\"). Mutually exclusive with btl_tcp_if_exclude.", ""); mca_btl_tcp_component.tcp_if_exclude = mca_btl_tcp_param_register_string("if_exclude", "Comma-delimited list of devices or CIDR notation of networks to NOT use for MPI communication -- all devices not matching these specifications will be used (e.g., \"eth0,eth1\" or \"192.168.0.0/16,10.1.4.0/24\"). Mutually exclusive with btl_tcp_if_include.", "lo,sppp"); mca_btl_tcp_component.tcp_free_list_num = mca_btl_tcp_param_register_int ("free_list_num", NULL, 8); mca_btl_tcp_component.tcp_free_list_max = mca_btl_tcp_param_register_int ("free_list_max", NULL, -1); mca_btl_tcp_component.tcp_free_list_inc = mca_btl_tcp_param_register_int ("free_list_inc", NULL, 32); mca_btl_tcp_component.tcp_sndbuf = mca_btl_tcp_param_register_int ("sndbuf", NULL, 128*1024); mca_btl_tcp_component.tcp_rcvbuf = mca_btl_tcp_param_register_int ("rcvbuf", NULL, 128*1024); mca_btl_tcp_component.tcp_endpoint_cache = mca_btl_tcp_param_register_int ("endpoint_cache", "The size of the internal cache for each TCP connection. This cache is" " used to reduce the number of syscalls, by replacing them with memcpy." " Every read will read the expected data plus the amount of the" " endpoint_cache", 30*1024); mca_btl_tcp_component.tcp_use_nodelay = !mca_btl_tcp_param_register_int ("use_nagle", "Whether to use Nagle's algorithm or not (using Nagle's algorithm may increase short message latency)", 0); mca_btl_tcp_component.tcp_port_min = mca_btl_tcp_param_register_int( "port_min_v4", "The minimum port where the TCP BTL will try to bind (default 1024)", 1024 ); if( mca_btl_tcp_component.tcp_port_min > USHRT_MAX ) { orte_show_help("help-mpi-btl-tcp.txt", "invalid minimum port", true, "v4", orte_process_info.nodename, mca_btl_tcp_component.tcp_port_min ); mca_btl_tcp_component.tcp_port_min = 1024; } asprintf( &message, "The number of ports where the TCP BTL will try to bind (default %d)." " This parameter together with the port min, define a range of ports" " where Open MPI will open sockets.", (0x1 << 16) - mca_btl_tcp_component.tcp_port_min - 1 ); mca_btl_tcp_component.tcp_port_range = mca_btl_tcp_param_register_int( "port_range_v4", message, (0x1 << 16) - mca_btl_tcp_component.tcp_port_min - 1); free(message); #if OPAL_WANT_IPV6 mca_btl_tcp_component.tcp6_port_min = mca_btl_tcp_param_register_int( "port_min_v6", "The minimum port where the TCP BTL will try to bind (default 1024)", 1024 ); if( mca_btl_tcp_component.tcp6_port_min > USHRT_MAX ) { orte_show_help("help-mpi-btl-tcp.txt", "invalid minimum port", true, "v6", orte_process_info.nodename, mca_btl_tcp_component.tcp6_port_min ); mca_btl_tcp_component.tcp6_port_min = 1024; } asprintf( &message, "The number of ports where the TCP BTL will try to bind (default %d)." " This parameter together with the port min, define a range of ports" " where Open MPI will open sockets.", (0x1 << 16) - mca_btl_tcp_component.tcp6_port_min - 1 ); mca_btl_tcp_component.tcp6_port_range = mca_btl_tcp_param_register_int( "port_range_v6", message, (0x1 << 16) - mca_btl_tcp_component.tcp6_port_min - 1); free(message); #endif mca_btl_tcp_module.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_LOW + 100; mca_btl_tcp_module.super.btl_eager_limit = 64*1024; mca_btl_tcp_module.super.btl_rndv_eager_limit = 64*1024; mca_btl_tcp_module.super.btl_max_send_size = 128*1024; mca_btl_tcp_module.super.btl_rdma_pipeline_send_length = 128*1024; mca_btl_tcp_module.super.btl_rdma_pipeline_frag_size = INT_MAX; mca_btl_tcp_module.super.btl_min_rdma_pipeline_size = 0; mca_btl_tcp_module.super.btl_flags = MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_SEND_INPLACE | MCA_BTL_FLAGS_NEED_CSUM | MCA_BTL_FLAGS_NEED_ACK | MCA_BTL_FLAGS_HETEROGENEOUS_RDMA; mca_btl_tcp_module.super.btl_bandwidth = 100; mca_btl_tcp_module.super.btl_latency = 100; mca_btl_base_param_register(&mca_btl_tcp_component.super.btl_version, &mca_btl_tcp_module.super); mca_btl_tcp_component.tcp_disable_family = mca_btl_tcp_param_register_int ("disable_family", NULL, 0); /* Register a list of interfaces to use in sequence */ message = mca_btl_tcp_param_register_string("if_seq", "If specified, a comma-delimited list of TCP interfaces. Interfaces will be assigned, one to each MPI process, in a round-robin fashion on each server. For example, if the list is \"eth0,eth1\" and four MPI processes are run on a single server, then local ranks 0 and 2 will use eth0 and local ranks 1 and 3 will use eth1.", NULL); mca_btl_tcp_component.tcp_if_seq = NULL; if (NULL != message && '\0' != *message) { char **argv = opal_argv_split(message, ','); if (NULL != argv && '\0' != *(argv[0])) { int if_index, rc, count; orte_node_rank_t node_rank; char name[256]; node_rank = orte_ess.get_node_rank(ORTE_PROC_MY_NAME); /* Now that we've got that local rank, take the corresponding entry from the tcp_if_seq list (wrapping if necessary) */ count = opal_argv_count(argv); mca_btl_tcp_component.tcp_if_seq = strdup(argv[node_rank % count]); opal_argv_free(argv); /* Double check that the selected interface actually exists */ for (if_index = opal_ifbegin(); if_index >= 0; if_index = opal_ifnext(if_index)){ if (OPAL_SUCCESS != (rc = opal_ifindextoname(if_index, name, sizeof(name)))) { return rc; } if (0 == strcmp(name, mca_btl_tcp_component.tcp_if_seq)) { break; } } if (if_index < 0) { orte_show_help("help-mpi-btl-tcp.txt", "invalid if_inexclude", true, "if_seq", orte_process_info.nodename, mca_btl_tcp_component.tcp_if_seq, "Interface does not exist"); return OMPI_ERR_BAD_PARAM; } BTL_VERBOSE(("Node rank %d using TCP interface %s", node_rank, mca_btl_tcp_component.tcp_if_seq)); } } return OMPI_SUCCESS; }
static int mca_btl_tcp_component_register(void) { char* message; /* register TCP component parameters */ mca_btl_tcp_param_register_uint("links", NULL, 1, OPAL_INFO_LVL_4, &mca_btl_tcp_component.tcp_num_links); mca_btl_tcp_param_register_string("if_include", "Comma-delimited list of devices and/or CIDR notation of networks to use for MPI communication (e.g., \"eth0,192.168.0.0/16\"). Mutually exclusive with btl_tcp_if_exclude.", "", OPAL_INFO_LVL_1, &mca_btl_tcp_component.tcp_if_include); mca_btl_tcp_param_register_string("if_exclude", "Comma-delimited list of devices and/or CIDR notation of networks to NOT use for MPI communication -- all devices not matching these specifications will be used (e.g., \"eth0,192.168.0.0/16\"). If set to a non-default value, it is mutually exclusive with btl_tcp_if_include.", "127.0.0.1/8,sppp", OPAL_INFO_LVL_1, &mca_btl_tcp_component.tcp_if_exclude); mca_btl_tcp_param_register_int ("free_list_num", NULL, 8, OPAL_INFO_LVL_5, &mca_btl_tcp_component.tcp_free_list_num); mca_btl_tcp_param_register_int ("free_list_max", NULL, -1, OPAL_INFO_LVL_5, &mca_btl_tcp_component.tcp_free_list_max); mca_btl_tcp_param_register_int ("free_list_inc", NULL, 32, OPAL_INFO_LVL_5, &mca_btl_tcp_component.tcp_free_list_inc); mca_btl_tcp_param_register_int ("sndbuf", NULL, 128*1024, OPAL_INFO_LVL_4, &mca_btl_tcp_component.tcp_sndbuf); mca_btl_tcp_param_register_int ("rcvbuf", NULL, 128*1024, OPAL_INFO_LVL_4, &mca_btl_tcp_component.tcp_rcvbuf); mca_btl_tcp_param_register_int ("endpoint_cache", "The size of the internal cache for each TCP connection. This cache is" " used to reduce the number of syscalls, by replacing them with memcpy." " Every read will read the expected data plus the amount of the" " endpoint_cache", 30*1024, OPAL_INFO_LVL_4, &mca_btl_tcp_component.tcp_endpoint_cache); mca_btl_tcp_param_register_int ("use_nagle", "Whether to use Nagle's algorithm or not (using Nagle's algorithm may increase short message latency)", 0, OPAL_INFO_LVL_4, &mca_btl_tcp_component.tcp_not_use_nodelay); mca_btl_tcp_param_register_int( "port_min_v4", "The minimum port where the TCP BTL will try to bind (default 1024)", 1024, OPAL_INFO_LVL_2, &mca_btl_tcp_component.tcp_port_min); asprintf( &message, "The number of ports where the TCP BTL will try to bind (default %d)." " This parameter together with the port min, define a range of ports" " where Open MPI will open sockets.", (0x1 << 16) - mca_btl_tcp_component.tcp_port_min - 1 ); mca_btl_tcp_param_register_int( "port_range_v4", message, (0x1 << 16) - mca_btl_tcp_component.tcp_port_min - 1, OPAL_INFO_LVL_2, &mca_btl_tcp_component.tcp_port_range); free(message); #if OPAL_WANT_IPV6 mca_btl_tcp_param_register_int( "port_min_v6", "The minimum port where the TCP BTL will try to bind (default 1024)", 1024, OPAL_INFO_LVL_2, & mca_btl_tcp_component.tcp6_port_min ); asprintf( &message, "The number of ports where the TCP BTL will try to bind (default %d)." " This parameter together with the port min, define a range of ports" " where Open MPI will open sockets.", (0x1 << 16) - mca_btl_tcp_component.tcp6_port_min - 1 ); mca_btl_tcp_param_register_int( "port_range_v6", message, (0x1 << 16) - mca_btl_tcp_component.tcp6_port_min - 1, OPAL_INFO_LVL_2, &mca_btl_tcp_component.tcp6_port_range ); free(message); #endif mca_btl_tcp_module.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_LOW + 100; mca_btl_tcp_module.super.btl_eager_limit = 64*1024; mca_btl_tcp_module.super.btl_rndv_eager_limit = 64*1024; mca_btl_tcp_module.super.btl_max_send_size = 128*1024; mca_btl_tcp_module.super.btl_rdma_pipeline_send_length = 128*1024; mca_btl_tcp_module.super.btl_rdma_pipeline_frag_size = INT_MAX; mca_btl_tcp_module.super.btl_min_rdma_pipeline_size = 0; mca_btl_tcp_module.super.btl_flags = MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_SEND_INPLACE | MCA_BTL_FLAGS_NEED_CSUM | MCA_BTL_FLAGS_NEED_ACK | MCA_BTL_FLAGS_HETEROGENEOUS_RDMA; mca_btl_tcp_module.super.btl_seg_size = sizeof (mca_btl_base_segment_t); mca_btl_tcp_module.super.btl_bandwidth = 100; mca_btl_tcp_module.super.btl_latency = 100; mca_btl_base_param_register(&mca_btl_tcp_component.super.btl_version, &mca_btl_tcp_module.super); mca_btl_tcp_param_register_int ("disable_family", NULL, 0, OPAL_INFO_LVL_2, &mca_btl_tcp_component.tcp_disable_family); /* Register a list of interfaces to use in sequence */ mca_btl_tcp_param_register_string("if_seq", "If specified, a comma-delimited list of TCP interfaces. Interfaces will be assigned, one to each MPI process, in a round-robin fashion on each server. For example, if the list is \"eth0,eth1\" and four MPI processes are run on a single server, then local ranks 0 and 2 will use eth0 and local ranks 1 and 3 will use eth1.", NULL, OPAL_INFO_LVL_9, &mca_btl_tcp_if_seq_string); mca_btl_tcp_component.tcp_if_seq = NULL; if (NULL != mca_btl_tcp_if_seq_string && '\0' != *mca_btl_tcp_if_seq_string) { char **argv = opal_argv_split(mca_btl_tcp_if_seq_string, ','); if (NULL != argv && '\0' != *(argv[0])) { int if_index, rc, count; ompi_node_rank_t node_rank; char name[256]; node_rank = ompi_process_info.my_node_rank; /* Now that we've got that local rank, take the corresponding entry from the tcp_if_seq list (wrapping if necessary) */ count = opal_argv_count(argv); mca_btl_tcp_component.tcp_if_seq = strdup(argv[node_rank % count]); opal_argv_free(argv); /* Double check that the selected interface actually exists */ for (if_index = opal_ifbegin(); if_index >= 0; if_index = opal_ifnext(if_index)){ if (OPAL_SUCCESS != (rc = opal_ifindextoname(if_index, name, sizeof(name)))) { return rc; } if (0 == strcmp(name, mca_btl_tcp_component.tcp_if_seq)) { break; } } if (if_index < 0) { opal_show_help("help-mpi-btl-tcp.txt", "invalid if_inexclude", true, "if_seq", ompi_process_info.nodename, mca_btl_tcp_component.tcp_if_seq, "Interface does not exist"); free(mca_btl_tcp_component.tcp_if_seq); mca_btl_tcp_component.tcp_if_seq = NULL; } else { BTL_VERBOSE(("Node rank %d using TCP interface %s", node_rank, mca_btl_tcp_component.tcp_if_seq)); } } } return mca_btl_tcp_component_verify(); }
int mca_btl_tcp_component_open(void) { char* message; #ifdef __WINDOWS__ WSADATA win_sock_data; if( WSAStartup(MAKEWORD(2,2), &win_sock_data) != 0 ) { BTL_ERROR(("failed to initialise windows sockets:%d", WSAGetLastError())); return OMPI_ERROR; } #endif /* initialize state */ mca_btl_tcp_component.tcp_listen_sd = -1; #if OPAL_WANT_IPV6 mca_btl_tcp_component.tcp6_listen_sd = -1; #endif mca_btl_tcp_component.tcp_num_btls=0; mca_btl_tcp_component.tcp_addr_count = 0; mca_btl_tcp_component.tcp_btls=NULL; /* initialize objects */ OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_lock, opal_mutex_t); OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_procs, opal_hash_table_t); OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_events, opal_list_t); OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_frag_eager, ompi_free_list_t); OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_frag_max, ompi_free_list_t); OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_frag_user, ompi_free_list_t); opal_hash_table_init(&mca_btl_tcp_component.tcp_procs, 256); /* register TCP component parameters */ mca_btl_tcp_component.tcp_num_links = mca_btl_tcp_param_register_int("links", NULL, 1); mca_btl_tcp_component.tcp_if_include = mca_btl_tcp_param_register_string("if_include", NULL, ""); mca_btl_tcp_component.tcp_if_exclude = mca_btl_tcp_param_register_string("if_exclude", NULL, "lo"); mca_btl_tcp_component.tcp_free_list_num = mca_btl_tcp_param_register_int ("free_list_num", NULL, 8); mca_btl_tcp_component.tcp_free_list_max = mca_btl_tcp_param_register_int ("free_list_max", NULL, -1); mca_btl_tcp_component.tcp_free_list_inc = mca_btl_tcp_param_register_int ("free_list_inc", NULL, 32); mca_btl_tcp_component.tcp_sndbuf = mca_btl_tcp_param_register_int ("sndbuf", NULL, 128*1024); mca_btl_tcp_component.tcp_rcvbuf = mca_btl_tcp_param_register_int ("rcvbuf", NULL, 128*1024); mca_btl_tcp_component.tcp_endpoint_cache = mca_btl_tcp_param_register_int ("endpoint_cache", "The size of the internal cache for each TCP connection. This cache is" " used to reduce the number of syscalls, by replacing them with memcpy." " Every read will read the expected data plus the amount of the" " endpoint_cache", 30*1024); mca_btl_tcp_component.tcp_use_nodelay = !mca_btl_tcp_param_register_int ("use_nagle", "Whether to use Nagle's algorithm or not (using Nagle's algorithm may increase short message latency)", 0); mca_btl_tcp_component.tcp_port_min = mca_btl_tcp_param_register_int( "port_min_v4", "The minimum port where the TCP BTL will try to bind (default 1024)", 1024 ); if( mca_btl_tcp_component.tcp_port_min > USHRT_MAX ) { orte_show_help("help-mpi-btl-tcp.txt", "invalid minimum port", true, "v4", orte_process_info.nodename, mca_btl_tcp_component.tcp_port_min ); mca_btl_tcp_component.tcp_port_min = 1024; } asprintf( &message, "The number of ports where the TCP BTL will try to bind (default %d)." " This parameter together with the port min, define a range of ports" " where Open MPI will open sockets.", (0x1 << 16) - mca_btl_tcp_component.tcp_port_min - 1 ); mca_btl_tcp_component.tcp_port_range = mca_btl_tcp_param_register_int( "port_range_v4", message, (0x1 << 16) - mca_btl_tcp_component.tcp_port_min - 1); free(message); #if OPAL_WANT_IPV6 mca_btl_tcp_component.tcp6_port_min = mca_btl_tcp_param_register_int( "port_min_v6", "The minimum port where the TCP BTL will try to bind (default 1024)", 1024 ); if( mca_btl_tcp_component.tcp6_port_min > USHRT_MAX ) { orte_show_help("help-mpi-btl-tcp.txt", "invalid minimum port", true, "v6", orte_process_info.nodename, mca_btl_tcp_component.tcp6_port_min ); mca_btl_tcp_component.tcp6_port_min = 1024; } asprintf( &message, "The number of ports where the TCP BTL will try to bind (default %d)." " This parameter together with the port min, define a range of ports" " where Open MPI will open sockets.", (0x1 << 16) - mca_btl_tcp_component.tcp6_port_min - 1 ); mca_btl_tcp_component.tcp6_port_range = mca_btl_tcp_param_register_int( "port_range_v6", message, (0x1 << 16) - mca_btl_tcp_component.tcp6_port_min - 1); free(message); #endif mca_btl_tcp_module.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_LOW + 100; mca_btl_tcp_module.super.btl_eager_limit = 64*1024; mca_btl_tcp_module.super.btl_rndv_eager_limit = 64*1024; mca_btl_tcp_module.super.btl_max_send_size = 128*1024; mca_btl_tcp_module.super.btl_rdma_pipeline_send_length = 128*1024; mca_btl_tcp_module.super.btl_rdma_pipeline_frag_size = INT_MAX; mca_btl_tcp_module.super.btl_min_rdma_pipeline_size = 0; mca_btl_tcp_module.super.btl_flags = MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_SEND_INPLACE | MCA_BTL_FLAGS_NEED_CSUM | MCA_BTL_FLAGS_NEED_ACK | MCA_BTL_FLAGS_HETEROGENEOUS_RDMA; mca_btl_tcp_module.super.btl_bandwidth = 100; mca_btl_tcp_module.super.btl_latency = 100; mca_btl_base_param_register(&mca_btl_tcp_component.super.btl_version, &mca_btl_tcp_module.super); mca_btl_tcp_component.tcp_disable_family = mca_btl_tcp_param_register_int ("disable_family", NULL, 0); return OMPI_SUCCESS; }