/**
 * Populates `param` for a distributed-control instance using MPI to exchange
 * the listening addresses of the participating processes.
 *
 * \param param    Initialization parameters to fill in. `machines`,
 *                 `curmachineid`, `numhandlerthreads` and `commtype` are
 *                 overwritten; `initstring` is appended to.
 * \param commtype Requested communication type. Must be TCP_COMM; anything
 *                 else trips the assertion below.
 * \return true when built with HAS_MPI. Without HAS_MPI the function prints
 *         an error to stderr and terminates the process instead of returning.
 */
bool init_param_from_mpi(dc_init_param& param, dc_comm_type commtype) {
#ifdef HAS_MPI
  ASSERT_MSG(commtype == TCP_COMM,
             "MPI initialization only supports TCP at the moment");

  // Look for a free port to use. The helper hands back both the port number
  // and a socket descriptor; the descriptor is forwarded through the init
  // string below (presumably so the comm layer can reuse the reserved socket
  // -- confirm against get_free_tcp_port()).
  std::pair<size_t, int> port_and_sock = get_free_tcp_port();
  size_t port = port_and_sock.first;
  int sock = port_and_sock.second;

  // Build "<ip>:<port>" for this process.
  std::string ipaddr =
      get_local_ip_as_str(mpi_tools::rank() == 0 /* print stuff only if I am master */);
  ipaddr = ipaddr + ":" + tostr(port);

  // now do an allgather: the result lands directly in param.machines.
  logstream(LOG_INFO) << "Will Listen on: " << ipaddr << std::endl;
  mpi_tools::all_gather(ipaddr, param.machines);

  // set defaults
  param.curmachineid = (procid_t)(mpi_tools::rank());
  param.numhandlerthreads = RPC_DEFAULT_NUMHANDLERTHREADS;
  param.commtype = commtype;
  param.initstring = param.initstring + std::string(" __sockhandle__=") +
                     tostr(sock) + " ";
  return true;
#else
  std::cerr << "MPI Support not compiled!" << std::endl;
  // NOTE(review): this is an error path but exits with status 0; kept as-is
  // because changing the exit status is observable to launch scripts.
  exit(0);
#endif
}
/**
 * Populates `param` for a distributed-control instance by rendezvousing with
 * the other processes through Zookeeper.
 *
 * Reads three environment variables:
 *   - ZK_SERVERS  : comma-separated list of Zookeeper hosts
 *   - ZK_JOBNAME  : job name passed to the Zookeeper server list
 *   - ZK_NUMNODES : number of processes to wait for (must be >= 1)
 *
 * The function blocks (with no timeout) until ZK_NUMNODES servers have
 * registered, then sorts the server list so every process derives the same
 * ordering, and uses this process's position in that list as its machine id.
 *
 * \param param Initialization parameters to fill in. `machines`,
 *              `curmachineid`, `numhandlerthreads` and `commtype` are
 *              overwritten; `initstring` is appended to.
 * \return false if any of the three environment variables is unset (in which
 *         case `param` is untouched); true once initialization completes.
 *         Without HAS_ZOOKEEPER the function prints an error to stderr and
 *         terminates the process instead of returning.
 */
bool init_param_from_zookeeper(dc_init_param& param) {
#ifdef HAS_ZOOKEEPER
  char* zk_hosts = getenv("ZK_SERVERS");
  char* zk_jobname = getenv("ZK_JOBNAME");
  char* zk_numnodes = getenv("ZK_NUMNODES");
  if (zk_hosts == NULL || zk_jobname == NULL || zk_numnodes == NULL) {
    return false;
  }

  std::vector<std::string> zk_hosts_list = strsplit(zk_hosts, ",");

  // number of nodes to wait for.
  // Parse into a signed value first: converting a negative atoi() result
  // straight to size_t would yield a huge count that passes the >= 1 check
  // and makes the wait loop below block forever.
  int requested_nodes = atoi(zk_numnodes);
  ASSERT_GE(requested_nodes, 1);
  size_t numnodes = requested_nodes;
  logstream(LOG_EMPH) << "Using Zookeeper for Initialization. Waiting for "
                      << numnodes << " to join" << std::endl;

  // generate a unique identifier for this server: "<ip>:<port>"
  std::pair<size_t, int> port_and_sock = get_free_tcp_port();
  size_t port = port_and_sock.first;
  int sock = port_and_sock.second;

  std::string ipaddr = get_local_ip_as_str(true);
  ipaddr = ipaddr + ":" + tostr(port);
  logstream(LOG_INFO) << "Will Listen on: " << ipaddr << std::endl;

  // register with zookeeper under the identifier built above
  zookeeper_util::server_list server_list(zk_hosts_list, zk_jobname, ipaddr);

  // final server list goes here
  std::vector<std::string> received_servers;
  // locks to protect the final server list
  mutex lock;
  conditional cond;

  // construct the watch to watch for changes on zookeeper.
  // The callback receives references to the list, lock and condition
  // declared above, so it must be cleared before they go out of scope
  // (done at the end of this function).
  server_list.set_callback(boost::bind(zk_callback, _1, _2, _3,
                                       boost::ref(received_servers),
                                       numnodes,
                                       boost::ref(lock),
                                       boost::ref(cond)));
  server_list.join("graphlab");

  lock.lock();
  received_servers = server_list.watch_changes("graphlab");
  // wait until I get all the servers
  // TODO: add a timeout
  while (received_servers.size() < numnodes) cond.wait(lock);
  lock.unlock();
  // done!

  // now make sure that everyone sees the server list in the same order
  ASSERT_EQ(received_servers.size(), numnodes);
  std::sort(received_servers.begin(), received_servers.end());

  // now fill the parameter list
  param.machines = received_servers;
  // Locate ourselves in the sorted list. The range check is done on the
  // full-width index before it is stored, so a "not found" result cannot be
  // hidden by narrowing into the machine-id type.
  size_t my_index = std::find(received_servers.begin(),
                              received_servers.end(),
                              ipaddr) - received_servers.begin();
  ASSERT_LT(my_index, received_servers.size());
  param.curmachineid = my_index;

  param.numhandlerthreads = RPC_DEFAULT_NUMHANDLERTHREADS;
  param.commtype = RPC_DEFAULT_COMMTYPE;
  param.initstring = param.initstring + std::string(" __sockhandle__=") +
                     tostr(sock) + " ";

  // detach from the server list
  // now, this takes advantage of the Zookeeper feature that
  // every machine sees all changes in the same order.
  // i.e. At some point, everyone would have seen a complete server list.
  // Once that happens, everyone can leave.
  server_list.set_callback(NULL);
  server_list.leave("graphlab");
  return true;
#else
  std::cerr << "Zookeeper Support not compiled!" << std::endl;
  // NOTE(review): this is an error path but exits with status 0; kept as-is
  // because changing the exit status is observable to launch scripts.
  exit(0);
#endif
}