Exemplo n.º 1
0
/**
 * Fills \p param so that distributed control can be started by a job
 * launched through MPI.
 *
 * Each process obtains a free TCP port (together with the socket handle
 * returned alongside it by get_free_tcp_port()), builds an "ip:port"
 * string for itself, and all-gathers those strings into param.machines.
 * The MPI rank becomes param.curmachineid, and the socket handle is
 * appended to param.initstring as " __sockhandle__=<handle> ".
 *
 * \param param    Initialization parameters to fill in. Any existing
 *                 initstring content is kept and appended to.
 * \param commtype Requested communication type; asserted to be TCP_COMM.
 * \return true on success. When the file is built without HAS_MPI the
 *         function does not return: it reports the problem on stderr and
 *         terminates the process with a failure exit status.
 */
bool init_param_from_mpi(dc_init_param& param,dc_comm_type commtype) {
#ifdef HAS_MPI
    ASSERT_MSG(commtype == TCP_COMM, "MPI initialization only supports TCP at the moment");
    // Look for a free port to use.
    const std::pair<size_t, int> port_and_sock = get_free_tcp_port();
    const size_t port = port_and_sock.first;
    const int sock = port_and_sock.second;

    std::string ipaddr =
        get_local_ip_as_str(mpi_tools::rank() == 0 /* print stuff only if I am master */);
    ipaddr = ipaddr + ":" + tostr(port);
    logstream(LOG_INFO) << "Will Listen on: " << ipaddr << std::endl;

    // All-gather the per-process "ip:port" strings straight into the
    // parameter block.
    mpi_tools::all_gather(ipaddr, param.machines);

    // set defaults
    param.curmachineid = static_cast<procid_t>(mpi_tools::rank());
    param.numhandlerthreads = RPC_DEFAULT_NUMHANDLERTHREADS;
    param.commtype = commtype;
    param.initstring += std::string(" __sockhandle__=") + tostr(sock) + " ";
    return true;
#else
    // This is an error path: exit with a failure status so that launch
    // scripts do not mistake it for a successful run.
    std::cerr << "MPI Support not compiled!" << std::endl;
    exit(EXIT_FAILURE);
#endif
}
Exemplo n.º 2
0
/**
 * Default constructor.
 *
 * Tries each configuration source in a fixed priority order and uses the
 * first one that reports success:
 *   1. init_param_from_env
 *   2. init_param_from_zookeeper
 *   3. init_param_from_mpi (only attempted when mpi_tools::initialized())
 * If none succeeds, a single-machine configuration is built that listens
 * on a free TCP port on localhost. The resulting parameters are handed to
 * init(), after which the receive/dispatch tracers are initialized.
 */
distributed_control::distributed_control() {
  dc_init_param params;

  // Probe the sources one at a time; later probes only run while nothing
  // has succeeded yet.
  bool configured = init_param_from_env(params);
  if (configured) {
    logstream(LOG_INFO) << "Distributed Control Initialized from Environment" << std::endl;
  }

  if (!configured) {
    configured = init_param_from_zookeeper(params);
    if (configured) {
      logstream(LOG_INFO) << "Distributed Control Initialized from Zookeeper" << std::endl;
    }
  }

  if (!configured) {
    configured = mpi_tools::initialized() && init_param_from_mpi(params);
    if (configured) {
      logstream(LOG_INFO) << "Distributed Control Initialized from MPI" << std::endl;
    }
  }

  if (!configured) {
    logstream(LOG_INFO) << "Shared Memory Execution" << std::endl;

    // Single-machine fallback: grab a free port and the socket handle
    // that comes with it.
    std::pair<size_t, int> local_endpoint = get_free_tcp_port();
    size_t listen_port = local_endpoint.first;
    int listen_sock = local_endpoint.second;

    params.machines.push_back(std::string("localhost:") + tostr(listen_port));
    params.curmachineid = 0;
    params.initstring = std::string(" __sockhandle__=") + tostr(listen_sock) + " ";
    params.numhandlerthreads = RPC_DEFAULT_NUMHANDLERTHREADS;
    params.commtype = RPC_DEFAULT_COMMTYPE;
  }

  init(params.machines,
       params.initstring,
       params.curmachineid,
       params.numhandlerthreads,
       params.commtype);

  INITIALIZE_TRACER(dc_receive_queuing, "dc: time spent on enqueue");
  INITIALIZE_TRACER(dc_receive_multiplexing, "dc: time spent exploding a chunk");
  INITIALIZE_TRACER(dc_call_dispatch, "dc: time spent issuing RPC calls");
}
/**
 * Fills \p param using Zookeeper as the rendezvous point.
 *
 * Zookeeper initialization is requested through three environment
 * variables, all of which must be set:
 *   - ZK_SERVERS  : comma-separated list of Zookeeper hosts
 *   - ZK_JOBNAME  : job name passed to the Zookeeper server list
 *   - ZK_NUMNODES : number of nodes to wait for (must be >= 1)
 *
 * The process registers its own "ip:port" under the "graphlab" key,
 * waits until ZK_NUMNODES servers are listed, sorts the list so every
 * process agrees on the ordering, and uses its own position in the sorted
 * list as param.curmachineid. The socket handle obtained together with
 * the port is appended to param.initstring as " __sockhandle__=<handle> ".
 *
 * \param param Initialization parameters to fill in. Any existing
 *              initstring content is kept and appended to.
 * \return false if any of the environment variables is missing (Zookeeper
 *         was not requested), true once the parameters have been filled.
 *         If Zookeeper was requested but the file was built without
 *         HAS_ZOOKEEPER, the function reports the problem on stderr and
 *         terminates the process with a failure exit status.
 */
bool init_param_from_zookeeper(dc_init_param& param) {
  // The "was Zookeeper requested?" check is independent of whether
  // support is compiled in, so callers that merely probe this function
  // get `false` instead of a process exit on builds without Zookeeper.
  const char* zk_hosts = getenv("ZK_SERVERS");
  const char* zk_jobname = getenv("ZK_JOBNAME");
  const char* zk_numnodes = getenv("ZK_NUMNODES");
  if (zk_hosts == NULL || zk_jobname == NULL || zk_numnodes == NULL) {
    return false;
  }

#ifdef HAS_ZOOKEEPER
  std::vector<std::string> zk_hosts_list = strsplit(zk_hosts, ",");

  // number of nodes to wait for.
  // Validate as a signed value first: storing a negative atoi() result
  // directly in a size_t would wrap to a huge count and wait forever.
  const int requested_nodes = atoi(zk_numnodes);
  ASSERT_GE(requested_nodes, 1);
  const size_t numnodes = static_cast<size_t>(requested_nodes);
  logstream(LOG_EMPH) << "Using Zookeeper for Initialization. Waiting for "
                      << numnodes << " to join" << std::endl;

  // generate a unique identifier for this server: "ip:port"
  std::pair<size_t, int> port_and_sock = get_free_tcp_port();
  size_t port = port_and_sock.first;
  int sock = port_and_sock.second;
  std::string ipaddr = get_local_ip_as_str(true);
  ipaddr = ipaddr + ":" + tostr(port);
  logstream(LOG_INFO) << "Will Listen on: " << ipaddr << std::endl;

  // connect to the Zookeeper server list under this job's name
  zookeeper_util::server_list server_list(zk_hosts_list,
                                     zk_jobname,
                                     ipaddr);

  // final server list goes here
  std::vector<std::string> received_servers;
  // lock and condition variable protecting the final server list
  mutex lock;
  conditional cond;

  // construct the watch to watch for changes on zookeeper.
  // NOTE(review): zk_callback is defined elsewhere; presumably it updates
  // received_servers under `lock` and signals `cond` -- confirm.
  server_list.set_callback(boost::bind(zk_callback,
                                       _1,
                                       _2,
                                       _3,
                                       boost::ref(received_servers),
                                       numnodes,
                                       boost::ref(lock),
                                       boost::ref(cond)));

  server_list.join("graphlab");

  lock.lock();
  received_servers = server_list.watch_changes("graphlab");
  // wait until I get all the servers
  // TODO: add a timeout
  while(received_servers.size() < numnodes) cond.wait(lock);
  lock.unlock();

  // done!
  // now make sure that everyone sees the server list in the same order

  ASSERT_EQ(received_servers.size(), numnodes);
  std::sort(received_servers.begin(), received_servers.end());

  // now fill the parameter list
  param.machines = received_servers;
  // this machine's id is its position in the sorted list
  param.curmachineid = std::find(received_servers.begin(), received_servers.end(),
                                 ipaddr) - received_servers.begin();
  ASSERT_LT(param.curmachineid, received_servers.size());
  param.numhandlerthreads = RPC_DEFAULT_NUMHANDLERTHREADS;
  param.commtype = RPC_DEFAULT_COMMTYPE;
  param.initstring = param.initstring + std::string(" __sockhandle__=") + tostr(sock) + " ";
  // detach from the server list
  // now, this takes advantage of the Zookeeper feature that
  // every machine sees all changes in the same order.
  // i.e. At some point, everyone would have seen a complete server list.
  // Once that happens, everyone can leave.
  server_list.set_callback(NULL);
  server_list.leave("graphlab");

  return true;
#else
  // Zookeeper was explicitly requested through the environment but this
  // build cannot honour it: report the problem and exit with a failure
  // status so that launch scripts do not mistake it for success.
  std::cerr << "Zookeeper Support not compiled!" << std::endl;
  exit(EXIT_FAILURE);
#endif
}