Example #1
0
void P2PSync<Dtype>::run(shared_ptr<Solver<Dtype> > root,
                         const vector<int>& gpus) {
  // Pair devices for map-reduce synchronization.
  vector<DevicePair> pairs;
  DevicePair::compute(gpus, &pairs);
  // Log the parent:device pairings. pairs[0] is the root entry (no real
  // parent), so logging starts at index 1.
  ostringstream s;
  for (int i = 1; i < pairs.size(); ++i) {
    s << (i == 1 ? "" : ", ") << pairs[i].parent() << ":" << pairs[i].device();
  }
  LOG(INFO)<< "GPUs pairs " << s.str();

  SolverParameter param(root->param());
  vector<shared_ptr<P2PSync<Dtype> > > syncs(gpus.size());
  // Slot 0 wraps the root solver directly; it has no parent sync.
  syncs[0].reset(new P2PSync<Dtype>(root, NULL, param));

  // Build the GPU tree by finding the parent for each solver.
  // A child sync can only be created once its parent sync exists, so the
  // outer loop retries until every slot is filled; pairs.size() passes are
  // always enough because at least one new node attaches per pass.
  for (int attempts = 0; attempts < pairs.size(); ++attempts) {
    for (int i = 1; i < pairs.size(); ++i) {
      if (!syncs[i].get()) {
        P2PSync<Dtype>* parent = NULL;
        // Scan the already-created syncs for the one on the parent device.
        for (int j = 0; j < syncs.size(); ++j) {
          if (syncs[j]) {
            const SolverParameter& p = syncs[j]->solver()->param();
            if (p.device_id() == pairs[i].parent()) {
              // get() already yields P2PSync<Dtype>*; no cast is needed.
              parent = syncs[j].get();
            }
          }
        }
        if (parent) {
          param.set_device_id(pairs[i].device());
          syncs[i].reset(new P2PSync<Dtype>(root, parent, param));
          parent->children_.push_back(syncs[i].get());
        }
      }
    }
  }

  LOG(INFO)<< "Starting Optimization";

  // Worker solvers (indices >= 1) run on their own internal threads.
  for (int i = 1; i < syncs.size(); ++i) {
    syncs[i]->StartInternalThread();
  }

  // Run root solver on current thread
  syncs[0]->solver_->Solve();

  // Block until every worker thread has finished.
  for (int i = 1; i < syncs.size(); ++i) {
    syncs[i]->StopInternalThread();
  }
}
Example #2
0
void P2PSync<Dtype>::Run(const vector<int>& gpus) {
  // One sync object per device; slot 0 corresponds to this (root) instance.
  vector<shared_ptr<P2PSync<Dtype> > > syncs(gpus.size());
  Prepare(gpus, &syncs);

  LOG(INFO)<< "Starting Optimization";

  // Workers (indices >= 1) each run their solver on an internal thread.
  const int count = syncs.size();
  for (int id = 1; id < count; ++id) {
    syncs[id]->StartInternalThread();
  }

  // Run root solver on current thread
  solver_->Solve();

  // Join all worker threads before returning.
  for (int id = 1; id < count; ++id) {
    syncs[id]->StopInternalThread();
  }
}
Example #3
0
void MiniCluster<Dtype>::run(shared_ptr<Solver<Dtype> > root_solver,
                             const vector<int>& gpus,
                             int total_gpus) {
#ifdef INFINIBAND
  // --- InfiniBand / RDMA transport ---
  RDMAAdapter adapter;
  LOG(INFO) << "Found RDMA adapter " << adapter.name();

  // Create channel for each peer (one per node rank, except ourselves).
  vector<shared_ptr<RDMAChannel> > peers(size_);
  for (int i = 0; i < size_; ++i) {
    if (i != rank_) {
      peers[i].reset(new RDMAChannel(adapter));
    }
  }
  // Connect channels all to all. Each iteration gathers every node's
  // address for channel i, and only the node whose turn it is (i == rank_)
  // connects its channels to all the gathered addresses.
  // NOTE(review): assumes AllGather is a collective over all size_ ranks
  // that fills `addresses` with one entry per rank — confirm against its
  // implementation.
  for (int i = 0; i < size_; ++i) {
    vector<string> addresses(1);
    if (i != rank_) {
      addresses[0] = peers[i]->address();
    }
    AllGather(&addresses);
    for (int j = 0; j < addresses.size(); ++j)
      LOG(INFO) << addresses[j];
    if (i == rank_) {
      for (int j = 0; j < size_; ++j) {
        if (j != rank_) {
          peers[j]->Connect(addresses[j]);
        }
      }
    }
  }
  vector<shared_ptr<P2PSync<Dtype> > > syncs(gpus.size());
  // RDMASync will create all necessary buffers
  syncs[0].reset(new RDMASync<Dtype>(root_solver, peers, rank_));
#else
  // --- TCP socket transport ---
  // Create channel for each peer (one per node rank, except ourselves).
  vector<shared_ptr<SocketChannel> > peers(size_);
  for (int i = 0; i < size_; ++i) {
    if (i != rank_) {
      peers[i].reset(new SocketChannel());
    }
  }

  SocketAdapter adapter(&peers);
  // Brief pause to let the adapter's listener come up before exchanging
  // addresses — presumably a startup race workaround; TODO confirm.
  usleep(10000);
  // Get all channels to connect to
  vector<string> addresses(1);
  // Set local address to send to master in AllGather.
  // If you are master, you still need to set it, so
  // that it is sent to everyone during regular broadcast in AllGather
  addresses[0] = adapter.address();
  LOG(INFO) << "Adapter address " << adapter.address().c_str();
  AllGather(&addresses);
  for (int j = 0; j < addresses.size(); ++j)
    LOG(INFO) << "ADDRESS [" << addresses.at(j).c_str() << "]";

  // Connect to all channnels
  for (int j = 0; j < size_; ++j) {
    if (j != rank_) {
      LOG(INFO) << "Connecting to [" << addresses[j].c_str() << "]";
      peers[j]->Connect(addresses[j]);
    }
  }

#ifndef CPU_ONLY
  // GPU build: slot 0 is a SocketSync wrapping the root solver; remaining
  // slots are filled by prepare() below.
  vector<shared_ptr<P2PSync<Dtype> > > syncs(gpus.size());
  syncs[0].reset(new SocketSync<Dtype>(root_solver, peers, rank_));
#else
  // CPU-only build: a single sync object, no per-GPU workers.
  vector<shared_ptr<P2PSyncCPU<Dtype> > > syncs(1);
  syncs[0].reset(new SocketSyncCPU<Dtype>(root_solver, peers, rank_));
#endif
#endif

#ifndef CPU_ONLY
  // Populate syncs[1..n) with per-GPU workers rooted at syncs[0].
  syncs[0]->prepare(gpus, &syncs);
  LOG(INFO)<< "Starting Optimization";

  // Switch to total number of GPUs once the datareaders are ready
  Caffe::set_solver_count(total_gpus);
  for (int i = 1; i < syncs.size(); ++i) {
    syncs[i]->StartInternalThread();
  }

  // Run root solver on current thread
  syncs[0]->solver()->Solve();

  // Join worker threads before returning.
  for (int i = 1; i < syncs.size(); ++i) {
    syncs[i]->StopInternalThread();
  }
#else
  // CPU-only: single solver, no worker threads.
  Caffe::set_solver_count(1);
  LOG(INFO) << "Starting solver...";
  syncs[0]->solver()->Solve();
#endif

}