Example #1
void MapStatsMapper::_writeStats(HadoopPipes::MapContext& context, const MapStats& stats)
{
  LOG_INFO("node count: " << _nodeCount);
  LOG_INFO("way count: " << _wayCount);
  if (stats.isValid())
  {
    pp::Hdfs fs;
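    // Read this task's partition number and its working output directory from the job configuration.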
    int partition = context.getJobConf()->getInt("mapred.task.partition");
    string workDir = context.getJobConf()->get("mapred.work.output.dir");

    LOG_INFO("Stats: " << stats.toString());

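    // Build a per-partition side file name; the partition number is zero-padded to five digits
    // (e.g. part-00007.stats).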
    QString path = QString("%1/part-%2.stats").
        arg(QString::fromStdString(workDir)).
        arg(partition, 5, 10, QChar('0'));

    LOG_INFO("Writing to: " << path);
    boost::shared_ptr<ostream> osStats(fs.create(path.toStdString()));

    stats.write(*osStats);
  }
  else
  {
    LOG_INFO("Stats are not valid.");
  }
}
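
Note: the QString::arg call above zero-pads the partition number to five digits in base 10, so each map task writes its stats to its own side file. A minimal standalone sketch of just that formatting (the partition value and work directory below are made up for illustration):

#include <QString>
#include <iostream>

int main()
{
  int partition = 7;                                 // hypothetical partition number
  QString workDir = "/tmp/job/_temporary/attempt_0"; // hypothetical work directory
  // Same formatting as in _writeStats: field width 5, base 10, pad character '0'.
  QString path = QString("%1/part-%2.stats")
      .arg(workDir)
      .arg(partition, 5, 10, QChar('0'));
  std::cout << path.toStdString() << std::endl;      // prints .../part-00007.stats
  return 0;
}
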
Example #2
void WayJoinDriver::joinPointsToWays(QString input, QString out)
{
  // create a job
  Job job;

  job.setVerbose(Log::getInstance().getLevel() <= Log::Debug);
  // set the name
  job.setName("WayJoin2 " + input.toStdString());

  // set the input/output
  LOG_INFO("input: " << input);
  job.setInput(input.toStdString());
  job.setOutput(out.toStdString());

  // Pass the min/max values as parameters to the job.
  MapStats stats;
  stats.readDir(input);
  stats.write(job.getConfiguration());
  LOG_INFO("MapStats: " << stats.toString().toStdString());

  // Pass the maximum way size as a parameter to the job.
  job.getConfiguration().setDouble("hoot.max.way.size", _maxWaySize);
  // be nice and don't start the reduce tasks until most of the map tasks are done.
  job.getConfiguration().setDouble("mapred.reduce.slowstart.completed.maps", 0.98);

  job.getConfiguration().setInt(WayJoin2Mapper::elementStatusKey(), _newStatus.getEnum());
  job.getConfiguration().setLong(WayJoin2Mapper::nodeIdDeltaKey(), _nodeIdDelta);
  job.getConfiguration().setLong(WayJoin2Mapper::relationIdDeltaKey(), _relationIdDelta);
  job.getConfiguration().setLong(WayJoin2Mapper::wayIdDeltaKey(), _wayIdDelta);

  // setup the mapper and reducer classes.
  job.setMapperClass(WayJoin2Mapper::className());
  job.setReducerClass(WayJoin2Reducer::className());
  job.setInputFormatClass(WayJoin2InputFormat::className());
  job.setRecordReaderClass(WayJoin2RecordReader::className());
  job.setRecordWriterClass(PbfRecordWriter::className());

  // Add all libraries in the configured library directories to the job.
  job.addLibraryDirs(ConfigOptions().getHootHadoopLibpath());

  job.addFile(ConfPath::search("hoot.json").toStdString());

  // This library will be used to provide mapper/reducer classes and anything else referenced
  // by the factory.
  job.addPlugin(getenv("HOOT_HOME") + string("/lib/libHootHadoop.so.1"));

  _addDefaultJobSettings(job);

  // run the job.
  job.run();
}
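
The settings written to job.getConfiguration() above are read back inside the map tasks through the Hadoop Pipes job configuration, the same API used in Example #1. A minimal sketch of the reading side (the free function below is illustrative, and it assumes elementStatusKey() returns a std::string, as the setInt call above suggests):

// Illustrative only; reads back the keys written by WayJoinDriver::joinPointsToWays.
void readWayJoinSettings(HadoopPipes::MapContext& context)
{
  const HadoopPipes::JobConf* conf = context.getJobConf();
  int elementStatus = conf->getInt(WayJoin2Mapper::elementStatusKey());
  // JobConf::get() returns a string; convert the way size explicitly.
  double maxWaySize = QString::fromStdString(conf->get("hoot.max.way.size")).toDouble();
  LOG_INFO("element status: " << elementStatus << " max way size: " << maxWaySize);
}
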
Example #3
void TileOpDriver::apply(QString in, vector<Envelope> envelopes, double buffer,
  QString out)
{
  // create a job
  pp::Job job;

  job.setVerbose(Log::getInstance().getLevel() <= Log::Debug);
  // set the name
  job.setName("TileOpDriver");

  // be nice and don't start the reduce tasks until most of the map tasks are done.
  job.getConfiguration().setDouble("mapred.reduce.slowstart.completed.maps", 0.98);

  // set the input/output
  pp::Hdfs fs;
  job.setInput(fs.getAbsolutePath(in.toStdString()));
  job.setOutput(fs.getAbsolutePath(out.toStdString()));

  if (_op == 0)
  {
    throw HootException("You must specify an operation.");
  }

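  // Serialize the tile operation and pass it to the reduce tasks as bytes in the job configuration.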
  stringstream ss;
  ObjectOutputStream oos(ss);
  oos.writeObject(*_op);
  oos.flush();
  LOG_INFO("oos size: " << ss.str().size());
  job.getConfiguration().setBytes(TileOpReducer::opKey(), ss.str());

  job.getConfiguration().set(TileOpMapper::envelopesKey(), _toString(envelopes));
  job.getConfiguration().set(TileOpMapper::replacementsKey(),
    fs.getAbsolutePath(in.toStdString()));
  job.getConfiguration().setDouble(TileOpMapper::maxWaySizeKey(), buffer);
  job.getConfiguration().setDouble(TileOpMapper::bufferKey(), buffer);

  // read the max ids from in and write them to the configuration
  MapStats stats;
  stats.readDir(in);
  stats.write(job.getConfiguration());

  // setup the mapper and reducer classes.
  job.setMapperClass(TileOpMapper::className());
  job.setReducerClass(TileOpReducer::className());
  job.setInputFormatClass(PbfInputFormat::className());
  job.setRecordReaderClass(PbfRecordReader::className());
  job.setRecordWriterClass(PbfRecordWriter::className());

  // Add all libraries in the configured library paths to the job.
  job.addLibraryDirs(conf().getList("hoot.hadoop.libpath",
    "${HOOT_HOME}/lib/;${HOOT_HOME}/local/lib/;${HADOOP_HOME}/c++/Linux-amd64-64/lib/;"
    "${HOOT_HOME}/pretty-pipes/lib/"));

  LOG_INFO("Hoot home: " << conf().getString("foo", "${HOOT_HOME}"));

  const std::vector<std::string>& dirs = job.getLibraryDirs();
  for (size_t i = 0; i < dirs.size(); i++)
  {
    LOG_INFO("lib dir: " << dirs[i]);
  }

  job.addFile(ConfPath::search("hoot.json").toStdString());

  // if GDAL isn't installed on all nodes, then we'll need to copy over the projection info.
  QString gdalData = QString(getenv("GDAL_DATA"));
  if (gdalData != "")
  {
    QDir gdalDir(gdalData);
    if (gdalDir.exists() == false)
    {
      LOG_WARN("Could not find GDAL_DATA directory: " << gdalData);
    }
    else
    {
      QStringList filters;
      filters << "*.csv";
      QFileInfoList fil = gdalDir.entryInfoList(filters, QDir::Files);
      for (int i = 0; i < fil.size(); i++)
      {
        LOG_INFO("Adding GDAL_DATA file: " << fil[i].absoluteFilePath());
        job.addFile(fil[i].absoluteFilePath().toStdString());
      }
    }
  }


  // This library will be used to provide mapper/reducer classes and anything else referenced
  // by the factory.
  job.addPlugin(getenv("HOOT_HOME") + string("/lib/libHootHadoop.so.1"));

  // serialize all the configuration settings.
  job.getConfiguration().set(settingsConfKey().toStdString(),
                             conf().toString().toUtf8().constData());

  _addDefaultJobSettings(job);

  QStringList fileDeps = conf().getList(fileDepsKey(), QStringList());
  for (int i = 0; i < fileDeps.size(); i++)
  {
    job.addFile(fileDeps[i].toStdString());
  }

  // conflation runs can go for a _long_ time. Setting timeout to 6 hours.
  job.getConfiguration().setInt("mapred.task.timeout", 6 * 3600 * 1000);

  // run the job.
  job.run();
}
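
For context, a hypothetical call into TileOpDriver::apply, assuming Envelope here is the GEOS geos::geom::Envelope (the caller, paths, and values below are illustrative, not taken from the Hootenanny sources):

#include <vector>
#include <geos/geom/Envelope.h>

void runTileOpExample(TileOpDriver& driver)
{
  // The driver's tile operation (_op) is assumed to have been set already;
  // apply() throws a HootException otherwise.
  std::vector<geos::geom::Envelope> envelopes;
  envelopes.push_back(geos::geom::Envelope(-77.1, -77.0, 38.8, 38.9)); // x1, x2, y1, y2
  double buffer = 0.001; // in map units; apply() also reuses this value as the max way size
  driver.apply("tmp/input.osm.pbf", envelopes, buffer, "tmp/output.osm.pbf");
}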