Example #1
0
int run(int argc, char** argv)
{
  auto configDirs = buildConfigurationDirectoriesList(QStringList({"lima"}),
                                                      QStringList());
  auto configPath = configDirs.join(LIMA_PATH_SEPARATOR);

  auto resourcesDirs = buildResourcesDirectoriesList(QStringList({"lima"}),
                                                     QStringList());
  auto resourcesPath = resourcesDirs.join(LIMA_PATH_SEPARATOR);

  QsLogging::initQsLog(configPath);
  // Necessary to initialize factories
  Lima::AmosePluginsManager::single();
  if (!Lima::AmosePluginsManager::changeable().loadPlugins(configPath))
  {
    throw InvalidConfiguration("loadLibrary method failed.");
  }
  //   std::cerr << "Amose plugins initialized" << std::endl;

  std::string strResourcesPath;
  std::string lpConfigFile;
  std::string commonConfigFile;
  std::string clientId;
  std::vector<std::string> languages;
  std::vector<std::string> dumpersv;
  std::vector<std::string> outputsv;
  std::string pipeline;
  std::vector<std::string> files;
  std::vector<std::string> vinactiveUnits;
  std::string meta;
  std::string splitMode;
  std::string strConfigPath;


  po::options_description desc("Usage");
  desc.add_options()
  ("help,h", "Display this help message")
  ("version,v",
   QString::fromUtf8("Shows LIMA version: %1.").arg(LIMA_VERSION).toUtf8().constData())
  ("language,l", po::value< std::vector<std::string> >(&languages),
   "supported languages trigrams")
  ("dumper,d",
   po::value< std::vector<std::string> >(&dumpersv),
   "a dumper to use, can be repeated. Valid values are [bow (BowTextWriter),bowh (BowTextHandler),fullxml (SimpleStreamHandler),text (SimpleStreamHandler), event (EventHandler)]. To use any of them, the corresponding dumper must be available in the pipeline configuration. Default is bow but must be set if any other is set")
  ("output,o",
   po::value< std::vector<std::string> >(&outputsv),
   "where to write dumpers output. By default, each dumper writes its results on a file whose name is the input file with a predefined suffix  appended. This option allows to chose another suffix or to write on standard output. Its syntax  is the following: <dumper>:<destination> with <dumper> a  dumper name and destination, either the value 'stdout' or a suffix.")
  ("mm-core-client",
   po::value<std::string>(&clientId)->default_value("lima-coreclient"),
   "Set the linguistic processing client to use")
  ("resources-dir", po::value<std::string>(&strResourcesPath),
   "Set the directory containing the LIMA linguistic resources")
  ("config-dir", po::value<std::string>(&strConfigPath),
   "Set the directory containing the (LIMA) configuration files")
  ("common-config-file",
   po::value<std::string>(&commonConfigFile)->default_value("lima-common.xml"),
   "Set the LIMA common libraries configuration file to use")
  ("lp-config-file",
   po::value<std::string>(&lpConfigFile)->default_value("lima-analysis.xml"),
   "Set the linguistic processing configuration file to use")
  ("pipeline,p",
   po::value< std::string >(&pipeline)->default_value("main"),
   "Set the linguistic analysis pipeline to use")
  ("input-file",
   po::value< std::vector<std::string> >(&files),
   "Set a text file to analyze")
  ("inactive-units",
   po::value< std::vector<std::string> >(&vinactiveUnits),
   "Inactive some process units of the used pipeline")
  ("availableUnits",
   "Ask the program to list its known processing units")
  ("meta",
   po::value< std::string >(&meta),
   "Sets metadata values, in the format data1:value1,data2:value2,...")
  ("split-mode,s",
   po::value< std::string >(&splitMode)->default_value("none"),
   "Split input files depending on this value and analyze each part independently. Possible values are 'none' (default) and 'lines' to split on each line break. Later, 'para' will be added to split on paragraphs (empty lines). For values different of 'none', dumpers should probably be on append mode.")
  ;

  po::positional_options_description p;
  p.add("input-file", -1);

  po::variables_map vm;
  try
  {
    po::store(po::command_line_parser(argc, argv).
    options(desc).positional(p).run(), vm);
    po::notify(vm);
  }
  catch (const boost::program_options::unknown_option& e)
  {
    std::cerr << e.what() << std::endl;
    return 1;
  }
  if (vm.count("help"))
  {
    std::cout << desc << std::endl;
    return SUCCESS_ID;
  }
  else if (vm.count("version"))
  {
    std::cout << LIMA_VERSION << std::endl;
    return SUCCESS_ID;
  }

  if (!strResourcesPath.empty())
  {
    resourcesPath = QString::fromUtf8(strResourcesPath.c_str());
    resourcesDirs = resourcesPath.split(LIMA_PATH_SEPARATOR);
  }
  if (!strConfigPath.empty())
  {
    configPath = QString::fromUtf8(strConfigPath.c_str());
    configDirs = configPath.split(LIMA_PATH_SEPARATOR);
  }
  std::deque<std::string> langs(languages.size());
  std::copy(languages.begin(), languages.end(), langs.begin());

  std::set<std::string> dumpers;
  if (dumpersv.empty())
  {
    dumpers.insert("text");
  }
  else
  {
    for (uint64_t i = 0; i < dumpersv.size(); i++)
    {
      dumpers.insert(dumpersv[i]);
    }
  }

  if (vm.count("availableUnits"))
  {
    listunits();
    return SUCCESS_ID;
  }
  if (langs.size()<1)
  {
    std::cerr << "no language defined !" << std::endl;
    return 1;
  }

  QMap< QString, QString > outputs;
  for(std::vector<std::string>::const_iterator outputsIt = outputsv.begin();
      outputsIt != outputsv.end(); outputsIt++)
  {
    QStringList output = QString::fromUtf8((*outputsIt).c_str()).split(":");
    if (output.size()==2)
    {
      outputs[output[0]] = output[1];
    }
    else
    {
      // Option syntax  error
      std::cerr << "syntax error in output setting:" << *outputsIt << std::endl;
    }
  }
  std::vector<std::pair<std::string,std::string> > userMetaData;
  // parse 'meta' argument to add metadata
  if(!meta.empty())
  {
    std::string metaString(meta);
    std::string::size_type k=0;
    do
    {
      k=metaString.find(",");
      //if (k==std::string::npos) continue;
      std::string str(metaString,0,k);
      std::string::size_type i=str.find(":");
      if (i==std::string::npos)
      {
        std::cerr << "meta argument '"<< str
                  << "' is not of the form XXX:YYY: ignored" << std::endl;
      }
      else
      {
        //std::cout << "add metadata " << std::string(str,0,i) << "=>" << std::string(str,i+1) << std::endl;
        userMetaData.push_back(std::make_pair(std::string(str,0,i),
                                              std::string(str,i+1)));
      }
      if (k!=std::string::npos)
      {
        metaString=std::string(metaString,k+1);
      }
    }
    while (k!=std::string::npos);
  }

  std::set<std::string> inactiveUnits;
  for (const auto & inactiveUnit : vinactiveUnits)
  {
    inactiveUnits.insert(inactiveUnit);
  }
  std::deque<std::string> pipelines;

  pipelines.push_back(pipeline);

  uint64_t beginTime=TimeUtils::getCurrentTime();

  // initialize common
  Common::MediaticData::MediaticData::changeable().init(
    resourcesPath.toUtf8().constData(),
    configPath.toUtf8().constData(),
    commonConfigFile,
    langs);

  bool clientFactoryConfigured = false;
  Q_FOREACH(QString configDir, configDirs)
  {
    if (QFileInfo::exists(configDir + "/" + lpConfigFile.c_str()))
    {
      // initialize linguistic processing
      Lima::Common::XMLConfigurationFiles::XMLConfigurationFileParser lpconfig(
          (configDir + "/" + lpConfigFile.c_str()));
      LinguisticProcessingClientFactory::changeable().configureClientFactory(
        clientId,
        lpconfig,
        langs,
        pipelines);
      clientFactoryConfigured = true;
      break;
    }
  }
  if(!clientFactoryConfigured)
  {
//     std::cerr << "No LinguisticProcessingClientFactory were configured with" << configDirs.join(LIMA_PATH_SEPARATOR).toStdString() << "and" << lpConfigFile << std::endl;
    return EXIT_FAILURE;
  }

  std::shared_ptr< AbstractLinguisticProcessingClient > client =
      std::dynamic_pointer_cast<AbstractLinguisticProcessingClient>(
          LinguisticProcessingClientFactory::single().createClient(clientId));

  // Set the handlers
  std::map<std::string, AbstractAnalysisHandler*> handlers;
  BowTextWriter* bowTextWriter = 0;
  EventAnalysis::EventHandler* eventHandler = 0;
  BowTextHandler* bowTextHandler = 0;
  SimpleStreamHandler* simpleStreamHandler = 0;
  SimpleStreamHandler* fullXmlSimpleStreamHandler = 0;
  LTRTextHandler* ltrTextHandler=0;

  if (dumpers.find("event") != dumpers.end())
  {
    eventHandler = new EventAnalysis::EventHandler();
    handlers.insert(std::make_pair("eventHandler",
                                   eventHandler));
  }
  if (dumpers.find("bow") != dumpers.end())
  {
    bowTextWriter = new BowTextWriter();
    handlers.insert(std::make_pair("bowTextWriter",
                                   bowTextWriter));
  }
  if (dumpers.find("bowh") != dumpers.end())
  {
    bowTextHandler = new BowTextHandler();
    handlers.insert(std::make_pair("bowTextHandler",
                                   bowTextHandler));
  }
  if (dumpers.find("text") != dumpers.end())
  {
    simpleStreamHandler = new SimpleStreamHandler();
    handlers.insert(std::make_pair("simpleStreamHandler",
                                   simpleStreamHandler));
  }
  if (dumpers.find("fullxml") != dumpers.end())
  {
    fullXmlSimpleStreamHandler = new SimpleStreamHandler();
    handlers.insert(std::make_pair("fullXmlSimpleStreamHandler",
                                   fullXmlSimpleStreamHandler));
  }
  if (dumpers.find("ltr") != dumpers.end())
  {
    ltrTextHandler= new LTRTextHandler();
    handlers.insert(std::make_pair("ltrTextHandler",
                                   ltrTextHandler));
  }

  std::map<std::string,std::string> metaData;

  metaData["Lang"]=langs[0];
  for (const auto& meta : userMetaData)
  {
    metaData[meta.first] = meta.second;
  }

  uint64_t i=1;
  for (std::vector<std::string>::iterator fileItr=files.begin();
       fileItr!=files.end();
  fileItr++, i++)
  {
    // display the progress of the analysis
    std::cerr << "\rAnalyzing "<< i << "/" << files.size()
              << " ("  << std::setiosflags(std::ios::fixed)
              << std::setprecision(2) << (i*100.0/files.size()) <<"%) '"
              << *fileItr << "'" << std::flush;

    // set the output files (to 0 if not in list)
    // remember to call closeHandlerOutputFile for each call to openHandlerOutputFile
    QString bowOut = outputs.contains("bow")
        ? (outputs["bow"] == "stdout"
            ? "stdout"
            : QString::fromUtf8((*fileItr).c_str())+outputs["bow"])
        : QString::fromUtf8((*fileItr).c_str())+".bin";
    std::ostream* bowofs  = openHandlerOutputFile(bowTextWriter,
                                                  std::string(bowOut.toUtf8().constData()),
                                                  dumpers,
                                                  "bow");
    QString textOut = outputs.contains("text")
        ? (outputs["text"] == "stdout"
            ? "stdout"
            : QString::fromUtf8((*fileItr).c_str())+outputs["text"])
        : "stdout";
    std::ostream* txtofs  = openHandlerOutputFile(simpleStreamHandler,
                                                  std::string(textOut.toUtf8().constData()),
                                                  dumpers,
                                                  "text");
    QString fullxmlOut = outputs.contains("fullxml")
        ? (outputs["fullxml"] == "stdout"
            ? "stdout"
            : QString::fromUtf8((*fileItr).c_str())+outputs["fullxml"])
        : "stdout";
    std::ostream* fullxmlofs  = openHandlerOutputFile(fullXmlSimpleStreamHandler,
                                                      std::string(fullxmlOut.toUtf8().constData()),
                                                      dumpers,
                                                      "fullxml");

    // loading of the input file
    TimeUtils::updateCurrentTime();
    QFile file(fileItr->c_str());
    if (!file.open(QIODevice::ReadOnly | QIODevice::Text))
    {
      std::cerr << "Cannot open file " << *fileItr << " ! " << std::endl;
      continue;
    }
    metaData["FileName"]=*fileItr;

    if (splitMode == "lines")
    {
      int lineNum = 0, nbLines = 0;
      std::cerr << "Counting number of lines…";
      while (!file.atEnd())
      {
        file.readLine();
        nbLines++;
      }
      file.seek(0);

      QTextStream in(&file);
      std::cerr << "\rStarting analysis";
      while (!in.atEnd())
      {
        lineNum++;
        QString percent = QString::number((lineNum*1.0/nbLines*100),'f',2);
        QString contentText = in.readLine();
        if ( (lineNum % 100) == 0)
        {
          std::cerr << "\rAnalyzed "<< lineNum << "/" << nbLines
                    << " (" << percent.toUtf8().constData()
                    << "%) lines. At " << file.pos();
        }
        // analyze it
        client->analyze(contentText,
                        metaData,
                        pipeline,
                        handlers,
                        inactiveUnits);
      }
      file.close();
    }
    else // default == none
    {
      QString contentText = QString::fromUtf8(file.readAll().constData());
      file.close();
      if (contentText.isEmpty())
      {
        std::cerr << "file " << *fileItr << " has empty input ! " << std::endl;
        continue;
      }

      // The input text MUST be UTF-8 encoded !!!
      TimeUtils::logElapsedTime("ReadInputFile");
      TimeUtils::updateCurrentTime();

      // analyze it
      client->analyze(contentText,metaData, pipeline, handlers, inactiveUnits);
    }

    // Close and delete opened output files
    closeHandlerOutputFile(bowofs);
    closeHandlerOutputFile(txtofs);
    closeHandlerOutputFile(fullxmlofs);
  }
  std::cout << std::endl;
  // free handlers
  if (eventHandler != 0)
    delete eventHandler;
  if (bowTextWriter!= 0)
    delete bowTextWriter;
  if (simpleStreamHandler!= 0)
    delete simpleStreamHandler;
  if (fullXmlSimpleStreamHandler!= 0)
    delete fullXmlSimpleStreamHandler;
  if (bowTextHandler!= 0) {
    // not handled in output file: just print on output (this should just be used for testing)
    std::cout << bowTextHandler->getBowText();
    delete bowTextHandler;
  }
  if (ltrTextHandler!= 0) {
    // not handled in output file: just print on output (this should just be used for testing)
    std::cout << ltrTextHandler->getLTRText();
    delete ltrTextHandler;
  }
  TIMELOGINIT;
  LINFO << "Total: "
        << TimeUtils::diffTime(beginTime,TimeUtils::getCurrentTime())
        << " ms";
  TimeUtils::logAllCumulatedTime("Cumulated time.");

  return SUCCESS_ID;
}
Example #2
0
int dowork(int argc,char* argv[])
{

  string resourcesPath=string(getenv("LIMA_RESOURCES"));
  string configDir=string(getenv("LIMA_CONF"));
  string lpConfigFile=string("lima-analysis.xml");
  string commonConfigFile=string("lima-common.xml");
  string pipeline=string("normalization");
  string clientId=string("lima-coreclient");

  bool printCategs=false;

  deque<string> langs;
  deque<string> files;

  if (argc>1)
  {
    for (int i = 1 ; i < argc; i++)
    {
      std::string arg(argv[i]);
      std::string::size_type pos = std::string::npos;
      if ( arg[0] == '-' )
      {
        if (arg == "--help")
          usage(argc, argv);
        else if (arg== "--printCategs")
          printCategs=true;
        else if (arg== "--availableUnits")
          listunits();
        else if (arg== "--catch") ;
        else if ( (pos = arg.find("--lp-config-file=")) != std::string::npos )
          lpConfigFile = arg.substr(pos+14);
        else if ( (pos = arg.find("--common-config-file=")) != std::string::npos )
          commonConfigFile = arg.substr(pos+20);
        else if ( (pos = arg.find("--config-dir=")) != std::string::npos )
          configDir = arg.substr(pos+13);
        else if ( (pos = arg.find("--resources-dir=")) != std::string::npos )
          resourcesPath = arg.substr(pos+16);
        else if ( (pos = arg.find("--language=")) != std::string::npos )
          langs.push_back(arg.substr(pos+11));
//         else if ( (pos = arg.find("--pipeline=")) != std::string::npos )
//           pipeline = arg.substr(pos+11);
        else if ( (pos = arg.find("--client=")) != std::string::npos )
          clientId=arg.substr(pos+9);
        else usage(argc, argv);
      }
      else
      {
        files.push_back(arg);
      }
    }
  }

  if (langs.size()<1)
  {
    std::cerr << "no language defined !" << std::endl;
    return -1;
  }

  AbstractLinguisticProcessingClient* client(0);

  try
  {

    // initialize common
    MediaticData::changeable().init(
      resourcesPath,
      configDir,
      commonConfigFile,
      langs);

    // initialize linguistic processing
    deque<string> pipelines;
    pipelines.push_back(pipeline);
    Lima::Common::XMLConfigurationFiles::XMLConfigurationFileParser lpconfig(configDir + "/" + lpConfigFile);
    LinguisticProcessingClientFactory::changeable().configureClientFactory(
      clientId,
      lpconfig,
      langs,
      pipelines);

    client=dynamic_cast<AbstractLinguisticProcessingClient*>(LinguisticProcessingClientFactory::single().createClient(clientId));
    
    // Set the handlers
    std::map<std::string, AbstractAnalysisHandler*> handlers;
    BowTextHandler bowTextHandler;
    handlers.insert(std::make_pair("bowTextHandler", &bowTextHandler));
    
    map<string,string> metaData;
    metaData["Lang"]=langs[0];
    MediaId lang=MediaticData::single().getMediaId(langs[0]);

    for (deque<string>::iterator fileItr=files.begin();
         fileItr!=files.end();
         fileItr++)
    {
      // open the output file
      ostringstream os;
      os << *fileItr << ".norm";
      ofstream fout(os.str().c_str(), std::ofstream::binary);

      // loading of the input file
      TimeUtils::updateCurrentTime();
      std::ifstream file(fileItr->c_str(), std::ifstream::binary);
      char buf[256];
      file.getline(buf,256);
      std::string line(buf);
      while (!file.eof())
      {
        if (line.size()==0)
        {
          file.getline(buf,256);
          line=string(buf);
          continue;
        }
        //        cout << "normalize " << line << endl;
        LimaString contentText;
        // The input text MUST be UTF-8 encoded !!!
        contentText = utf8stdstring2limastring(line);

        // analyze it
        metaData["FileName"]=*fileItr;

        client->analyze(contentText,metaData,pipeline,handlers);

        // analyze resulting bowText to extract normalization
        multimap<LimaString,string> norms=extractNormalization(contentText,bowTextHandler.getBowText(),lang);
        if (norms.empty())
        {
          norms.insert(make_pair(contentText,"NONE_1"));
        }
        for (multimap<LimaString,string>::iterator it=norms.begin();it!=norms.end();it++)
        {
          fout << limastring2utf8stdstring(it->first);
          if (printCategs)
          {
            fout << "#" << it->second;
          }
          fout << ";";
        }
        fout << endl;
        // read next line
        file.getline(buf,256);
        line=string(buf);
      }
    }
  }
  catch (InvalidConfiguration& e)
  {
    throw e;
  }

  delete client;
  return SUCCESS_ID;
}