Example #1
0
/**
 * @brief Read all blocks of a structured BoW document file and emit them in
 *        the output format selected by the global @c param.outputFormat.
 *
 * The stream is consumed block by block through
 * BoWBinaryReader::readBoWDocumentBlock() until it reports end-of-file.
 *  - TEXT: blocks are forwarded to a TextWriterBoWDocumentHandler on cout.
 *  - XML : blocks are forwarded to a BoWXMLWriter on cout, surrounded by the
 *          documents header and footer.
 *  - STAT: blocks are accumulated in a SBoWStatWriter, which is printed on
 *          cout once the whole file has been read.
 * Any other output format value results in nothing being read or written.
 *
 * @param fileIn   input stream to read from (presumably already positioned
 *                 after the file header -- confirm against the caller)
 * @param document document buffer handed to the reader for every block
 * @param reader   binary reader used to decode the blocks
 */
void readSDocuments(ifstream& fileIn, BoWDocument* document, BoWBinaryReader& reader)
{
  switch (param.outputFormat)
  {
    case TEXT:
    {
      TextWriterBoWDocumentHandler writer(cout);
      while (! fileIn.eof())
      {
        reader.readBoWDocumentBlock(fileIn, *document, writer, param.useIterator);
      }
      break;
    }
    case XML:
    {
      BoWXMLWriter writer(cout);
      writer.writeBoWDocumentsHeader();
      while (! fileIn.eof())
      {
        reader.readBoWDocumentBlock(fileIn, *document, writer, param.useIterator);
      }
      writer.writeBoWDocumentsFooter();
      // Without this break the XML case fell through into STAT and printed
      // statistics after the XML footer.
      break;
    }
    case STAT:
    {
      SBoWStatWriter writer;
      while (! fileIn.eof())
      {
        reader.readBoWDocumentBlock(fileIn, *document, writer, param.useIterator);
      }
      cout << writer << endl;
      break;
    }
  }
}
Example #2
0
/**
 * Round-trip test for a BoWText containing a complex term, a simple token and
 * a named entity: the text is written with BoWBinaryWriter into a string
 * stream, read back with BoWBinaryReader, and the named entity (third
 * element) is checked for its number of parts and its "name" feature.
 */
void BagOfWordsTest3::test_BoWText2()
{
  qDebug() << "BagOfWordsTest3::test_BoWText2";
  QString name;

  std::stringstream stream;
  {
    boost::shared_ptr< BoWToken > bt1(new BoWToken(QString::fromUtf8("lemma"), 1, 10, 5));
    bt1->setInflectedForm(QString::fromUtf8("lemma"));
    boost::shared_ptr< BoWToken > bt2(new BoWToken(QString::fromUtf8("démocratique"), 1, 10, 12));
    bt2->setInflectedForm(QString::fromUtf8("démocratique"));
    boost::shared_ptr< BoWToken > bt3(new BoWToken(QString::fromUtf8("word"), 1, 10, 4));
    bt3->setInflectedForm(QString::fromUtf8("word"));

    boost::shared_ptr< BoWToken > bt4(new BoWToken(QString::fromUtf8("autre"), 1, 10, 4));
    bt4->setInflectedForm(QString::fromUtf8("autre"));

    // term = [ bt1, term2 = [ bt2, bt3 ] ]
    boost::shared_ptr< BoWTerm > term(new BoWTerm);
    term->addPart(bt1);
    boost::shared_ptr< BoWTerm > term2(new BoWTerm);
    term2->addPart(bt2);
    term2->addPart(bt3);
    term->addPart(term2);

    boost::shared_ptr< BoWNamedEntity > ne(new BoWNamedEntity);
    ne->setNamedEntityType(EntityType(EntityTypeId(1),EntityGroupId(3)));
    ne->setFeature("surname","John");
    ne->setFeature("name","Doe");
    name = ne->getFeatures().at("name");
    // The inflected forms belong to the named-entity tokens themselves; they
    // were previously set on bt1/bt2 by mistake, which overwrote the forms of
    // the term tokens and left btsurname/btname without one.
    boost::shared_ptr< BoWToken > btsurname(new BoWToken(QString::fromUtf8("John"), 1, 10, 5));
    btsurname->setInflectedForm(QString::fromUtf8("John"));
    boost::shared_ptr< BoWToken > btname(new BoWToken(QString::fromUtf8("Doe"), 1, 10, 12));
    btname->setInflectedForm(QString::fromUtf8("Doe"));
    ne->addPart(btsurname);
    ne->addPart(btname);

    BoWText text;
    text.push_back(term);
    text.push_back(bt4);
    text.push_back(ne);

    BoWBinaryWriter writer;
    writer.writeBoWText(stream, text);
  }

  BoWText text;
  BoWBinaryReader reader;
  reader.readBoWText(stream, text);

  QVERIFY( text.size() == 3 );
  boost::shared_ptr<BoWNamedEntity> rne = boost::dynamic_pointer_cast<BoWNamedEntity>(text[2]);
  // Fail the test (instead of dereferencing a null pointer) when the third
  // element did not come back as a named entity.
  QVERIFY( rne.get() != 0 );
  QVERIFY( rne->getParts().size() == 2 );

  QVERIFY( name == rne->getFeatures().at("name") );
}
Example #3
0
//! @brief Read a BoWText from a file and add its tokens to a lexicon
//! @param fileIn the input stream the BoWText is read from
//! @param reader the binary reader used to decode the stream
//! @param lex the lexicon receiving the tokens
//! @param propertyAccessor accessor applied to a token category to obtain the
//!        value looked up in @p referenceProperties
//! @param referenceProperties accepted property values; when empty, no
//!        filtering is done and every token is added to the lexicon
void readBowFileText(ifstream& fileIn,
                     BoWBinaryReader& reader,
                     Lexicon& lex,
                     const PropertyAccessor& propertyAccessor,
                     set<LinguisticCode>& referenceProperties
                    )
{
    BoWText bowText;
    reader.readBoWText(fileIn, bowText);

    // An empty reference set disables category filtering altogether.
    const bool keepEverything = referenceProperties.empty();

    for (BoWTokenIterator tokenIt(bowText); ! tokenIt.isAtEnd(); tokenIt++) {
        const BoWToken& current = *(tokenIt.getElement());

        // Short-circuit: the category is only evaluated when filtering is on.
        const bool accepted =
            keepEverything ||
            referenceProperties.find(propertyAccessor.readValue(current.getCategory()))
                != referenceProperties.end();

        if (accepted) {
            lex.add(getStringDecomp(&current), current.getIndexString());
        }
    }
}
Example #4
0
void BowTextHandler::endAnalysis()
{
  m_writer->endAnalysis();
  istringstream in(m_bowstream->str());
  BoWBinaryReader reader;
  reader.readHeader(in);
  reader.readBoWText(in,m_bowtext);

  // Insertion du bowtext au handler de contenu et de structure
  set_LastContentId(get_LastContentId()+1);
  CONTENT_ID contentId=get_LastContentId();
  ContentHandler< BoWText >::addContent(contentId,m_bowtext);
  Structure structure(1);
  // TODO: replace BOW_TEXT_NAME2 with some consistent value
  std::string BOW_TEXT_NAME2("BOW_TEXT_NAME2");
  Node node(BOW_TEXT_NAME2,1,1,contentId,2, "" , "text" , "bow" , "bow" , 0 , 1 , 0 , 0 , 0 , 0 , 0 , 0 );
  add_Node(node);
}
Example #5
0
/**
 * Round-trip test for a BoWText containing a complex term and a simple token:
 * the text is written with BoWBinaryWriter into a string stream, read back
 * with BoWBinaryReader, and the first element is checked to be a term with
 * two parts whose first part carries the original lemma.
 */
void BagOfWordsTest3::test_BoWText()
{
  qDebug() << "BagOfWordsTest3::test_BoWText";
  QString bt1Lemma;

  std::stringstream stream;
  {
    boost::shared_ptr< BoWToken > bt1(new BoWToken(QString::fromUtf8("lemma"), 1, 10, 5));
    bt1->setInflectedForm(QString::fromUtf8("lemma"));
    boost::shared_ptr< BoWToken > bt2(new BoWToken(QString::fromUtf8("démocratique"), 1, 10, 12));
    bt2->setInflectedForm(QString::fromUtf8("démocratique"));
    boost::shared_ptr< BoWToken > bt3(new BoWToken(QString::fromUtf8("word"), 1, 10, 4));
    bt3->setInflectedForm(QString::fromUtf8("word"));

    boost::shared_ptr< BoWToken > bt4(new BoWToken(QString::fromUtf8("autre"), 1, 10, 4));
    bt4->setInflectedForm(QString::fromUtf8("autre"));

    // Remember the lemma to compare it with the one read back.
    bt1Lemma = bt1->getLemma();

    // term = [ bt1, term2 = [ bt2, bt3 ] ]
    boost::shared_ptr< BoWTerm > term(new BoWTerm);
    term->addPart(bt1);
    boost::shared_ptr< BoWTerm > term2(new BoWTerm);
    term2->addPart(bt2);
    term2->addPart(bt3);
    term->addPart(term2);

    BoWText text;
    text.push_back(term);
    text.push_back(bt4);

    BoWBinaryWriter writer;
    writer.writeBoWText(stream, text);
  }

  BoWText text;
  BoWBinaryReader reader;
  reader.readBoWText(stream, text);

  QVERIFY( text.size() == 2 );
  boost::shared_ptr<BoWTerm> rterm = boost::dynamic_pointer_cast<BoWTerm>(text[0]);
  // Fail the test (instead of dereferencing a null pointer) when the first
  // element did not come back as a term.
  QVERIFY( rterm.get() != 0 );
  QVERIFY( rterm->getParts().size() == 2 );

  QVERIFY( bt1Lemma == rterm->getParts()[0].getBoWToken()->getLemma() );
}
Example #6
0
// Read every document block of a file into the lexicon, using the document
// buffer given as argument (can be BoWDocument* or BoWDocumentST*).
// Blocks are forwarded to a GetLexiconBoWDocumentHandler built on `lex`;
// category filtering is switched on as soon as `referenceProperties`
// contains at least one value.
void readDocuments(ifstream& fileIn, BoWDocument* document,
                   BoWBinaryReader& reader,
                   Lexicon& lex,
                   const PropertyManager& macroManager,
                   const PropertyAccessor& propertyAccessor,
                   set<LinguisticCode>& referenceProperties )
{
    // Filtering is only meaningful when there is something to filter on.
    bool filterCategory = ! referenceProperties.empty();

    GetLexiconBoWDocumentHandler lexiconHandler(lex,
                                                macroManager,
                                                propertyAccessor,
                                                referenceProperties,
                                                filterCategory);

    // Consume the stream block by block until it reports end-of-file.
    while (! fileIn.eof())
    {
        reader.readBoWDocumentBlock(fileIn, *document, lexiconHandler, true);
    }
}
Example #7
0
//**********************************************************************
//
// M A I N
//
//**********************************************************************
int main(int argc, char *argv[])
{
  QCoreApplication a(argc, argv);
  QsLogging::initQsLog();
  if (argc<1) {
        cerr << USAGE;
        return EXIT_FAILURE;
    }
    QsLogging::initQsLog();
    readCommandLineArguments(argc,argv);
    if (param.help) {
        cerr << HELP;
        return EXIT_FAILURE;
    }


    string resourcesPath=getenv("LIMA_RESOURCES")==0?"/usr/share/apps/lima/resources":string(getenv("LIMA_RESOURCES"));
    string configDir=getenv("LIMA_CONF")==0?"/usr/share/config/lima":string(getenv("LIMA_CONF"));

    if ( (!param.language.size()) && (!param.codeFile.size()) ) {
        cerr << "no codefile nor language specified !" << endl;
        cerr << "Use e.g option '-l fre'." << endl;
        cerr << "Option '-h' gives full help" << endl;
        return EXIT_FAILURE;
    }
    else if ( param.language.size() ) {
        param.codeFile=resourcesPath+"/LinguisticProcessings/"+param.language+"/code-"+param.language+".xml";
    }

    cerr << "read proccodeManager from file " << param.codeFile << "..." << endl;
    PropertyCodeManager propcodemanager;
    propcodemanager.readFromXmlFile(param.codeFile);
    cerr << "get macroManager..." << endl;
    const PropertyManager& macroManager = propcodemanager.getPropertyManager("MACRO");
    const PropertyAccessor& propertyAccessor = macroManager.getPropertyAccessor();
    set<LinguisticCode> referenceProperties;
    for ( std::vector<string>::const_iterator macro = param.macro.begin() ;
            macro != param.macro.end() ; macro++ ) {
        cerr << "referenceProperties.insert(" << *macro << ")" << endl;
        LinguisticCode referenceProperty = macroManager.getPropertyValue(*macro);
        referenceProperties.insert(referenceProperty);
    }

    cerr << "referencePropertySet= ";
    set<LinguisticCode>::iterator propIt = referenceProperties.begin();
    if ( propIt != referenceProperties.end() ) {
        const std::string& symbol = macroManager.getPropertySymbolicValue(*propIt);
        cerr << symbol;
        propIt++;
    }
    for ( ; propIt != referenceProperties.end() ; propIt++ ) {
        const std::string& symbol = macroManager.getPropertySymbolicValue(*propIt);
        cerr << ", " << symbol;
    }
    cerr << endl;

    Lexicon lex;

    // read all files and count terms
    vector<string>::const_iterator
    file=param.inputFiles.begin(),
         file_end=param.inputFiles.end();
    for (;file!=file_end; file++) {

        ifstream fileIn((*file).c_str(), std::ifstream::binary);
        if (! fileIn) {
            cerr << "cannot open input file [" << *file << "]" << endl;
            continue;
        }
        BoWBinaryReader reader;
        try {
            reader.readHeader(fileIn);
        }
        catch (exception& e) {
            cerr << "Error: " << e.what() << endl;
            return EXIT_FAILURE;
        }

        switch (reader.getFileType()) {
        case BOWFILE_TEXT: {
            cerr << "Build lexicon from BoWText [" << *file << "]" << endl;
            try {
                readBowFileText(fileIn,reader, lex, propertyAccessor, referenceProperties);
            }
            catch (exception& e) {
                cerr << "Error: " << e.what() << endl;
            }
            break;
        }
        case BOWFILE_DOCUMENTST: {
            cerr << "ReadBoWFile: file contains a BoWDocumentST  -> not treated" << endl;
        }
        case BOWFILE_DOCUMENT: {
            cerr << "ReadBoWFile: build BoWdocument from  " << *file<< endl;
            BoWDocument* document=new BoWDocument();
            try {
                cerr << "ReadBoWFile: extract terms... " << endl;
                readDocuments(fileIn,document,reader, lex, macroManager, propertyAccessor, referenceProperties);
            }
            catch (exception& e) {
                cerr << "Error: " << e.what() << endl;
            }
            fileIn.close();
            delete document;
            break;
        }
        default: {
            cerr << "format of file " << reader.getFileTypeString() << " not managed"
                 << endl;
            return EXIT_FAILURE;
        }
        }
    }

    // output stream (default is 'cout')
    std::ostream *s_out;

    // Manage output
    if ( param.outputFilename.length() == 0) s_out=&std::cout;
    else s_out = new std::ofstream(param.outputFilename.c_str(), std::ios_base::out | std::ios_base::binary | std::ios_base::trunc);

    // output lexicon
    Lexicon::const_iterator
    w=lex.begin(),
      w_end=lex.end();
    for (;w!=w_end; w++) {
        (*s_out) << Common::Misc::limastring2utf8stdstring((*w).second.second) << "|"
        << Common::Misc::limastring2utf8stdstring((*w).first) << "|"
        << (*w).second.first << endl;
    }

    // Close output file (if any)
    if (  param.outputFilename.length() != 0)
        dynamic_cast<std::ofstream*>(s_out)->close();

    return EXIT_SUCCESS;
}
Example #8
0
//**********************************************************************
//
// M A I N
//
//**********************************************************************
int main(int argc, char *argv[])
{
  QCoreApplication a(argc, argv);
  QsLogging::initQsLog();
  if (argc<1) {    cerr << USAGE; exit(1); }
  readCommandLineArguments(argc,argv);
  if (param.help) { cerr << HELP; exit(1); }

  BOWLOGINIT;

  // read BoWFile and output documents

  std::ifstream fileIn(param.inputFile.c_str(),std::ifstream::binary);
  if (! fileIn)
  {
    cerr << "cannot open input file [" << param.inputFile << "]" << endl;
    exit(1);
  }
  BoWBinaryReader reader;
  try
  {
    reader.readHeader(fileIn);
  }
  catch (exception& e)
  {
    cerr << "Error: " << e.what() << endl;
    exit(1);
  }

  switch (reader.getFileType())  {

  case BOWFILE_TEXT: {

    LINFO << "ReadBoWFile: file contains a BoWText" << LENDL;
    BoWText text;
    reader.readBoWText(fileIn,text);

    switch (param.outputFormat) {
    case XML: {
      BoWXMLWriter writer(cout);
      writer.writeBoWText(&text,
                          param.useIterator,
                          param.useIndexIterator);
      break;
    }
    case BOWFILE_NOTYPE: {
      if (param.useIterator) {
        BoWTokenIterator it(text);
        while (! it.isAtEnd()) {
          cout << it.getElement()->getOutputUTF8String() << endl;
          it++;
        }
      }
      else if (param.useIndexIterator) {
        LINFO << "ReadBoWFile: call IndexElementIterator with maxCompoundSize=" << param.maxCompoundSize << LENDL;
        IndexElementIterator it(text,0,param.maxCompoundSize);
        while (! it.isAtEnd()) {
          cout << it.getElement() << endl;
          it++;
        }
      }
      else {
        cout << text << endl;
      }
      break;
    }
    default: cerr << "Error: output format not handled" << endl;
    }
    break;
  }
  case BOWFILE_SDOCUMENT:
  {
    LINFO << "ReadBoWFile: file contains a StructuredBoWDocument" << LENDL;
    BoWDocument* document=new BoWDocument();
    try
    {
      readSDocuments(fileIn, document, reader);
    }
    catch (exception& e) { cerr << "Error: " << e.what() << endl; }
    fileIn.close();
    delete document;
    break;
  }
/*
  case BOWFILE_DOCUMENT: {
    cerr << "ReadBoWFile: file contains a BoWDocument" << endl;
    BoWDocument* document=new BoWDocument();
    try
      {
        BoWXMLWriter::getInstance().writeBoWDocumentsHeader(cout);
        readDocuments(fileIn,document);
        BoWXMLWriter::getInstance().writeBoWDocumentsFooter(cout);
      }
    catch (exception& e) { cerr << "Error: " << e.what() << endl; }
    fileIn.close();
    delete document;
    break;
  }
  case BOWFILE_DOCUMENTST: {
    cerr << "ReadBoWFile: file contains a BoWDocumentST" << endl;
    BoWDocument* document=new BoWDocumentST();
    try {
      BoWXMLWriter::getInstance().writeBoWDocumentsHeader(cout);
      readDocuments(fileIn,document);
      BoWXMLWriter::getInstance().writeBoWDocumentsFooter(cout);
    }
    catch (exception& e) { cerr << "Error: " << e.what() << endl; }
    fileIn.close();
    delete document;
    break;
  }
*/  
  default: {
    cerr << "format of file " << reader.getFileTypeString() << " not managed"
         << endl;
    exit(1);
  }
  }
  return EXIT_SUCCESS;
}