// read part of structured document from a file, using the document buffer given as argument void readSDocuments(ifstream& fileIn, BoWDocument* document, BoWBinaryReader& reader) { switch (param.outputFormat) { case TEXT: { TextWriterBoWDocumentHandler writer(cout); while (! fileIn.eof()) { reader.readBoWDocumentBlock(fileIn, *document, writer, param.useIterator); } break; } case XML: { BoWXMLWriter writer(cout); writer.writeBoWDocumentsHeader(); while (! fileIn.eof()) { reader.readBoWDocumentBlock(fileIn, *document, writer, param.useIterator); } writer.writeBoWDocumentsFooter(); } case STAT: { SBoWStatWriter writer; while (! fileIn.eof()) { reader.readBoWDocumentBlock(fileIn, *document, writer, param.useIterator); } cout << writer << endl; break; } } }
void BagOfWordsTest3::test_BoWText2() { qDebug() << "BagOfWordsTest3::test_BoWText2"; QString name; std::stringstream stream; { boost::shared_ptr< BoWToken > bt1(new BoWToken(QString::fromUtf8("lemma"), 1, 10, 5)); bt1->setInflectedForm(QString::fromUtf8("lemma")); boost::shared_ptr< BoWToken > bt2(new BoWToken(QString::fromUtf8("démocratique"), 1, 10, 12)); bt2->setInflectedForm(QString::fromUtf8("démocratique")); boost::shared_ptr< BoWToken > bt3(new BoWToken(QString::fromUtf8("word"), 1, 10, 4)); bt3->setInflectedForm(QString::fromUtf8("word")); boost::shared_ptr< BoWToken > bt4(new BoWToken(QString::fromUtf8("autre"), 1, 10, 4)); bt4->setInflectedForm(QString::fromUtf8("autre")); boost::shared_ptr< BoWTerm > term(new BoWTerm); term->addPart(bt1); boost::shared_ptr< BoWTerm > term2(new BoWTerm); term2->addPart(bt2); term2->addPart(bt3); term->addPart(term2); boost::shared_ptr< BoWNamedEntity > ne(new BoWNamedEntity); ne->setNamedEntityType(EntityType(EntityTypeId(1),EntityGroupId(3))); ne->setFeature("surname","John"); ne->setFeature("name","Doe"); name = ne->getFeatures().at("name"); boost::shared_ptr< BoWToken > btsurname(new BoWToken(QString::fromUtf8("John"), 1, 10, 5)); bt1->setInflectedForm(QString::fromUtf8("John")); boost::shared_ptr< BoWToken > btname(new BoWToken(QString::fromUtf8("Doe"), 1, 10, 12)); bt2->setInflectedForm(QString::fromUtf8("Doe")); ne->addPart(btsurname); ne->addPart(btname); BoWText text; text.push_back(term); text.push_back(bt4); text.push_back(ne); BoWBinaryWriter writer; writer.writeBoWText(stream, text); } BoWText text; BoWBinaryReader reader; reader.readBoWText(stream, text); QVERIFY( text.size() == 3 ); boost::shared_ptr<BoWNamedEntity> rne = boost::dynamic_pointer_cast<BoWNamedEntity>(text[2]); QVERIFY( rne->getParts().size() == 2 ); QVERIFY( name == rne->getFeatures().at("name") ); }
//! @brief read a text Bow file and fills a lexicon //! @param fileIn the file to read //! @param reader the file reader //! @param lex the lexicon to fill //! @param propertyAccessor //! @param referenceProperties void readBowFileText(ifstream& fileIn, BoWBinaryReader& reader, Lexicon& lex, const PropertyAccessor& propertyAccessor, set<LinguisticCode>& referenceProperties ) { BoWText text; reader.readBoWText(fileIn,text); bool filterCategory = false; if ( referenceProperties.size() > 0 ) { filterCategory = true; } BoWTokenIterator it(text); while (! it.isAtEnd()) { const BoWToken& token = *(it.getElement()); if (filterCategory) { set<LinguisticCode>::const_iterator referencePropertyIt = referenceProperties.find(propertyAccessor.readValue(token.getCategory())); if ( referencePropertyIt != referenceProperties.end() ) { lex.add(getStringDecomp(&token),token.getIndexString()); } } else { lex.add(getStringDecomp(&token),token.getIndexString()); } it++; } }
void BowTextHandler::endAnalysis() { m_writer->endAnalysis(); istringstream in(m_bowstream->str()); BoWBinaryReader reader; reader.readHeader(in); reader.readBoWText(in,m_bowtext); // Insertion du bowtext au handler de contenu et de structure set_LastContentId(get_LastContentId()+1); CONTENT_ID contentId=get_LastContentId(); ContentHandler< BoWText >::addContent(contentId,m_bowtext); Structure structure(1); // TODO: replace BOW_TEXT_NAME2 with some consistent value std::string BOW_TEXT_NAME2("BOW_TEXT_NAME2"); Node node(BOW_TEXT_NAME2,1,1,contentId,2, "" , "text" , "bow" , "bow" , 0 , 1 , 0 , 0 , 0 , 0 , 0 , 0 ); add_Node(node); }
void BagOfWordsTest3::test_BoWText() { qDebug() << "BagOfWordsTest3::test_BoWText"; QString bt1Lemma; std::stringstream stream; { boost::shared_ptr< BoWToken > bt1(new BoWToken(QString::fromUtf8("lemma"), 1, 10, 5)); bt1->setInflectedForm(QString::fromUtf8("lemma")); boost::shared_ptr< BoWToken > bt2(new BoWToken(QString::fromUtf8("démocratique"), 1, 10, 12)); bt2->setInflectedForm(QString::fromUtf8("démocratique")); boost::shared_ptr< BoWToken > bt3(new BoWToken(QString::fromUtf8("word"), 1, 10, 4)); bt3->setInflectedForm(QString::fromUtf8("word")); boost::shared_ptr< BoWToken > bt4(new BoWToken(QString::fromUtf8("autre"), 1, 10, 4)); bt4->setInflectedForm(QString::fromUtf8("autre")); bt1Lemma = bt1->getLemma(); boost::shared_ptr< BoWTerm > term(new BoWTerm); term->addPart(bt1); boost::shared_ptr< BoWTerm > term2(new BoWTerm); term2->addPart(bt2); term2->addPart(bt3); term->addPart(term2); BoWText text; text.push_back(term); text.push_back(bt4); BoWBinaryWriter writer; writer.writeBoWText(stream, text); } BoWText text; BoWBinaryReader reader; reader.readBoWText(stream, text); QVERIFY( text.size() == 2 ); boost::shared_ptr<BoWTerm> rterm = boost::dynamic_pointer_cast<BoWTerm>(text[0]); QVERIFY( rterm->getParts().size() == 2 ); QVERIFY( bt1Lemma == rterm->getParts()[0].getBoWToken()->getLemma() ); }
// read documents in a file, using the document buffer given as argument // (can be BoWDocument* or BoWDocumentST*) void readDocuments(ifstream& fileIn, BoWDocument* document, BoWBinaryReader& reader, Lexicon& lex, const PropertyManager& macroManager, const PropertyAccessor& propertyAccessor, set<LinguisticCode>& referenceProperties ) { bool filterCategory = false; // LinguisticCode referenceProperty; if ( referenceProperties.size() > 0 ) { // referenceProperty = *(referenceProperties.begin()); filterCategory = true; } GetLexiconBoWDocumentHandler handler(lex,macroManager,propertyAccessor, referenceProperties,filterCategory); while (! fileIn.eof()) { reader.readBoWDocumentBlock(fileIn,*document,handler,true); } }
//********************************************************************** // // M A I N // //********************************************************************** int main(int argc, char *argv[]) { QCoreApplication a(argc, argv); QsLogging::initQsLog(); if (argc<1) { cerr << USAGE; return EXIT_FAILURE; } QsLogging::initQsLog(); readCommandLineArguments(argc,argv); if (param.help) { cerr << HELP; return EXIT_FAILURE; } string resourcesPath=getenv("LIMA_RESOURCES")==0?"/usr/share/apps/lima/resources":string(getenv("LIMA_RESOURCES")); string configDir=getenv("LIMA_CONF")==0?"/usr/share/config/lima":string(getenv("LIMA_CONF")); if ( (!param.language.size()) && (!param.codeFile.size()) ) { cerr << "no codefile nor language specified !" << endl; cerr << "Use e.g option '-l fre'." << endl; cerr << "Option '-h' gives full help" << endl; return EXIT_FAILURE; } else if ( param.language.size() ) { param.codeFile=resourcesPath+"/LinguisticProcessings/"+param.language+"/code-"+param.language+".xml"; } cerr << "read proccodeManager from file " << param.codeFile << "..." << endl; PropertyCodeManager propcodemanager; propcodemanager.readFromXmlFile(param.codeFile); cerr << "get macroManager..." << endl; const PropertyManager& macroManager = propcodemanager.getPropertyManager("MACRO"); const PropertyAccessor& propertyAccessor = macroManager.getPropertyAccessor(); set<LinguisticCode> referenceProperties; for ( std::vector<string>::const_iterator macro = param.macro.begin() ; macro != param.macro.end() ; macro++ ) { cerr << "referenceProperties.insert(" << *macro << ")" << endl; LinguisticCode referenceProperty = macroManager.getPropertyValue(*macro); referenceProperties.insert(referenceProperty); } cerr << "referencePropertySet= "; set<LinguisticCode>::iterator propIt = referenceProperties.begin(); if ( propIt != referenceProperties.end() ) { const std::string& symbol = macroManager.getPropertySymbolicValue(*propIt); cerr << symbol; propIt++; } for ( ; propIt != referenceProperties.end() ; propIt++ ) { const std::string& symbol = macroManager.getPropertySymbolicValue(*propIt); cerr << ", " << symbol; } cerr << endl; Lexicon lex; // read all files and count terms vector<string>::const_iterator file=param.inputFiles.begin(), file_end=param.inputFiles.end(); for (;file!=file_end; file++) { ifstream fileIn((*file).c_str(), std::ifstream::binary); if (! fileIn) { cerr << "cannot open input file [" << *file << "]" << endl; continue; } BoWBinaryReader reader; try { reader.readHeader(fileIn); } catch (exception& e) { cerr << "Error: " << e.what() << endl; return EXIT_FAILURE; } switch (reader.getFileType()) { case BOWFILE_TEXT: { cerr << "Build lexicon from BoWText [" << *file << "]" << endl; try { readBowFileText(fileIn,reader, lex, propertyAccessor, referenceProperties); } catch (exception& e) { cerr << "Error: " << e.what() << endl; } break; } case BOWFILE_DOCUMENTST: { cerr << "ReadBoWFile: file contains a BoWDocumentST -> not treated" << endl; } case BOWFILE_DOCUMENT: { cerr << "ReadBoWFile: build BoWdocument from " << *file<< endl; BoWDocument* document=new BoWDocument(); try { cerr << "ReadBoWFile: extract terms... " << endl; readDocuments(fileIn,document,reader, lex, macroManager, propertyAccessor, referenceProperties); } catch (exception& e) { cerr << "Error: " << e.what() << endl; } fileIn.close(); delete document; break; } default: { cerr << "format of file " << reader.getFileTypeString() << " not managed" << endl; return EXIT_FAILURE; } } } // output stream (default is 'cout') std::ostream *s_out; // Manage output if ( param.outputFilename.length() == 0) s_out=&std::cout; else s_out = new std::ofstream(param.outputFilename.c_str(), std::ios_base::out | std::ios_base::binary | std::ios_base::trunc); // output lexicon Lexicon::const_iterator w=lex.begin(), w_end=lex.end(); for (;w!=w_end; w++) { (*s_out) << Common::Misc::limastring2utf8stdstring((*w).second.second) << "|" << Common::Misc::limastring2utf8stdstring((*w).first) << "|" << (*w).second.first << endl; } // Close output file (if any) if ( param.outputFilename.length() != 0) dynamic_cast<std::ofstream*>(s_out)->close(); return EXIT_SUCCESS; }
//********************************************************************** // // M A I N // //********************************************************************** int main(int argc, char *argv[]) { QCoreApplication a(argc, argv); QsLogging::initQsLog(); if (argc<1) { cerr << USAGE; exit(1); } readCommandLineArguments(argc,argv); if (param.help) { cerr << HELP; exit(1); } BOWLOGINIT; // read BoWFile and output documents std::ifstream fileIn(param.inputFile.c_str(),std::ifstream::binary); if (! fileIn) { cerr << "cannot open input file [" << param.inputFile << "]" << endl; exit(1); } BoWBinaryReader reader; try { reader.readHeader(fileIn); } catch (exception& e) { cerr << "Error: " << e.what() << endl; exit(1); } switch (reader.getFileType()) { case BOWFILE_TEXT: { LINFO << "ReadBoWFile: file contains a BoWText" << LENDL; BoWText text; reader.readBoWText(fileIn,text); switch (param.outputFormat) { case XML: { BoWXMLWriter writer(cout); writer.writeBoWText(&text, param.useIterator, param.useIndexIterator); break; } case BOWFILE_NOTYPE: { if (param.useIterator) { BoWTokenIterator it(text); while (! it.isAtEnd()) { cout << it.getElement()->getOutputUTF8String() << endl; it++; } } else if (param.useIndexIterator) { LINFO << "ReadBoWFile: call IndexElementIterator with maxCompoundSize=" << param.maxCompoundSize << LENDL; IndexElementIterator it(text,0,param.maxCompoundSize); while (! it.isAtEnd()) { cout << it.getElement() << endl; it++; } } else { cout << text << endl; } break; } default: cerr << "Error: output format not handled" << endl; } break; } case BOWFILE_SDOCUMENT: { LINFO << "ReadBoWFile: file contains a StructuredBoWDocument" << LENDL; BoWDocument* document=new BoWDocument(); try { readSDocuments(fileIn, document, reader); } catch (exception& e) { cerr << "Error: " << e.what() << endl; } fileIn.close(); delete document; break; } /* case BOWFILE_DOCUMENT: { cerr << "ReadBoWFile: file contains a BoWDocument" << endl; BoWDocument* document=new BoWDocument(); try { BoWXMLWriter::getInstance().writeBoWDocumentsHeader(cout); readDocuments(fileIn,document); BoWXMLWriter::getInstance().writeBoWDocumentsFooter(cout); } catch (exception& e) { cerr << "Error: " << e.what() << endl; } fileIn.close(); delete document; break; } case BOWFILE_DOCUMENTST: { cerr << "ReadBoWFile: file contains a BoWDocumentST" << endl; BoWDocument* document=new BoWDocumentST(); try { BoWXMLWriter::getInstance().writeBoWDocumentsHeader(cout); readDocuments(fileIn,document); BoWXMLWriter::getInstance().writeBoWDocumentsFooter(cout); } catch (exception& e) { cerr << "Error: " << e.what() << endl; } fileIn.close(); delete document; break; } */ default: { cerr << "format of file " << reader.getFileTypeString() << " not managed" << endl; exit(1); } } return EXIT_SUCCESS; }