/**
 * Collect the normalizations of a source string from a bag-of-words text.
 *
 * A token is kept only when the pair returned by getStartEnd() for it is
 * exactly (1, source.size()+1) -- presumably meaning that the token spans the
 * whole source string; confirm against getStartEnd(). For each kept token,
 * its lemma is associated with the symbolic value of its category, as given
 * by the "MACRO" property manager of the language.
 *
 * @param source  string whose size defines the expected end of a kept token
 * @param bowText tokens to examine
 * @param lang    media id used to fetch the language data
 * @return multimap from lemma to symbolic MACRO value, one entry per kept token
 */
multimap<LimaString,string> extractNormalization(const LimaString& source,const BoWText& bowText,MediaId lang)
{
  const Common::MediaticData::LanguageData& languageData =
    static_cast<const Common::MediaticData::LanguageData&>(MediaticData::single().mediaData(lang));
  const Common::PropertyCode::PropertyManager& macroManager =
    languageData.getPropertyCodeManager().getPropertyManager("MACRO");

  multimap<LimaString,string> normalizations;

  // The expected bounds do not depend on the token: compute the end once.
  const int expectedEnd = int(source.size()+1);

  for (BoWText::const_iterator tokenIt = bowText.begin(), tokenEnd = bowText.end();
       tokenIt != tokenEnd;
       ++tokenIt)
  {
    const pair<int,int> bounds = getStartEnd(*tokenIt);

    // Skip every token whose bounds are not the expected ones.
    if (bounds.first != 1 || bounds.second != expectedEnd)
    {
      continue;
    }

    normalizations.insert(make_pair(
      (*tokenIt)->getLemma(),
      macroManager.getPropertySymbolicValue((*tokenIt)->getCategory())));
  }

  return normalizations;
}
void BoWBinaryWriter::writeBoWText(std::ostream& file, const BoWText& bowText) const { BOWLOGINIT; Misc::writeCodedInt(file,bowText.size()); Misc::writeString(file,bowText.lang); LDEBUG << "BoWBinaryWriter::writeBoWText wrote lang file at: " << file.tellp(); uint64_t tokenCounter(0); // build reverse map to store in file numbers instead of pointers std::map<BoWToken*,uint64_t> refMap; for (BoWText::const_iterator it=bowText.begin(), it_end=bowText.end(); it!=it_end; it++) { refMap[(*it)]=tokenCounter; writeBoWToken(file,*it,refMap); tokenCounter++; } }
void BagOfWordsTest3::test_BoWText() { qDebug() << "BagOfWordsTest3::test_BoWText"; QString bt1Lemma; std::stringstream stream; { boost::shared_ptr< BoWToken > bt1(new BoWToken(QString::fromUtf8("lemma"), 1, 10, 5)); bt1->setInflectedForm(QString::fromUtf8("lemma")); boost::shared_ptr< BoWToken > bt2(new BoWToken(QString::fromUtf8("démocratique"), 1, 10, 12)); bt2->setInflectedForm(QString::fromUtf8("démocratique")); boost::shared_ptr< BoWToken > bt3(new BoWToken(QString::fromUtf8("word"), 1, 10, 4)); bt3->setInflectedForm(QString::fromUtf8("word")); boost::shared_ptr< BoWToken > bt4(new BoWToken(QString::fromUtf8("autre"), 1, 10, 4)); bt4->setInflectedForm(QString::fromUtf8("autre")); bt1Lemma = bt1->getLemma(); boost::shared_ptr< BoWTerm > term(new BoWTerm); term->addPart(bt1); boost::shared_ptr< BoWTerm > term2(new BoWTerm); term2->addPart(bt2); term2->addPart(bt3); term->addPart(term2); BoWText text; text.push_back(term); text.push_back(bt4); BoWBinaryWriter writer; writer.writeBoWText(stream, text); } BoWText text; BoWBinaryReader reader; reader.readBoWText(stream, text); QVERIFY( text.size() == 2 ); boost::shared_ptr<BoWTerm> rterm = boost::dynamic_pointer_cast<BoWTerm>(text[0]); QVERIFY( rterm->getParts().size() == 2 ); QVERIFY( bt1Lemma == rterm->getParts()[0].getBoWToken()->getLemma() ); }
// read documents in a file, using the document buffer given as argument // (can be BoWDocument* or BoWDocumentST*) void readDocuments(istream& fileIn, ostream& out) { while (! fileIn.eof()) { // enum type with small number of values coded on one byte; BoWBlocType blocType; fileIn >> blocType; if (blocType==DOCUMENT_PROPERTIES_BLOC) { Lima::Common::Misc::DocumentProperties props; props.read(fileIn); props.write(out); fileIn >> blocType; } BoWText text; while (blocType==BOW_TEXT_BLOC) { text.read(fileIn); if (fileIn.eof()) break; fileIn >> blocType; } text.write(out); }
void BagOfWordsTest3::test_BoWText2() { qDebug() << "BagOfWordsTest3::test_BoWText2"; QString name; std::stringstream stream; { boost::shared_ptr< BoWToken > bt1(new BoWToken(QString::fromUtf8("lemma"), 1, 10, 5)); bt1->setInflectedForm(QString::fromUtf8("lemma")); boost::shared_ptr< BoWToken > bt2(new BoWToken(QString::fromUtf8("démocratique"), 1, 10, 12)); bt2->setInflectedForm(QString::fromUtf8("démocratique")); boost::shared_ptr< BoWToken > bt3(new BoWToken(QString::fromUtf8("word"), 1, 10, 4)); bt3->setInflectedForm(QString::fromUtf8("word")); boost::shared_ptr< BoWToken > bt4(new BoWToken(QString::fromUtf8("autre"), 1, 10, 4)); bt4->setInflectedForm(QString::fromUtf8("autre")); boost::shared_ptr< BoWTerm > term(new BoWTerm); term->addPart(bt1); boost::shared_ptr< BoWTerm > term2(new BoWTerm); term2->addPart(bt2); term2->addPart(bt3); term->addPart(term2); boost::shared_ptr< BoWNamedEntity > ne(new BoWNamedEntity); ne->setNamedEntityType(EntityType(EntityTypeId(1),EntityGroupId(3))); ne->setFeature("surname","John"); ne->setFeature("name","Doe"); name = ne->getFeatures().at("name"); boost::shared_ptr< BoWToken > btsurname(new BoWToken(QString::fromUtf8("John"), 1, 10, 5)); bt1->setInflectedForm(QString::fromUtf8("John")); boost::shared_ptr< BoWToken > btname(new BoWToken(QString::fromUtf8("Doe"), 1, 10, 12)); bt2->setInflectedForm(QString::fromUtf8("Doe")); ne->addPart(btsurname); ne->addPart(btname); BoWText text; text.push_back(term); text.push_back(bt4); text.push_back(ne); BoWBinaryWriter writer; writer.writeBoWText(stream, text); } BoWText text; BoWBinaryReader reader; reader.readBoWText(stream, text); QVERIFY( text.size() == 3 ); boost::shared_ptr<BoWNamedEntity> rne = boost::dynamic_pointer_cast<BoWNamedEntity>(text[2]); QVERIFY( rne->getParts().size() == 2 ); QVERIFY( name == rne->getFeatures().at("name") ); }