/**
 * Collects the normalizations of @p source found in @p bowText.
 *
 * A token is retained only when getStartEnd() reports that it begins at
 * offset 1 and ends at source.size()+1, i.e. when it covers the whole
 * source string (presumably 1-based start / one-past-the-end offsets --
 * confirm against getStartEnd()).
 *
 * @param source   the string whose normalizations are looked for
 * @param bowText  the tokens to scan
 * @param lang     language whose "MACRO" property manager is used to turn a
 *                 token category into its symbolic value
 * @return one (lemma, symbolic macro category) entry per retained token;
 *         empty when no token covers the whole source
 */
multimap<LimaString,string> extractNormalization(const LimaString& source,const BoWText& bowText,MediaId lang)
{
  const Common::PropertyCode::PropertyManager& macroManager =
    static_cast<const Common::MediaticData::LanguageData&>(
      MediaticData::single().mediaData(lang))
    .getPropertyCodeManager()
    .getPropertyManager("MACRO");

  multimap<LimaString,string> normalizations;

  // Historical note: an earlier version took the lemma of the only token
  // unconditionally when the text held exactly one token, and applied the
  // start-to-end filter otherwise. The filter is now applied in every case.

  // End offset a token must report to be considered as spanning the source.
  const int expectedEnd = int(source.size()+1);

  for (BoWText::const_iterator tokenIt = bowText.begin();
       tokenIt != bowText.end();
       ++tokenIt)
  {
    const pair<int,int> span = getStartEnd(*tokenIt);

    // Skip tokens that cover only part of the source.
    if (span.first != 1 || span.second != expectedEnd)
    {
      continue;
    }

    normalizations.insert(make_pair(
      (*tokenIt)->getLemma(),
      macroManager.getPropertySymbolicValue((*tokenIt)->getCategory())));
  }

  return normalizations;
}
void BoWBinaryWriter::writeBoWText(std::ostream& file, const BoWText& bowText) const { BOWLOGINIT; Misc::writeCodedInt(file,bowText.size()); Misc::writeString(file,bowText.lang); LDEBUG << "BoWBinaryWriter::writeBoWText wrote lang file at: " << file.tellp(); uint64_t tokenCounter(0); // build reverse map to store in file numbers instead of pointers std::map<BoWToken*,uint64_t> refMap; for (BoWText::const_iterator it=bowText.begin(), it_end=bowText.end(); it!=it_end; it++) { refMap[(*it)]=tokenCounter; writeBoWToken(file,*it,refMap); tokenCounter++; } }