Пример #1
0
/**
 * Collects the normalizations of the tokens in bowText whose position matches
 * the full extent of source.
 *
 * A token is kept when getStartEnd() reports first == 1 and
 * second == source.size()+1 (presumably a 1-based start and a
 * one-past-the-end position -- confirm against getStartEnd()).
 *
 * @param source  text whose size defines the expected end position
 * @param bowText tokens to inspect
 * @param lang    media id used to fetch the "MACRO" property manager
 * @return a multimap associating the lemma of every kept token with the
 *         symbolic "MACRO" value of that token's category
 */
multimap<LimaString,string> extractNormalization(const LimaString& source,const BoWText& bowText,MediaId lang)
{
  const Common::PropertyCode::PropertyManager& macroManager =
    static_cast<const Common::MediaticData::LanguageData&>(
      MediaticData::single().mediaData(lang))
        .getPropertyCodeManager()
        .getPropertyManager("MACRO");

  // End position a token must report to be kept; it does not change
  // during the scan, so it is computed once.
  const int expectedEnd=int(source.size()+1);

  multimap<LimaString,string> normalizations;
  for (BoWText::const_iterator tokenIt=bowText.begin();
       tokenIt!=bowText.end();
       ++tokenIt)
  {
    const pair<int,int> span=getStartEnd(*tokenIt);
    if ((span.first!=1) || (span.second!=expectedEnd))
    {
      // Position does not match the expected start/end: ignore this token.
      continue;
    }
    normalizations.insert(make_pair(
                            (*tokenIt)->getLemma(),
                            macroManager.getPropertySymbolicValue((*tokenIt)->getCategory())));
  }
  return normalizations;
}
Пример #2
0
void BoWBinaryWriter::writeBoWText(std::ostream& file,
             const BoWText& bowText) const
{
    BOWLOGINIT;
    Misc::writeCodedInt(file,bowText.size());
    Misc::writeString(file,bowText.lang);
    LDEBUG << "BoWBinaryWriter::writeBoWText wrote lang file at: " << file.tellp();
    uint64_t tokenCounter(0);
    // build reverse map to store in file numbers instead of pointers
    std::map<BoWToken*,uint64_t> refMap;
    for (BoWText::const_iterator it=bowText.begin(),
            it_end=bowText.end(); it!=it_end; it++) {
        refMap[(*it)]=tokenCounter;
        writeBoWToken(file,*it,refMap);
        tokenCounter++;
    }
}