Пример #1
0
/**
 * Collect the normalizations of @p source that are present in @p bowText.
 *
 * Only the elements of @p bowText that span the whole source are kept, i.e.
 * those for which getStartEnd() returns the pair (1, source.size()+1).
 * Each kept element adds one entry associating its lemma with the value
 * returned by the "MACRO" property manager's getPropertySymbolicValue()
 * for its category.
 *
 * @param source   string whose full-span elements are searched for
 * @param bowText  elements to scan
 * @param lang     media id used to look up the language data
 * @return the (lemma, symbolic MACRO value) entries; empty when no element
 *         spans the whole source
 */
multimap<LimaString,string> extractNormalization(const LimaString& source,const BoWText& bowText,MediaId lang)
{
  const Common::MediaticData::LanguageData& languageData =
    static_cast<const Common::MediaticData::LanguageData&>(MediaticData::single().mediaData(lang));
  const Common::PropertyCode::PropertyManager& macroManager =
    languageData.getPropertyCodeManager().getPropertyManager("MACRO");

  // End value reported by getStartEnd() for an element reaching the end of the source.
  const int wholeSpanEnd = int(source.size()+1);

  multimap<LimaString,string> normalizations;
  BoWText::const_iterator token = bowText.begin();
  while (token != bowText.end())
  {
    const pair<int,int> span = getStartEnd(*token);
    const bool coversWholeSource = (span.first==1) && (span.second==wholeSpanEnd);
    if (coversWholeSource)
    {
      normalizations.insert(make_pair(
                              (*token)->getLemma(),
                              macroManager.getPropertySymbolicValue((*token)->getCategory())));
    }
    ++token;
  }
  return normalizations;
}
Пример #2
0
void BoWBinaryWriter::writeBoWText(std::ostream& file,
             const BoWText& bowText) const
{
    BOWLOGINIT;
    Misc::writeCodedInt(file,bowText.size());
    Misc::writeString(file,bowText.lang);
    LDEBUG << "BoWBinaryWriter::writeBoWText wrote lang file at: " << file.tellp();
    uint64_t tokenCounter(0);
    // build reverse map to store in file numbers instead of pointers
    std::map<BoWToken*,uint64_t> refMap;
    for (BoWText::const_iterator it=bowText.begin(),
            it_end=bowText.end(); it!=it_end; it++) {
        refMap[(*it)]=tokenCounter;
        writeBoWToken(file,*it,refMap);
        tokenCounter++;
    }
}
Пример #3
0
/**
 * Round-trip test for BoWText binary serialization.
 *
 * Builds a BoWText holding a compound term (one token plus a nested
 * two-token term) and a simple token, writes it with BoWBinaryWriter into a
 * string stream, reads it back with BoWBinaryReader, then checks the number
 * of elements, the number of parts of the first element and the lemma of its
 * first part.
 */
void BagOfWordsTest3::test_BoWText()
{
  qDebug() << "BagOfWordsTest3::test_BoWText";
  QString bt1Lemma;

  std::stringstream stream;
  {
    // The original objects live only in this scope, so the checks below can
    // only see what was actually read back from the stream.
    boost::shared_ptr< BoWToken > bt1(new BoWToken(QString::fromUtf8("lemma"), 1, 10, 5));
    bt1->setInflectedForm(QString::fromUtf8("lemma"));
    boost::shared_ptr< BoWToken > bt2(new BoWToken(QString::fromUtf8("démocratique"), 1, 10, 12));
    bt2->setInflectedForm(QString::fromUtf8("démocratique"));
    boost::shared_ptr< BoWToken > bt3(new BoWToken(QString::fromUtf8("word"), 1, 10, 4));
    bt3->setInflectedForm(QString::fromUtf8("word"));

    // NOTE(review): for the other tokens the last constructor argument equals
    // the lemma's character count, but it is 4 for "autre" (5 characters) —
    // confirm whether this is intended.
    boost::shared_ptr< BoWToken > bt4(new BoWToken(QString::fromUtf8("autre"), 1, 10, 4));
    bt4->setInflectedForm(QString::fromUtf8("autre"));

    // Remember the expected lemma before the original token goes away.
    bt1Lemma = bt1->getLemma();

    // term = [ bt1, term2 = [ bt2, bt3 ] ]
    boost::shared_ptr< BoWTerm > term(new BoWTerm);
    term->addPart(bt1);
    boost::shared_ptr< BoWTerm > term2(new BoWTerm);
    term2->addPart(bt2);
    term2->addPart(bt3);
    term->addPart(term2);

    BoWText text;
    text.push_back(term);
    text.push_back(bt4);

    BoWBinaryWriter writer;
    writer.writeBoWText(stream, text);
  }

  BoWText text;
  BoWBinaryReader reader;
  reader.readBoWText(stream, text);

  QVERIFY( text.size() == 2 );
  boost::shared_ptr<BoWTerm> rterm = boost::dynamic_pointer_cast<BoWTerm>(text[0]);
  // dynamic_pointer_cast yields an empty pointer when the element is not a
  // BoWTerm: report a test failure instead of dereferencing null below.
  QVERIFY( rterm.get() != 0 );
  QVERIFY( rterm->getParts().size() == 2 );

  QVERIFY( bt1Lemma == rterm->getParts()[0].getBoWToken()->getLemma() );
}
Пример #4
0
// Read serialized blocs from fileIn and write their content to out.
//
// Each pass of the outer loop:
//   1. reads a bloc type;
//   2. if it is DOCUMENT_PROPERTIES_BLOC, reads a DocumentProperties object,
//      writes it to out and reads the next bloc type;
//   3. while the bloc type is BOW_TEXT_BLOC, reads into a single BoWText
//      object and fetches the next bloc type (stopping at end of file);
//   4. writes that BoWText to out.
//
// NOTE(review): the loop is driven by fileIn.eof(), which is only set once a
// read has hit the end of the stream, and the extractions of blocType are not
// checked. What blocType holds after a failed extraction depends on the
// operator>> overload, which is not visible here — confirm that a trailing
// pass cannot write a spurious empty BoWText.
// NOTE(review): whether successive text.read() calls accumulate into or
// overwrite the BoWText is not visible here — verify against BoWText::read.
void readDocuments(istream& fileIn, ostream& out)
{
  while (! fileIn.eof())
  {
    // enum type with small number of values coded on one byte;
    BoWBlocType blocType;
    fileIn >> blocType;
    if (blocType==DOCUMENT_PROPERTIES_BLOC)
    {
      // Properties are copied through to the output as soon as they are read.
      Lima::Common::Misc::DocumentProperties props;
      props.read(fileIn);
      props.write(out);
      fileIn >> blocType;
    }
    BoWText text;
    while (blocType==BOW_TEXT_BLOC)
    {
      text.read(fileIn);
      // Stop before trying to read a bloc type past the end of the input.
      if (fileIn.eof()) break;
      fileIn >> blocType;
    }
    // Written even when no BOW_TEXT_BLOC was encountered in this pass.
    text.write(out);
  }
Пример #5
0
/**
 * Round-trip test for BoWText binary serialization with a named entity.
 *
 * Builds a BoWText holding a compound term, a simple token and a named
 * entity with two features and two parts, writes it with BoWBinaryWriter
 * into a string stream, reads it back with BoWBinaryReader, then checks the
 * number of elements, the number of parts of the named entity and its "name"
 * feature.
 */
void BagOfWordsTest3::test_BoWText2()
{
  qDebug() << "BagOfWordsTest3::test_BoWText2";
  QString name;

  std::stringstream stream;
  {
    // The original objects live only in this scope, so the checks below can
    // only see what was actually read back from the stream.
    boost::shared_ptr< BoWToken > bt1(new BoWToken(QString::fromUtf8("lemma"), 1, 10, 5));
    bt1->setInflectedForm(QString::fromUtf8("lemma"));
    boost::shared_ptr< BoWToken > bt2(new BoWToken(QString::fromUtf8("démocratique"), 1, 10, 12));
    bt2->setInflectedForm(QString::fromUtf8("démocratique"));
    boost::shared_ptr< BoWToken > bt3(new BoWToken(QString::fromUtf8("word"), 1, 10, 4));
    bt3->setInflectedForm(QString::fromUtf8("word"));

    boost::shared_ptr< BoWToken > bt4(new BoWToken(QString::fromUtf8("autre"), 1, 10, 4));
    bt4->setInflectedForm(QString::fromUtf8("autre"));

    // term = [ bt1, term2 = [ bt2, bt3 ] ]
    boost::shared_ptr< BoWTerm > term(new BoWTerm);
    term->addPart(bt1);
    boost::shared_ptr< BoWTerm > term2(new BoWTerm);
    term2->addPart(bt2);
    term2->addPart(bt3);
    term->addPart(term2);

    boost::shared_ptr< BoWNamedEntity > ne(new BoWNamedEntity);
    ne->setNamedEntityType(EntityType(EntityTypeId(1),EntityGroupId(3)));
    ne->setFeature("surname","John");
    ne->setFeature("name","Doe");
    // Remember the expected feature value before the original entity goes away.
    name = ne->getFeatures().at("name");
    // NOTE(review): the last constructor arguments (5 and 12) look copied
    // from bt1/bt2 and do not match the character counts of "John"/"Doe" —
    // confirm the intended values.
    boost::shared_ptr< BoWToken > btsurname(new BoWToken(QString::fromUtf8("John"), 1, 10, 5));
    // Set the inflected form on the entity's own parts; the previous code
    // overwrote the inflected forms of bt1 and bt2 instead.
    btsurname->setInflectedForm(QString::fromUtf8("John"));
    boost::shared_ptr< BoWToken > btname(new BoWToken(QString::fromUtf8("Doe"), 1, 10, 12));
    btname->setInflectedForm(QString::fromUtf8("Doe"));
    ne->addPart(btsurname);
    ne->addPart(btname);

    BoWText text;
    text.push_back(term);
    text.push_back(bt4);
    text.push_back(ne);

    BoWBinaryWriter writer;
    writer.writeBoWText(stream, text);
  }

  BoWText text;
  BoWBinaryReader reader;
  reader.readBoWText(stream, text);

  QVERIFY( text.size() == 3 );
  boost::shared_ptr<BoWNamedEntity> rne = boost::dynamic_pointer_cast<BoWNamedEntity>(text[2]);
  // dynamic_pointer_cast yields an empty pointer when the element is not a
  // BoWNamedEntity: report a test failure instead of dereferencing null below.
  QVERIFY( rne.get() != 0 );
  QVERIFY( rne->getParts().size() == 2 );

  QVERIFY( name == rne->getFeatures().at("name") );
}