Ejemplo n.º 1
0
std::string EventParagraph::toString(std::string parentURI, uint64_t index, bool main) const
{
  LIMA_UNUSED(parentURI);
  LIMA_UNUSED(index);
  LIMA_UNUSED(main);
  std::ostringstream oss;
  oss << "<rdf:rdf xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\""
      << " xmlns:dc=\"http://purl.org/dc/elements/1.1/\"" << std::endl;
  oss << "<dc:creator>CEA LIST Specific entities extraction service</dc:creator>";
  oss << "</rdf:rdf>";
//   return oss.str();
  
  std::string out;
  for(std::vector<Entity *>::const_iterator iT=m_evententities.second.begin();iT!=m_evententities.second.end();iT++)
  {
    
  }
  
  for(std::map<Common::MediaticData::EntityType,std::vector<Entity *> >::const_iterator iTm=m_otherentities.begin();iTm!=m_otherentities.end();iTm++)
  {
    
    for(std::vector<Entity *>::const_iterator iT=(*iTm).second.begin();iT!=(*iTm).second.end();iT++)
    {
      
    }
  }
  return out;
}
Ejemplo n.º 2
0
/**
 * SAX callback invoked at the opening of each element of the property file.
 *
 * Handled elements:
 *  - "property":    opens a new property description (attribute "name");
 *  - "subproperty": opens a new sub-property description (attributes "name"
 *                   and "parent");
 *  - "subvalues":   inside a sub-property, opens a new group of values keyed
 *                   by attribute "value";
 *  - "value":       appends attribute "name" to the current property, or to
 *                   the last "subvalues" group of the current sub-property.
 *
 * Misplaced elements are reported through LERROR and otherwise ignored.
 *
 * @param namespaceURI unused
 * @param name         local name of the element
 * @param qName        unused
 * @param attributes   attributes of the element
 * @return always true, so that parsing goes on
 */
bool XMLPropertyHandler::startElement(const QString & namespaceURI, const QString & name, const QString & qName, const QXmlAttributes & attributes)
{
    LIMA_UNUSED(namespaceURI);
    LIMA_UNUSED(qName);
    PROPERTYCODELOGINIT;
    const QString& stringName = name;
    if (stringName == "property")
    {
        m_currentProp=PROP;
        m_properties.push_back(PropertyDescription());
        m_properties.back().name = attributes.value("name").toUtf8().data();
        LDEBUG << "read property " << m_properties.back().name;
    }
    else if (stringName == "subproperty")
    {
        m_currentProp=SUBPROP;
        m_subproperties.push_back(SubPropertyDescription());
        m_subproperties.back().name = attributes.value("name").toUtf8().data();
        m_subproperties.back().parentName = attributes.value("parent").toUtf8().data();
        LDEBUG << "read subproperty " << m_subproperties.back().name << " of parent property " << m_subproperties.back().parentName;
    }
    else if (stringName == "value")
    {
        string value=attributes.value("name").toUtf8().data();
        LDEBUG << "read value " << value;
        if (m_currentProp == PROP)
        {
            m_properties.back().values.push_back(value);
        }
        else if (m_currentProp == SUBPROP)
        {
            // A <value> directly under <subproperty>, with no <subvalues>
            // opened before it, has no group to join: calling back() on the
            // empty group list would be undefined behaviour.
            if (m_subproperties.back().values.empty())
            {
                LERROR << "Don't know what to do with value " << value << " : no subvalues group is open !";
            }
            else
            {
                m_subproperties.back().values.back().second.push_back(value);
            }
        }
        else
        {
            LERROR << "Don't know what to do with value " << value << " !";
        }
    }
    else if (stringName == "subvalues")
    {
        string value=attributes.value("value").toUtf8().data();
        LDEBUG << "read subvalues " << value;
        if (m_currentProp == SUBPROP)
        {
            m_subproperties.back().values.push_back(make_pair(value,vector<string>()));
        }
        else
        {
            LERROR << "Don't know what to do with subvalues " << value << " !";
        }
    }
    return true;
}
Ejemplo n.º 3
0
bool ExampleLoader::XMLHandler::startElement(const QString & namespaceURI, const QString & eltName, const QString & qName, const QXmlAttributes & attributes)
{
  LIMA_UNUSED(namespaceURI);
  LIMA_UNUSED(qName);
  //PROCESSORSLOGINIT;
  //LDEBUG << "ExampleLoader::XMLHandler start element "  << eltName;

  if (eltName=="w")
  {
    LinguisticCode posInt=static_cast<const Common::MediaticData::LanguageData&>(Common::MediaticData::MediaticData::single().mediaData(m_language)).getPropertyCodeManager().getPropertyManager("MICRO").getPropertyValue(attributes.value("pos").toStdString());
    m_tagIndex[attributes.value("p").toInt()] = posInt;
  }
  return true;
}
Ejemplo n.º 4
0
bool XMLPropertyHandler::endElement(const QString & namespaceURI, const QString & name, const QString & qName)
{
  LIMA_UNUSED(namespaceURI);
  LIMA_UNUSED(qName);
  const QString& stringName = name;
  if (m_currentProp==PROP && stringName == "property")
  {
    m_currentProp=NONE;
  }
  else if (m_currentProp==SUBPROP && stringName == "subproperty")
  {
    m_currentProp=NONE;
  }
  return true;
}
Ejemplo n.º 5
0
/** @addtogroup ResourceConfiguration
 * - <b>&lt;group name="..." class="SentenceBoundsFinder"&gt;</b>
 */
// NOTE(review): the class named in the group documentation above does not
// match this resource (StopList) -- looks like a copy/paste; confirm and fix.
//
// Loads the stop list from the file named by parameter "file" of the
// configuration group, resolved relative to the resources path. Words are
// read one per line and inserted until an empty line is returned by
// Common::Misc::readLine.
// Throws InvalidConfiguration when the parameter is missing or the file
// cannot be opened. The manager argument is unused.
void StopList::init(
  Common::XMLConfigurationFiles::GroupConfigurationStructure& unitConfiguration,
                     Manager* manager)

{
  LIMA_UNUSED(manager);
  DUMPERLOGINIT;

  string stopListFileName;
  try
  {
    const string& resourcesPath=Common::MediaticData::MediaticData::single().getResourcesPath();
    stopListFileName = resourcesPath;
    stopListFileName += "/";
    stopListFileName += unitConfiguration.getParamsValueAtKey("file");
  }
  catch (Common::XMLConfigurationFiles::NoSuchParam& )
  {
    LERROR << "No param 'file' in StopList configuration group ! ";
    throw InvalidConfiguration();
  }

  std::ifstream stopListFile(stopListFileName.c_str(), std::ifstream::binary);
  if (!stopListFile)
  {
    LERROR << "invalid file " << stopListFileName;
    throw InvalidConfiguration();
  }

  // The first line is fetched before the log entry is written; the loop then
  // stops at the first empty word.
  LimaString entry = Common::Misc::utf8stdstring2limastring(Common::Misc::readLine(stopListFile));
  LDEBUG << "Loading stop list file: " << stopListFileName;
  for ( ; !entry.isEmpty();
        entry = Common::Misc::utf8stdstring2limastring(Common::Misc::readLine(stopListFile)))
  {
    insert(entry);
  }
}
Ejemplo n.º 6
0
bool SpecificEntitiesLoader::XMLHandler::endElement(const QString & namespaceURI, const QString & eltName, const QString & qName)
{
  LIMA_UNUSED(namespaceURI);
  LIMA_UNUSED(qName);
  //LOGINIT("LP::SpecificEntities");
  //LDEBUG << "SpecificEntitiesLoader::XMLHandler end element "  << toString(eltName);
  string name=toString(eltName);
  if (name=="specific_entity") {
    LOGINIT("LP::SpecificEntities");
    LDEBUG << "SpecificEntitiesLoader::XMLHandler add SE "  << m_type << "," << m_position << "," << m_length << "," << m_graph;
    addSpecificEntity(m_analysis, m_graph, m_string, m_type, m_position, m_length);
  }
  // no more current element
  m_currentElement="";
  return true;
}
Ejemplo n.º 7
0
/**
 * Writes the filler accumulated for a tag and resets the tag buffers.
 *
 * The content of @p tag is written to @p output. When @p endSentenceTag is
 * not empty and occurs in the text held by @p tagValue, the first character
 * of the filler is replaced by '.' beforehand. Both @p tag and @p tagValue
 * are emptied afterwards.
 *
 * @param status         unused
 * @param output         stream receiving the filler
 * @param wide           unused
 * @param endSentenceTag tag text marking an end of sentence; may be empty
 * @param tag            filler accumulated for the tag; cleared on return
 * @param tagValue       raw text of the tag; cleared on return
 */
void TextFormater::putTag(std::ostream& status, std::ostream& output, bool wide,
    const std::string& endSentenceTag, std::ostringstream& tag, std::ostringstream& tagValue) const
{
  LIMA_UNUSED(status);
  LIMA_UNUSED(wide);
    LILOGINIT;

    std::string filler = tag.str();

    const bool endsSentence =
        !endSentenceTag.empty()
        && tagValue.str().find(endSentenceTag) != std::string::npos;

    if (endsSentence && !filler.empty())
    {
        filler[0] = '.';
    }
    output << filler;

    tag.str("");
    tagValue.str("");
}
Ejemplo n.º 8
0
/**
 * SAX callback invoked at the opening of each element.
 *
 * Records the name of the element as the current one and, when a
 * "specific_entity" element starts, clears the values stored for the
 * previous entity (string, type, position, length).
 *
 * @param namespaceURI unused
 * @param eltName      local name of the element being opened
 * @param qName        unused
 * @param attributes   unused
 * @return always true
 */
bool SpecificEntitiesLoader::XMLHandler::
startElement(const QString & namespaceURI, const QString & eltName, const QString & qName, const QXmlAttributes & attributes)
{
  LIMA_UNUSED(namespaceURI);
  LIMA_UNUSED(qName);
  LIMA_UNUSED(attributes);

  m_currentElement=toString(eltName);
  if (m_currentElement!="specific_entity")
  {
    return true;
  }

  // a new entity begins: forget what was stored for the previous one
  m_length=0;
  m_position=0;
  m_type="";
  m_string="";
  return true;
}
Ejemplo n.º 9
0
/**
 * Writes the filler standing for a tag that followed a white space, then
 * forwards the character that ended it.
 *
 * The filler is made of @p nb copies of @p fillChar with @p nbNewLines
 * newlines inserted after the first nb/2 of them. When @p endSentenceTag is
 * not empty and occurs in the text held by @p tagValue, the first character
 * of the filler is replaced by '.'. The filler goes to @p output and
 * @p carLu goes to @p carLuTarget.
 *
 * @p nb and @p nbNewLines are received by value: the caller's counters are
 * left untouched by this call.
 *
 * @param status         unused
 * @param output         stream receiving the filler
 * @param wide           unused
 * @param endSentenceTag tag text marking an end of sentence; may be empty
 * @param tagValue       raw text of the tag (not modified)
 * @param carLuTarget    stream receiving @p carLu
 * @param carLu          character to forward after the filler
 * @param fillChar       character used as filler
 * @param nb             number of filler characters
 * @param nbNewLines     number of newlines to insert in the filler
 */
void TextFormater::putWhites(std::ostream& status, std::ostream& output, bool wide,
    const std::string& endSentenceTag, std::ostringstream& tagValue, 
    std::ostream& carLuTarget, char carLu, char fillChar,
    uint64_t nb, uint64_t nbNewLines) const
{
  LIMA_UNUSED(status);
  LIMA_UNUSED(wide);

    const uint64_t leading = nb/2;

    std::string filler(leading, fillChar);
    filler.append(nbNewLines, '\n');
    // second half: nb/2 when nb is even, nb/2 + 1 when it is odd
    filler.append(nb - leading, fillChar);

    const bool endsSentence =
        !endSentenceTag.empty()
        && tagValue.str().find(endSentenceTag) != std::string::npos;

    if (endsSentence && !filler.empty())
    {
        filler[0] = '.';
    }

    output << filler;
    carLuTarget << carLu;
}
Ejemplo n.º 10
0
/**
 * Value of the feature for a vertex of the analysis graph.
 *
 * @param graph    analysis graph holding the vertex
 * @param v        vertex whose token is looked up
 * @param analysis unused
 * @return the position of the token attached to @p v, formatted as text, or
 *         an empty string when the vertex has no token
 */
std::string FeatureStoredData::
getValue(const LinguisticAnalysisStructure::AnalysisGraph* graph, 
         LinguisticGraphVertex v,
         AnalysisContent &analysis) const
{
  LIMA_UNUSED(analysis);

  Token* vertexToken = get(vertex_token, *(graph->getGraph()), v);
  if (vertexToken != 0)
  {
    std::ostringstream formatted;
    formatted << vertexToken->position();
    return formatted.str();
  }
  return "";
}
Ejemplo n.º 11
0
/**
 * Records a semantic relation of type m_semanticRelationType between two
 * vertices.
 *
 * The relation is stored in the "SemanticRelationData" entry of the analysis
 * content; that entry is created and registered on first use.
 *
 * @param anagraph unused
 * @param vertex1  first vertex of the relation
 * @param vertex2  second vertex of the relation
 * @param analysis analysis content holding the relation data
 * @return whatever SemanticRelationData::relation() returns
 */
bool CreateSemanticRelation::
operator()(const LinguisticAnalysisStructure::AnalysisGraph& anagraph,
           const LinguisticGraphVertex& vertex1,
           const LinguisticGraphVertex& vertex2,
           AnalysisContent& analysis ) const
{
  LIMA_UNUSED(anagraph);

  SemanticRelationData* relations =
    static_cast<SemanticRelationData*>(analysis.getData("SemanticRelationData"));

  if (!relations)
  {
    // first relation of this analysis: create the store and register it
    relations = new SemanticRelationData();
    analysis.setData("SemanticRelationData", relations);
  }

  return relations->relation(vertex1, vertex2, m_semanticRelationType);
}
Ejemplo n.º 12
0
/** Consumes the continuation bytes of the UTF-8 sequence introduced by @p lead.
 *
 * The number of expected continuation bytes is derived from the lead byte.
 * Bytes are only consumed while the next byte of @p input really is a
 * continuation byte (10xxxxxx), so malformed or non UTF-8 input never causes
 * unrelated characters to be swallowed. Nothing is consumed for an ASCII
 * character or a stray continuation byte.
 *
 * @param input stream positioned just after @p lead
 * @param lead  first byte of the character, already read from @p input
 */
static void skipUtf8ContinuationBytes(std::istream& input, char lead)
{
    const unsigned char first = static_cast<unsigned char>(lead);
    int expected = 0;
    if ((first & 0xE0) == 0xC0) expected = 1;
    else if ((first & 0xF0) == 0xE0) expected = 2;
    else if ((first & 0xF8) == 0xF0) expected = 3;

    for (; expected > 0; --expected)
    {
        const std::istream::int_type next = input.peek();
        if (next == std::istream::traits_type::eof() || (next & 0xC0) != 0x80)
        {
            break;
        }
        input.get();
    }
}

/** Copies the UTF-8 encoded input stream to the output stream with every
 * SGML tag and entity overwritten by filler characters.
 *
 * Text outside tags is copied unchanged. Inside a tag, each character
 * (a multi-byte UTF-8 character counting for one) is replaced by a space,
 * white spaces and newlines of the tag being kept. Entities ("&name;") are
 * replaced by spaces. When @p endSentenceTag is not empty and occurs in the
 * text of a tag, the first filler character of that tag is replaced by '.'.
 * The removed tags are not recorded.
 *
 * Problems are reported on @p status and reflected in the returned value,
 * processing going on in every case:
 *  - a '<' found inside a tag is replaced like any other tag character;
 *  - a '>' found outside a tag is kept;
 *  - a tag still open at the end of the input is replaced as far as read.
 *
 * As a side effect the LC_CTYPE locale of the process is set to
 * "fr_FR.UTF-8".
 *
 * @param status         stream where errors and warnings are written
 * @param input          input stream; should contain a valid SGML file
 * @param output         stream where the un-tagged result is written
 * @param wide           only forwarded to putTag() and putWhites()
 * @param endSentenceTag tag text marking an end of sentence; may be empty
 * @return 0 if there is no error nor warning; otherwise a non-zero code
 *         identifying the last problem met
 * @throws std::runtime_error in case of an unknown internal state
 */
int TextFormater::untaggingWithSpaces(std::ostream& status, std::istream& input,
        std::ostream& output, bool wide, 
        const std::string& endSentenceTag) const
{
    LILOGINIT;
    // No longer needed by this method itself (UTF-8 sequences are decoded
    // without the C library); kept because it is a process-wide side effect
    // callers may rely on. NOTE(review): confirm and remove if nobody does.
    setlocale(LC_CTYPE,"fr_FR.UTF-8");
    
    size_t nb = 0; // the number of chars in the current tag when we cannot know
                            // if it is bond to the following text (tag after a white space)
    size_t nbNewLines = 0; // the number of newlines in the current tag when we cannot know
                            // if it is bond to the following text (tag after a white space)
    size_t position=0; // number of loop iterations so far, used in messages
    enum RetVal {SUCCESS, INVALID_OPENING_TAG_CHAR, INVALID_CLOSING_TAG_CHAR,
                    DUPLICATED_OPENING_TAG_CHAR, DUPLICATED_CLOSING_TAG_CHAR,
                    UNCLOSED_OPENING_TAG_CHAR};
    RetVal retVal = SUCCESS;
    
    // TEXT: inside a word; BLANC: after a white space;
    // DEBCOL/FINCOL: inside/after a tag stuck to preceding text;
    // DEBBLANC/FINBLANC: inside/after a tag following a white space;
    // BEGENTITY/ENTITY: just after '&' / inside an entity name.
    enum Etat {TEXT, DEBCOL, FINCOL, BLANC, DEBBLANC, FINBLANC, BEGENTITY, ENTITY};
    Etat etat = BLANC;

    char carLu;
    
    std::ostringstream txt;      // pending text of the current word
    std::ostringstream tag;      // pending filler of the current tag (DEBCOL/FINCOL)
    std::ostringstream tagValue; // raw text of the current tag
    std::ostringstream entity;   // raw text of the current entity
    
    // Stop as soon as a read fails, so that no stale character is processed.
    while (input.get(carLu))
    {
        LDEBUG << carLu;
        switch (etat)
        {
            case TEXT:
                switch (carLu)
                {
                    case '<':
                        output << txt.str();
                        txt.str("");
                        // NOTE(review): two filler characters are written for
                        // this single '<' whereas the other states write one;
                        // confirm whether the '_' is intentional.
                        tag << '_';
                        tag << ' ';
                        tagValue << carLu;
                        LDEBUG << "TEXT-> DEBCOL" << LENDL;
                        etat = DEBCOL;
                    break;
                    case '>':
                        txt << carLu;
                        status << "Invalid '>' character at " << position << std::endl;
                        retVal = INVALID_CLOSING_TAG_CHAR;
                    break;
                    case '&':
                        output << txt.str();
                        txt.str("");
                        entity << carLu;
                        LDEBUG << "TEXT -> BEGENTITY;" << LENDL;
                        etat = BEGENTITY;
                    break;
                    case ' ':case '\t':case '\n':
                        output << txt.str();
                        output << carLu;
                        txt.str("");
                        LDEBUG << "TEXT-> BLANC" << LENDL;
                        etat = BLANC;
                    break;
                    default:
                        txt << carLu;
                    break;
                }
            break;
            case BEGENTITY:
                switch (carLu)
                {
                    case 'A':case 'B':case 'C':case 'D':case 'E':case 'F':case 'G':
                    case 'H':case 'I':case 'J':case 'K':case 'L':case 'M':case 'N':
                    case 'O':case 'P':case 'Q':case 'R':case 'S':case 'T':case 'U':
                    case 'V':case 'W':case 'X':case 'Y':case 'Z':
                    case 'a':case 'b':case 'c':case 'd':case 'e':case 'f':case 'g':
                    case 'h':case 'i':case 'j':case 'k':case 'l':case 'm':case 'n':
                    case 'o':case 'p':case 'q':case 'r':case 's':case 't':case 'u':
                    case 'v':case 'w':case 'x':case 'y':case 'z':
                        entity << carLu;
                        LDEBUG << "BEGENTITY-> ENTITY" << LENDL;
                        etat = ENTITY;
                    break;
                    case '<':
                        // NOTE(review): as in TEXT, two characters are written
                        // for the lone '&' and two for the '<'; confirm.
                        output.put('_');
                        output.put(' ');
                        entity.str("");
                        tag << '_';
                        tag << ' ';
                        LDEBUG << "BEGENTITY-> DEBCOL" << LENDL;
                        etat = DEBCOL;
                    break;
                    case ' ':case '\t':case '\n':
                        output.put(' ');
                        output << carLu;
                        entity.str("");
                        LDEBUG << "BEGENTITY-> BLANC" << LENDL;
                        etat = BLANC;
                    break;
                    default:
                        output.put(' ');
                        output << carLu;
                        entity.str("");
                        LDEBUG << "BEGENTITY-> TEXT" << LENDL;
                        etat = TEXT;
                    break;
                }
            break;
            case ENTITY:
                switch (carLu)
                {
                    case 'A':case 'B':case 'C':case 'D':case 'E':case 'F':case 'G':
                    case 'H':case 'I':case 'J':case 'K':case 'L':case 'M':case 'N':
                    case 'O':case 'P':case 'Q':case 'R':case 'S':case 'T':case 'U':
                    case 'V':case 'W':case 'X':case 'Y':case 'Z':
                    case 'a':case 'b':case 'c':case 'd':case 'e':case 'f':case 'g':
                    case 'h':case 'i':case 'j':case 'k':case 'l':case 'm':case 'n':
                    case 'o':case 'p':case 'q':case 'r':case 's':case 't':case 'u':
                    case 'v':case 'w':case 'x':case 'y':case 'z':
                        entity << carLu;
                    break;
                    case '<':
                        // one space for each character of the unfinished entity
                        output << std::string(entity.str().size(), ' ');
                        entity.str("");
                        tag << ' ';
                        LDEBUG << "ENTITY-> DEBCOL" << LENDL;
                        etat = DEBCOL;
                    break;
                    case ' ':case '\t':case '\n':
                        output << std::string(entity.str().size(), ' ');
                        output << carLu;
                        entity.str("");
                        LDEBUG << "ENTITY-> BLANC" << LENDL;
                        etat = BLANC;
                    break;
                    case ';':
                        // the extra space stands for the ';' itself
                        output << std::string(entity.str().size()+1, ' ');
                        entity.str("");
                        LDEBUG << "ENTITY-> TEXT" << LENDL;
                        etat = TEXT;
                    break;
                    default:
                        output << std::string(entity.str().size(), ' ');
                        output << carLu;
                        entity.str("");
                        LDEBUG << "ENTITY-> TEXT" << LENDL;
                        etat = TEXT;
                    break;
                }
            break;
            case DEBCOL:
                tagValue << carLu;
                switch (carLu)
                {
                    case '<':
                        tag << ' ';
                        status << "Invalid '<' character at " << position << std::endl;
                        retVal = DUPLICATED_OPENING_TAG_CHAR;
                    break;
                    case '>':
                        tag << ' ';
                        LDEBUG << "DEBCOL-> FINCOL" << LENDL;
                        etat = FINCOL;
                    break;
                    case ' ':case '\t':case '\n':
                        tag << carLu;
                    break;
                    default:
                        LDEBUG << "Looking at " << carLu << LENDL;
                        // a multi-byte character is replaced by a single space
                        skipUtf8ContinuationBytes(input, carLu);
                        tag << ' ';
                    break;
                }
            break;
            case FINCOL:
                switch (carLu)
                {
                    case '<':
                        tag << ' ';
                        tagValue << carLu;
                        LDEBUG << "FINCOL-> DEBCOL" << LENDL;
                        etat = DEBCOL;
                    break;
                    case '>':
                        tag << ' ';
                        tagValue << carLu;
                        status << "Invalid '>' character at " << position << std::endl;
                        retVal = DUPLICATED_CLOSING_TAG_CHAR;
                    break;
                    case ' ':case '\t':case '\n':
                        putTag(status, output, wide, endSentenceTag, tag, tagValue);
                        output << carLu;
                        LDEBUG << "FINCOL-> BLANC" << LENDL;
                        etat = BLANC;
                    break;
                    case '&':
                        putTag(status, output, wide, endSentenceTag, tag, tagValue);
                        entity << carLu;
                        LDEBUG << "FINCOL -> BEGENTITY;" << LENDL;
                        etat = BEGENTITY;
                    break;
                    default:
                        putTag(status, output, wide, endSentenceTag, tag, tagValue);
                        txt << carLu;
                        tagValue << carLu;
                        LDEBUG << "FINCOL-> TEXT" << LENDL;
                        etat = TEXT;
                    break;
                }
            break;
            case BLANC:
                switch (carLu)
                {
                    case '&':
                        entity << carLu;
                        LDEBUG << "BLANC -> BEGENTITY;" << LENDL;
                        etat = BEGENTITY;
                    break;
                    case '<':
                        nb = 1;
                        nbNewLines = 0;
                        tagValue << carLu;
                        LDEBUG << "BLANC-> DEBBLANC" << LENDL;
                        etat = DEBBLANC;
                    break;
                    case '>':
                        output << '>';
                        status << "Invalid '>' character at " << position << std::endl;
                        retVal = INVALID_CLOSING_TAG_CHAR;
                        LDEBUG << "BLANC-> TEXT" << LENDL;
                        etat = TEXT;
                    break;
                    case ' ':case '\t':case '\n':
                        output << carLu;
                    break;
                    default:
                        txt << carLu;
                        LDEBUG << "BLANC-> TEXT" << LENDL;
                        etat = TEXT;
                    break;
                }
            break;
            case DEBBLANC:
                tagValue << carLu;
                switch (carLu)
                {
                    case '<':
                        nb++;
                        status << "Duplicated '<' character at " << position << std::endl;
                        retVal = DUPLICATED_OPENING_TAG_CHAR;
                    break;
                    case '>':
                        nb++;
                        LDEBUG << "DEBBLANC-> FINBLANC" << LENDL;
                        etat = FINBLANC;
                    break;
                    case '\n':
                        nbNewLines++;
                    break;
                    default:
                        LDEBUG << "Looking at " << carLu << LENDL;
                        // a multi-byte character counts for a single one
                        skipUtf8ContinuationBytes(input, carLu);
                        nb++;
                    break;
                }
            break;
            case FINBLANC:
                switch (carLu)
                {
                    case '<':
                        nb++;
                        tagValue << carLu;
                        LDEBUG << "FINBLANC-> DEBBLANC" << LENDL;
                        etat = DEBBLANC;
                    break;
                    case '>':
                        status << "Duplicated '>' character at " << position << std::endl;
                        retVal = DUPLICATED_CLOSING_TAG_CHAR;
                        putWhites(status, output, wide,endSentenceTag, tagValue,
                                output, carLu, ' ', nb, nbNewLines);
                        LDEBUG << "FINBLANC-> TEXT" << LENDL;
                        etat = TEXT;
                    break;
                    case ' ':case '\t':case '\n':
                        putWhites(status, output, wide,endSentenceTag, tagValue,
                                output, carLu, ' ', nb, nbNewLines);
                        LDEBUG << "FINBLANC-> BLANC" << LENDL;
                        etat = BLANC;
                    break;
                    case '&':
                        putWhites(status, output, wide,endSentenceTag, tagValue,
                                entity, carLu, ' ', nb, nbNewLines);
                        LDEBUG << "FINBLANC -> BEGENTITY;" << LENDL;
                        etat = BEGENTITY;
                    break;
                    default:
                        putWhites(status, output, wide,endSentenceTag, tagValue,
                                txt, carLu, ' ', nb, nbNewLines);
                        LDEBUG << "FINBLANC-> TEXT" << LENDL;
                        etat = TEXT;
                    break;
                }
            break;
            default:
            {
                std::ostringstream message;
                message << "unknown state " << int(etat) << ".\n";
                throw std::runtime_error(message.str());
            }
        }
        ++position;
    }

    // End of input: report a tag left open, then flush whatever is pending so
    // that nothing already read is lost.
    if ( (etat == DEBCOL) || (etat == DEBBLANC) )
    {
        status << "Unclosed tag at EOF (" << position << ")" << std::endl;
        retVal = UNCLOSED_OPENING_TAG_CHAR;
    }

    switch (etat)
    {
        case DEBCOL:
        case FINCOL:
            output << tag.str();
        break;
        case DEBBLANC:
        case FINBLANC:
        {
            // Tags following a white space are counted in nb/nbNewLines, not
            // accumulated in 'tag': rebuild the filler from the counters.
            std::string whites(nb/2, ' ');
            whites.append(nbNewLines, '\n');
            whites.append(nb - nb/2, ' ');
            output << whites;
            nb = 0;
            nbNewLines = 0;
        }
        break;
        case BEGENTITY:
        case ENTITY:
            // unfinished entity: one space for each character read
            output << std::string(entity.str().size(), ' ');
        break;
        case TEXT:
            output << txt.str();
        break;
        case BLANC:
            // nothing pending
        break;
    }

    return int(retVal);
}