示例#1
0
/**
 * Graph-visitor callback invoked for each examined edge.
 *
 * Looks at the target vertex of @p e: when it carries a "WordSense"
 * annotation, that annotation is dumped as XML on the output stream;
 * otherwise the surface form of the vertex token (if any) is written.
 * A single space is always appended afterwards.
 *
 * @param e edge being examined
 * @param g graph the edge belongs to
 */
void DumpXMLAnnotationVisitor::examine_edge(LinguisticGraphEdge e,
                                     const LinguisticGraph& g)
{
  LinguisticGraphVertex targetVertex = target(e, g);

  if (!m_ad->hasAnnotation(targetVertex, Common::Misc::utf8stdstring2limastring("WordSense")))
  {
    // No word sense annotation: fall back to the plain token text.
    Token* vertexToken = get(vertex_token, g, targetVertex);
    if (vertexToken != 0)
    {
      m_ostream << Common::Misc::limastring2utf8stdstring(vertexToken->stringForm());
    }
  }
  else
  {
    GenericAnnotation annotation = (m_ad->annotation(targetVertex,utf8stdstring2limastring("WordSense")));
    try
    {
      Lima::LinguisticProcessing::WordSenseDisambiguation::WordSenseAnnotation senseAnnotation;
      senseAnnotation = annotation.value<Lima::LinguisticProcessing::WordSenseDisambiguation::WordSenseAnnotation>();
      senseAnnotation.outputXml(m_ostream,g);
    }
    catch (const boost::bad_any_cast&)
    {
      // The annotation stored under "WordSense" has another type:
      // report it and emit nothing for this vertex.
      LOGINIT("WordSenseDisambiguator");
      LERROR << "non word sense annotation";
    }
  }

  // Separator written after every examined edge.
  m_ostream << " ";
}
示例#2
0
/**
 * Builds the client from the parsed XML configuration.
 *
 * A DocumentReader is created from the "lp-structuredXmlreaderclient"
 * module section and this object is registered as its linguistic XML
 * document handler. The set of characters stored in m_emptyTextChars is
 * space, tab and newline.
 *
 * @param configuration parser giving access to the module configurations
 */
CoreXmlReaderClient::CoreXmlReaderClient(Lima::Common::XMLConfigurationFiles::XMLConfigurationFileParser &configuration) :
/*m_delegate(0),*/m_handler(0)
{
#ifdef DEBUG_LP
    XMLREADERCLIENTLOGINIT;
    LDEBUG << "CoreXmlReaderClient::CoreXmlReaderClient";
#endif
    // The reader gets the module section directly; the reference returned by
    // the parser is handed over unchanged.
    m_documentReader = new DocumentsReader::DocumentReader(
        configuration.getModuleConfiguration("lp-structuredXmlreaderclient"));
    m_documentReader->setLinguisticXMLDocHandler(this);

    m_emptyTextChars = utf8stdstring2limastring(" \t\n");
}
示例#3
0
文件: Text.cpp 项目: FaizaGara/lima
// Takes a token: cuts the text between _debPtr and the current position,
// appends it as a new vertex at the end of the token graph path and returns
// its string. When no text is left (_debPtr at or past the end of m_text) an
// error is logged and an empty string is returned. In every case _debPtr is
// advanced to the end of the consumed span and the current settings are reset.
LimaString Text::token()
{
  TOKENIZERLOGINIT;
  // Creates a new token.
  // The token ends at _curPtr; the end is pushed one position further when
  // the token would otherwise be empty, or when the current code unit is at
  // or above 0xD800 (where the UTF-16 surrogate range begins).
  // _curPtr is bounds-checked before indexing so that the end-of-text case
  // never reads past the end of m_text.
  uint64_t delta = _curPtr;
  if (_curPtr == _debPtr || (_curPtr < m_text.size() && m_text[_curPtr] >= 0xD800))
  {
    delta++;
  }
  if (_debPtr >= m_text.size())
  {
    LERROR << "Empty token !";
    _debPtr = delta;
    _curSettings.reset();
    return utf8stdstring2limastring("");
  }
  LimaString str=m_text.mid( _debPtr, (delta-_debPtr));
  LDEBUG << "      Adding token '" << str << "'";
  StringsPoolIndex form=(*_stringsPool)[str];
  // Plain new reports failure by throwing std::bad_alloc, so the pointer is
  // never null here and no explicit null test is needed.
  // The third argument is _debPtr+1 (presumably a 1-based position - TODO confirm).
  Token *tToken = new Token(form,str,_debPtr+1,(delta-_debPtr));
  // @todo: set default status here, according to structured status (alpha,numeric etc...)
  // instead of setting it at each change of status (setAlphaCapital, setNumeric etc...)
  tToken->setStatus(_curSettings);
//   LDEBUG << "      _curSettings is " << _curSettings.toString();
  LDEBUG << "      status is " << tToken->status().toString();
  // Adds on the path: new vertex carrying the token and an empty
  // morphosyntactic data, linked after the current vertex.
  LinguisticGraphVertex newVx=add_vertex(*_tTokenGraph);
  put(vertex_token,*_tTokenGraph,newVx,tToken);
  put(vertex_data,*_tTokenGraph,newVx,new MorphoSyntacticData());
  add_edge(_currentVx,newVx,*_tTokenGraph);
  _currentVx=newVx;
  // The next token starts where this one ended.
  _debPtr = delta;
  _curSettings.reset();
  return str;
}
示例#4
0
/**
 * Graph-visitor callback invoked for each examined edge.
 *
 * Writes on m_ostream either the XML form of the "Coreferent" annotation
 * attached to the target vertex of @p e, or the surface form of its token,
 * followed by a single space. Nothing is written when the target vertex has
 * no outgoing edge, when it is the same as (or equivalent to) the target of
 * the last edge that produced a token, or when no annotation graph vertex
 * matches it.
 *
 * Two textual adjustments are applied to plain tokens:
 *  - "s'y introduire" tokenized as "y s'introduire": the "s'" is written
 *    before "en"/"y" and removed from the following token;
 *  - forms containing '_': only the part before the first '_' is written,
 *    the remainder is kept in m_memo and written before the next token.
 *
 * @param e edge being examined
 * @param g graph the edge belongs to
 */
void DumpXMLAnnotationVisitor::examine_edge(LinguisticGraphEdge e,
                                     const LinguisticGraph& g)
{
  COREFSOLVERLOGINIT;
  LDEBUG << "DumpXMLAnnotationVisitor::examine_edge";
  LinguisticGraphVertex v = target(e, g);

  // let process sentences like (...) have automatically tuned (...) where the graph has one token "have_tuned" with one branch "automatically" "tuned" and another one with the following of the sentence
  LinguisticGraphOutEdgeIt firstOut, outEnd;
  boost::tie(firstOut, outEnd) = boost::out_edges(v,g);
  if (firstOut == outEnd)
    return;

  // let process sentences where one tag has not been fully determined and there is still two (or more) tag options
  // The previous edge is only consulted once one has actually been stored:
  // the target of a default-constructed edge descriptor is meaningless.
  if (m_lastEdge != LinguisticGraphEdge())
  {
    LinguisticGraphVertex lastTarget = target(m_lastEdge, g);
    if (lastTarget == v || are_equivalent(e, lastTarget, v, g))
      return;
  }

  // store this edge for the future tests
  Token* token = get(vertex_token, g, v);
  if (token != 0)
    m_lastEdge = e;

  // processing of cases like "s'y introduire", tokenized as "y s'introduire":
  // write the reflexive "s'" before the "en"/"y" token.
  if (token != 0 && (token->stringForm() == "en" || token->stringForm() == "y"))
  {
    // firstOut is valid here: the empty out-edge range was rejected above.
    Token* following = get(vertex_token, g, target(*firstOut, g));
    if (following != 0 && Common::Misc::limastring2utf8stdstring(following->stringForm()).substr(0,2) == "s'")
    {
      m_ostream << "s'";
    }
  }

  // process
  std::set< AnnotationGraphVertex > matches = m_ad->matches("PosGraph",v,"annot");
  if (matches.empty())
  {
    LERROR << "DumpXMLAnnotationVisitor::examine_edge No annotation graph vertex matches PoS graph vertex " << v <<  ". This should not happen.";
    return;
  }
  AnnotationGraphVertex av = *matches.begin();

  if (m_ad->hasAnnotation(av, Common::Misc::utf8stdstring2limastring("Coreferent")))
  {
    GenericAnnotation ga = (m_ad->annotation(av,utf8stdstring2limastring("Coreferent")));
    Lima::LinguisticProcessing::Coreferences::CoreferentAnnotation ca;
    try
    {
      ca = ga.value<Lima::LinguisticProcessing::Coreferences::CoreferentAnnotation>();
      ca.outputXml(m_ostream,g,m_ad);
    }
    catch (const boost::bad_any_cast& )
    {
      LERROR << "non coreferent annotation"<< LENDL;
    }
  }
  else if (token != 0)
  {
    std::string s = Common::Misc::limastring2utf8stdstring(token->stringForm());

    // Second half of the "s'y introduire" handling: drop the "s'" that was
    // already written before the previous token. The previous token must be
    // exactly "en" or "y" (same test as the one that wrote the "s'"), so that
    // words merely starting with "en" do not make the "s'" disappear.
    if (s.substr(0,2) == "s'")
    {
      Token* previous = get(vertex_token,g,source(e, g));
      if (previous != 0 && (previous->stringForm() == "en" || previous->stringForm() == "y"))
      {
        s = s.substr(2);
      }
    }

    // processing of cases like "le Canada a-t-il envisagé...", where the word
    // between "a" and "envisagé" would end up after "a_envisagé". This must be
    // handled because it breaks the evaluation when that word is a clitic
    // pronoun, as in this example: the part after the first '_' is delayed
    // until the next token is written.
    const std::string formerMemo = m_memo;
    const std::string::size_type underscore = s.find('_');
    if (underscore != std::string::npos)
    {
      m_memo = s.substr(underscore + 1) + " ";
      s.erase(underscore);
    }
    else
    {
      m_memo = "";
    }

    m_ostream << formerMemo << s;
    if (token->status().isAlphaPossessive())
    {
      m_ostream << "'s ";
    }
  }
  m_ostream << " ";
}
      /**
       * Writes every "Coreferent" annotation of the annotation graph to the
       * log file as a <reference> element inside a <coreferences> root.
       *
       * @param analysis analysis content providing "LinguisticMetaData",
       *        "AnnotationData" and the analysis graph named by m_graph
       * @return MISSING_DATA when one of these data is absent, UNKNOWN_ERROR
       *         when the log file cannot be opened, SUCCESS_ID otherwise
       */
      LimaStatusCode CorefSolvingNormalizedXmlLogger::process(
        AnalysisContent& analysis) const
      {
//         COREFSOLVERLOGINIT;
        TimeUtils::updateCurrentTime();

        LinguisticMetaData* metadata=static_cast<LinguisticMetaData*>(analysis.getData("LinguisticMetaData"));
        if (metadata == 0)
        {
          COREFSOLVERLOGINIT;
          LERROR << "no LinguisticMetaData ! abort" << LENDL;
          return MISSING_DATA;
        }

        // Both pointers are dereferenced below: refuse to go on without them
        // instead of crashing after the log file has been created.
        AnnotationData* annotationData = static_cast<AnnotationData*>(analysis.getData("AnnotationData"));
        if (annotationData == 0)
        {
          COREFSOLVERLOGINIT;
          LERROR << "no AnnotationData ! abort" << LENDL;
          return MISSING_DATA;
        }

        const LinguisticAnalysisStructure::AnalysisGraph* graphData = static_cast<LinguisticAnalysisStructure::AnalysisGraph*>(analysis.getData(m_graph));
        if (graphData == 0)
        {
          COREFSOLVERLOGINIT;
          LERROR << "no analysis graph ! abort" << LENDL;
          return MISSING_DATA;
        }
        const LinguisticAnalysisStructure::AnalysisGraph& graph = *graphData;

        ofstream out;
        if (!openLogFile(out,metadata->getMetaData("FileName")))
        {
          COREFSOLVERLOGINIT;
          LERROR << "Can't open log file " << LENDL;
          return UNKNOWN_ERROR;
        }

        out << "<coreferences>" << endl;

        AnnotationGraphVertexIt itv, itv_end;
        boost::tie(itv, itv_end) = vertices(annotationData->getGraph());
        for (; itv != itv_end; ++itv)
        {
          // Only vertices carrying a "Coreferent" annotation are logged.
          if (!annotationData->hasAnnotation(*itv,utf8stdstring2limastring("Coreferent")))
          {
            continue;
          }

          CoreferentAnnotation* annot = 0;
          try
          {
            annot = annotationData->annotation(*itv,utf8stdstring2limastring("Coreferent"))
                    .pointerValue<CoreferentAnnotation>();
          }
          catch (const boost::bad_any_cast& )
          {
            COREFSOLVERLOGINIT;
            LERROR << "One annotation on vertex " << *itv << " you are trying to cast is not a Coreference; Coreference not logged" << LENDL;
            // Diagnostic dump of the first 19 annotation names.
            for (int i = 0; i < 19 ; i++)
            {
              LERROR << "annot "<< i << " : " << limastring2utf8stdstring(annotationData->annotationName(i)) << LENDL ;
            }
            continue;
          }

          LinguisticProcessing::LinguisticAnalysisStructure::Token* token = get(vertex_token, *graph.getGraph(), annot->morphVertex());
          if (token == 0)
          {
            COREFSOLVERLOGINIT;
            LERROR << "Vertex " << *itv << " has no entry in the analysis graph token map. This should not happen !!" << LENDL;
            continue;
          }

          // Look for an antecedent among the targets of the outgoing edges.
          // Its output is currently disabled (see below), but the lookup is
          // kept so that ill-typed annotations are still reported.
          CoreferentAnnotation* antecedent = 0;
          AnnotationGraphOutEdgeIt it, it_end;
          boost::tie(it, it_end) = boost::out_edges(static_cast<AnnotationGraphVertex>(*itv), annotationData->getGraph());
          for (; it != it_end; ++it)
          {
            if (annotationData->hasAnnotation(target(*it,annotationData->getGraph()),utf8stdstring2limastring("Coreferent")))
            {
              try
              {
                antecedent = annotationData->annotation(target(*it, annotationData->getGraph()), utf8stdstring2limastring("Coreferent")).pointerValue<CoreferentAnnotation>();
              }
              catch (const boost::bad_any_cast& )
              {
                COREFSOLVERLOGINIT;
                LERROR << "One annotation on vertex you are trying to cast resulting from an edge out of " << *itv << " is not a Coreference; Coreference not logged" << LENDL;
                continue;
              }
            }
          }

          out << "  <reference>\n"
          << "    <pos>" << token->position() << "</pos>\n"
          << "    <len>" << token->stringForm().length() << "</len>\n"
          << "    <string>"<< limastring2utf8stdstring(transcodeToXmlEntities(token->stringForm())) << "</string>\n"
          << "    <npId>" << annot->id() << "</npId>\n"
          << "    <posVertex>" << annot->morphVertex() << "</posVertex>\n";
          // Antecedent output is deliberately switched off in this logger.
          if (false)
          {
            out << "    <npRef>" << antecedent->id() << "</npRef>\n";
            out << "    <refPosVertex>" << antecedent->morphVertex() << "</refPosVertex>\n";
          }
          out << "    <categ>" << annot->categ() << "</categ>\n"
                << "  </reference>\n"
          << endl;
        }
        out << "</coreferences>" << endl;
        out.close();

        TimeUtils::logElapsedTime("CorefSolvingNormalizedXmlLogger");
        return SUCCESS_ID;

      }
示例#6
0
/**
 * Dumps, through the analysis handler named by m_handler, one line per token
 * of the graph named by m_graph. Each line holds, separated by m_sep: the
 * token position (only when m_printPosition is set), the token string with
 * every occurrence of m_sep replaced by '_', the symbolic values of the
 * property read by m_propertyAccessor (separated by m_sepPOS) and, when the
 * token has a "GeoEntity" annotation, its position and its geo classes.
 *
 * @param analysis analysis content providing "LinguisticMetaData",
 *        "AnnotationData", "AnalysisHandlerContainer" and the graph m_graph
 * @return MISSING_DATA when one of these data (or the handler) is absent,
 *         SUCCESS_ID otherwise
 */
LimaStatusCode GeoDumper::process(
  AnalysisContent& analysis) const
{
  DUMPERLOGINIT;
  LDEBUG << "Process GeoDumper ";
  LinguisticMetaData* metadata=static_cast<LinguisticMetaData*>(analysis.getData("LinguisticMetaData"));
  if (metadata == 0) {
      LERROR << "GeoDumper::process: no LinguisticMetaData ! abort";
      return MISSING_DATA;
  }

  Lima::Common::AnnotationGraphs::AnnotationData* annotationData = static_cast< Lima::Common::AnnotationGraphs::AnnotationData* >(analysis.getData("AnnotationData"));
  if (annotationData==0)
  {
    LERROR << "GeoDumper::process: no AnnotationData ! abort";
    return MISSING_DATA;
  }

  // The graph is fetched and checked before the handler is started, so that
  // a missing graph never leaves an analysis started but not ended.
  AnalysisGraph* anagraph=static_cast<AnalysisGraph*>(analysis.getData(m_graph));
  if (anagraph==0)
  {
    LERROR << "GeoDumper::process: no analysis graph ! abort";
    return MISSING_DATA;
  }
  LinguisticGraph* graph=anagraph->getGraph();

  LDEBUG << "handler will be: " << m_handler;
  AnalysisHandlerContainer* h = static_cast<AnalysisHandlerContainer*>(analysis.getData("AnalysisHandlerContainer"));
  if (h == 0)
  {
    LERROR << "GeoDumper::process: no handler in analysisContent ! abort";
    return MISSING_DATA;
  }
  AbstractTextualAnalysisHandler* handler = static_cast<AbstractTextualAnalysisHandler*>(h->getHandler(m_handler));
  if (handler==0)
  {
    LERROR << "GeoDumper::process: handler " << m_handler << " has not been given to the core client";
    return MISSING_DATA;
  }

  handler->startAnalysis();
  HandlerStreamBuf hsb(handler);
  std::ostream out(&hsb);

  // For each token (ordered by lTokenPosition): the last graph vertex seen
  // for it and the morphosyntactic data of all its vertices.
  typedef pair<LinguisticGraphVertex,vector<MorphoSyntacticData*> > VertexAndData;
  typedef map<Token*, VertexAndData, lTokenPosition > TokenCategories;
  TokenCategories categoriesMapping;

  ltNormProperty sorter(m_propertyAccessor);

  LinguisticGraphVertexIt vxItr,vxItrEnd;
  boost::tie(vxItr,vxItrEnd) = vertices(*graph);
  for (;vxItr!=vxItrEnd;++vxItr)
  {
    Token* ft=get(vertex_token,*graph,*vxItr);
    if( ft!=0)
    {
      // Update the entry in place (created on first access).
      VertexAndData& element = categoriesMapping[ft];
      element.second.push_back(get(vertex_data,*graph,*vxItr));
      element.first=*vxItr;
    }
  }

  for (TokenCategories::const_iterator ftItr=categoriesMapping.begin();
       ftItr!=categoriesMapping.end();
       ++ftItr)
  {

    Token* ft=ftItr->first;
    std::ostringstream os;
    // get position
    uint64_t position=ft->position() + metadata->getStartOffset();
    // get string
    std::string str=Common::Misc::limastring2utf8stdstring(ft->stringForm());
    // replace separator in string by '_'
    // (an empty separator is found at every position and would make the
    // loop insert '_' forever, so it is skipped)
    if (!m_sep.empty())
    {
      const string::size_type sepLen=m_sep.size();
      string::size_type p=0;
      while ( (p = str.find(m_sep, p)) != string::npos ) {
        str.replace( p, sepLen, "_");
        p++;
      }
    }

    if (m_printPosition) {
      os << position << m_sep;
    }
    os << str << m_sep;

    // POS
    std::set<LinguisticCode> props;
    const vector<MorphoSyntacticData*>& vt=ftItr->second.second;
    for (vector<MorphoSyntacticData*>::const_iterator dataItr=vt.begin();
           dataItr!=vt.end();
           ++dataItr)
    {
        MorphoSyntacticData* data=*dataItr;
        sort(data->begin(),data->end(),sorter);
        LinguisticCode prop(0);

        // output first
        MorphoSyntacticData::const_iterator elemIt=data->begin(),elemIt_end=data->end();
        if(elemIt != elemIt_end)
        {
          prop=m_propertyAccessor->readValue(elemIt->properties);
          os << m_propertyManager->getPropertySymbolicValue(prop);
          props.insert(prop);

          // output rest, with separator, skipping values already written
          for ( ++elemIt; elemIt!=elemIt_end; ++elemIt)
          {
            prop=m_propertyAccessor->readValue(elemIt->properties);
            if (props.find(prop)==props.end()) {
              os << m_sepPOS << m_propertyManager->getPropertySymbolicValue(prop);
              props.insert(prop);
            }
          }
        }
    }

    // Geo entity information, read from the annotation vertex matching the
    // token vertex. The match set may be empty: its first element is only
    // read after checking that it exists.
    std::set< AnnotationGraphVertex > matches = annotationData->matches(anagraph->getGraphId(),(ftItr->second).first,"annot");
    if (matches.empty())
    {
      LDEBUG << "GeoDumper::process: no annotation graph vertex matches graph vertex " << (ftItr->second).first;
    }
    else if (annotationData->hasAnnotation(*matches.begin(),utf8stdstring2limastring("GeoEntity")))
    {
      os << m_sep;
      os << annotationData->annotation(*matches.begin(),Common::Misc::utf8stdstring2limastring("GeoEntity"))
        .pointerValue<GeoEntityAnnotation>()->getPosition();
      os << m_sep;
      std::set<std::string> classes=annotationData->annotation(*matches.begin(),Common::Misc::utf8stdstring2limastring("GeoEntity"))
        .pointerValue<GeoEntityAnnotation>()->getGeoClasses();
      // classes separated by single spaces, no trailing space
      for (std::set<std::string>::iterator iT = classes.begin(); iT!=classes.end();)
      {
        os << *iT;
        ++iT;
        if (iT!=classes.end()) os << " ";
      }
    }
    out << os.str();
    out << endl;
  }

  out.flush();
  handler->endAnalysis();

  return SUCCESS_ID;
}