Exemplo n.º 1
0
bool EntityGroupTransition::
compare(const LinguisticAnalysisStructure::AnalysisGraph& graph,
        const LinguisticGraphVertex& v,
        AnalysisContent& analysis,
        const LinguisticAnalysisStructure::Token* /*token*/,
        const LinguisticAnalysisStructure::MorphoSyntacticData* /*data*/) const
{
  // should compare to vertex ?
  AnnotationData* annotationData = static_cast< AnnotationData* >(analysis.getData("AnnotationData"));
  if (annotationData==0) {
    AULOGINIT;
    LDEBUG << "EntityGroupTransition::compare: no annotation graph available !";
    return false;
  }

  // find annotationGraphVertex matching the vertex of the current graph
  std::set<AnnotationGraphVertex> matches = annotationData->matches(graph.getGraphId(), v, "annot");
  if (matches.empty())
  {
    AULOGINIT;
    LDEBUG << "annotation ("<<graph.getGraphId()<<", "<<v<<", \"annot\") available";
    return false;
  }
  AnnotationGraphVertex annotVertex = *(matches.begin());

  if (!annotationData->hasAnnotation(annotVertex, m_entityAnnotation))
  {
    AULOGINIT;
    LDEBUG << "EntityGroupTransition::compare: No " << m_entityAnnotation << " annotation available on " << v;
    return false;
  }
  
  const SpecificEntityAnnotation* se =
    annotationData->annotation(annotVertex, m_entityAnnotation).
    pointerValue<SpecificEntityAnnotation>();
  Common::MediaticData::EntityType type = se->getType();
  AULOGINIT;
  LDEBUG << "EntityGroupTransition::compare: type = " << type << ", groupId = " << type.getGroupId();
  LDEBUG << "EntityGroupTransition::compare: m_entityGroupId = " << m_entityGroupId;
  LDEBUG << "EntityGroupTransition::compare: tests m_entityGroupId == type.getGroupId() = " << (m_entityGroupId == type.getGroupId());
  return( m_entityGroupId == type.getGroupId() );
}
Exemplo n.º 2
0
LimaStatusCode SpecificEntitiesXmlLogger::process(
  AnalysisContent& analysis) const
{
  SELOGINIT;
  LDEBUG << "SpecificEntitiesXmlLogger::process";
  TimeUtils::updateCurrentTime();

  AnnotationData* annotationData = static_cast< AnnotationData* >(analysis.getData("AnnotationData"));
  if (annotationData == 0) {
    SELOGINIT;
    LERROR << "no annotationData ! abort";
    return MISSING_DATA;
  }
  
  
  LinguisticAnalysisStructure::AnalysisGraph* graphp = static_cast<LinguisticAnalysisStructure::AnalysisGraph*>(analysis.getData(m_graph));
  if (graphp == 0) {
    SELOGINIT;
    LERROR << "no graph "<< m_graph <<" ! abort";
    return MISSING_DATA;
  }
  const LinguisticAnalysisStructure::AnalysisGraph& graph = *graphp;
  LinguisticGraph* lingGraph = const_cast<LinguisticGraph*>(graph.getGraph());
  VertexTokenPropertyMap tokenMap = get(vertex_token, *lingGraph);
  
  LinguisticMetaData* metadata=static_cast<LinguisticMetaData*>(analysis.getData("LinguisticMetaData"));
  if (metadata == 0) {
      SELOGINIT;
      LERROR << "no LinguisticMetaData ! abort";
      return MISSING_DATA;
  }

  DumperStream* dstream=initialize(analysis);
  ostream& out=dstream->out();

  uint64_t offset(0);
  try {
    offset=atoi(metadata->getMetaData("StartOffset").c_str());
  }
  catch (LinguisticProcessingException& ) {
    // do nothing: not set in analyzeText (only in analyzeXmlDocuments)
  }

  uint64_t offsetIndexingNode(0);
  try {
    offsetIndexingNode=atoi(metadata->getMetaData("StartOffsetIndexingNode").c_str());
  }
  catch (LinguisticProcessingException& ) {
    // do nothing: not set in analyzeText (only in analyzeXmlDocuments)
  }

  std::string docId("");
  try {
    docId=metadata->getMetaData("DocId");
  }
  catch (LinguisticProcessingException& ) {
    // do nothing: not set in analyzeText (only in analyzeXmlDocuments)
  }

  if (m_compactFormat) {
    out << "<entities docid=\"" << docId
    << "\" offsetNode=\"" << offsetIndexingNode 
    << "\" offset=\"" << offset
    << "\">" << endl;
  }
  else {
    out << "<specific_entities>" << endl;
  }
//   SELOGINIT;

  if (m_followGraph) {
    // instead of looking to all annotations, follow the graph (in
    // morphological graph, some vertices are not related to main graph:
    // idiomatic expressions parts and named entity parts)
    // -> this will not include nested entities

    AnalysisGraph* tokenList=static_cast<AnalysisGraph*>(analysis.getData(m_graph));
    if (tokenList==0) {
      LERROR << "graph " << m_graph << " has not been produced: check pipeline";
      return MISSING_DATA;
    }
    LinguisticGraph* graph=tokenList->getGraph();
    //const FsaStringsPool& sp=Common::MediaticData::MediaticData::single().stringsPool(m_language);
    
    std::queue<LinguisticGraphVertex> toVisit;
    std::set<LinguisticGraphVertex> visited;
    toVisit.push(tokenList->firstVertex());
    
    LinguisticGraphOutEdgeIt outItr,outItrEnd;
    while (!toVisit.empty()) {
      LinguisticGraphVertex v=toVisit.front();
      toVisit.pop();
      if (v == tokenList->lastVertex()) {
        continue;
      }
      
      for (boost::tie(outItr,outItrEnd)=out_edges(v,*graph); outItr!=outItrEnd; outItr++) 
      {
        LinguisticGraphVertex next=target(*outItr,*graph);
        if (visited.find(next)==visited.end())
        {
          visited.insert(next);
          toVisit.push(next);
        }
      }
      const SpecificEntityAnnotation* annot=getSpecificEntityAnnotation(v,annotationData);
      if (annot != 0) {
        outputEntity(out,v,annot,tokenMap,offset);
      }
    }
  }
  else {
    // take all annotations
    AnnotationGraphVertexIt itv, itv_end;
    boost::tie(itv, itv_end) = vertices(annotationData->getGraph());
    for (; itv != itv_end; itv++)
    {
      //     LDEBUG << "SpecificEntitiesXmlLogger on annotation vertex " << *itv;
      if (annotationData->hasAnnotation(*itv,Common::Misc::utf8stdstring2limastring("SpecificEntity")))
      {
        //       LDEBUG << "    it has SpecificEntityAnnotation";
        const SpecificEntityAnnotation* annot = 0;
        try
        {
          annot = annotationData->annotation(*itv,Common::Misc::utf8stdstring2limastring("SpecificEntity"))
          .pointerValue<SpecificEntityAnnotation>();
        }
        catch (const boost::bad_any_cast& )
        {
          SELOGINIT;
          LERROR << "This annotation is not a SpecificEntity; SE not logged";
          continue;
        }
        
        // recuperer l'id du vertex morph cree
        LinguisticGraphVertex v;
        if (!annotationData->hasIntAnnotation(*itv,Common::Misc::utf8stdstring2limastring(m_graph)))
        {
          //         SELOGINIT;
          //         LDEBUG << *itv << " has no " << m_graph << " annotation. Skeeping it.";
          continue;
        }
        v = annotationData->intAnnotation(*itv,Common::Misc::utf8stdstring2limastring(m_graph));
        outputEntity(out,v,annot,tokenMap,offset);
      }
    }
  }   
  
  //   LDEBUG << "    all vertices done";
  if (m_compactFormat) {
    out << "</entities>" << endl;
  }
  else {
    out << "</specific_entities>" << endl;
  }
  delete dstream;
  TimeUtils::logElapsedTime("SpecificEntitiesXmlLogger");
  return SUCCESS_ID;
  
}
Exemplo n.º 3
0
LimaStatusCode EntityTracker::process(AnalysisContent& analysis) const
{
  TimeUtils::updateCurrentTime();
  SELOGINIT;

  LinguisticMetaData* metadata=static_cast<LinguisticMetaData*>(analysis.getData("LinguisticMetaData"));
  if (metadata == 0)
  {
    LERROR << "no LinguisticMetaData ! abort" << LENDL;
    return MISSING_DATA;
  }

  AnalysisGraph* anagraph=static_cast<AnalysisGraph*>(analysis.getData("AnalysisGraph"));
  if (anagraph==0)
  {
    LERROR << "no graph 'AnaGraph' available !" << LENDL;
    return MISSING_DATA;
  }

  AnnotationData* annotationData = static_cast< AnnotationData* >(analysis.getData("AnnotationData"));
  if (annotationData==0)
  {
    LERROR << "no annotation graph available !" << LENDL;
    return MISSING_DATA;
  }

  // add new data to store co-references
  CoreferenceData* corefData = new CoreferenceData;
  analysis.setData("CoreferenceData",corefData);
  
  CoreferenceEngine ref;
  LinguisticGraph* graph=anagraph->getGraph();
  LinguisticGraphVertex lastVertex=anagraph->lastVertex();
  LinguisticGraphVertex firstVertex=anagraph->firstVertex();

  std::queue<LinguisticGraphVertex> toVisit;
  std::set<LinguisticGraphVertex> visited;

  LinguisticGraphOutEdgeIt outItr,outItrEnd;

  // output vertices between begin and end,
  // but do not include begin (beginning of text or previous end of sentence) and include end (end of sentence)
  toVisit.push(firstVertex);

  bool first=true;
  bool last=false;
  while (!toVisit.empty()) {
    LinguisticGraphVertex v=toVisit.front();
    toVisit.pop();
    if (last || v == lastVertex) {
      continue;
    }
    if (v == lastVertex) {
      last=true;
    }

    for (boost::tie(outItr,outItrEnd)=out_edges(v,*graph); outItr!=outItrEnd; outItr++)
    {
      LinguisticGraphVertex next=target(*outItr,*graph);
      if (visited.find(next)==visited.end())
      {
        visited.insert(next);
        toVisit.push(next);
      }
    }

    if (first) {
      first=false;
    }
    else {
   // first, check if vertex corresponds to a specific entity
    std::set< AnnotationGraphVertex > matches = annotationData->matches("AnalysisGraph",v,"annot");
    for (std::set< AnnotationGraphVertex >::const_iterator it = matches.begin();
         it != matches.end(); it++)
    {
      AnnotationGraphVertex vx=*it;
      Token* t=get(vertex_token,*graph,vx);
      /* sauvegarde de tous les vertex */
      if (t != 0)
      {
        //storeAllToken(t);
        //allToken.push_back(t);
        ref.storeAllToken(*t);
      }
      if (annotationData->hasAnnotation(vx, Common::Misc::utf8stdstring2limastring("SpecificEntity")))
      {
        /*const SpecificEntityAnnotation* se =
          annotationData->annotation(vx, Common::Misc::utf8stdstring2limastring("SpecificEntity")).
          pointerValue<SpecificEntityAnnotation>();*/
        //storeSpecificEntity(se);
        //Token* t=get(vertex_token,*graph,vx);
        //storedAnnotations.push_back(*t);
        ref.storeAnnot(*t);
//             std::cout<< "le vertex de nom "<< t->stringForm()<<std::endl;
      }
      }
    }
  }

  /* recherche des coréferences entre les entitées nommées précédemment détectées */

  vector<Token> vectTok;
  vector<Token>::const_iterator it1=ref.getAnnotations().begin(), it1_end=ref.getAnnotations().end();
  for (;
       it1 != it1_end;
       it1++)
  {
//     checkCoreference (*it1,ref);
    vectTok = ref.searchCoreference(*it1);
    if (vectTok.size() > 0)
    {
      corefData->push_back(vectTok);
    }
    ref.searchCoreference(*it1);
  }

  /* get the text */
//   LimaStringText* text=static_cast<LimaStringText*>(analysis.getData("Text"));
  
  return SUCCESS_ID;
}
Exemplo n.º 4
0
LimaStatusCode SemanticRelationsXmlLogger::
process(AnalysisContent& analysis) const
{
  TimeUtils::updateCurrentTime();
  
  SEMLOGINIT;
  LERROR << "SemanticRelationsXmlLogger" << LENDL;
    
  AnnotationData* annotationData = static_cast< AnnotationData* >(analysis.getData("AnnotationData"));
  
  const LinguisticAnalysisStructure::AnalysisGraph& graph = 
    *(static_cast<LinguisticAnalysisStructure::AnalysisGraph*>(analysis.getData(m_graph)));
  
  LinguisticGraph* lingGraph = const_cast<LinguisticGraph*>(graph.getGraph());
  VertexTokenPropertyMap tokenMap = get(vertex_token, *lingGraph);
  LinguisticMetaData* metadata=static_cast<LinguisticMetaData*>(analysis.getData("LinguisticMetaData"));
  if (metadata == 0) {
      SEMLOGINIT;
      LERROR << "no LinguisticMetaData ! abort" << LENDL;
      return MISSING_DATA;
  }

  ofstream out;
  if (!openLogFile(out,metadata->getMetaData("FileName"))) {
    SEMLOGINIT;
    LERROR << "Can't open log file " << LENDL;
    return UNKNOWN_ERROR;
  }

  uint64_t offset(0);
  try {
    offset=atoi(metadata->getMetaData("StartOffset").c_str());
  }
  catch (LinguisticProcessingException& e) {
    // do nothing: not set in analyzeText (only in analyzeXmlDocuments)
  }

  uint64_t offsetIndexingNode(0);
  try {
    offsetIndexingNode=atoi(metadata->getMetaData("StartOffsetIndexingNode").c_str());
  }
  catch (LinguisticProcessingException& e) {
    // do nothing: not set in analyzeText (only in analyzeXmlDocuments)
  }

  std::string docId("");
  try {
    docId=metadata->getMetaData("DocId");
  }
  catch (LinguisticProcessingException& e) {
    // do nothing: not set in analyzeText (only in analyzeXmlDocuments)
  }

  out << "<relations docid=\"" << docId
      << "\" offsetNode=\"" << offsetIndexingNode 
      << "\">" << endl;

//   LDEBUG << "SemanticRelationsXmlLogger on graph " << m_graph << LENDL;
  
  //look at all vertices for annotations
  AnnotationGraphVertexIt itv, itv_end;
  boost::tie(itv, itv_end) = vertices(annotationData->getGraph());
  for (; itv != itv_end; itv++)
  {
    LDEBUG << "SemanticRelationsXmlLogger on annotation vertex " << *itv << LENDL;
    if (annotationData->hasAnnotation(*itv,Common::Misc::utf8stdstring2limastring("SemanticAnnotation")))
    {
//       LDEBUG << "    it has SemanticRelationAnnotation" << LENDL;
      const SemanticAnnotation* annot = 0;
      try
      {
        annot = annotationData->annotation(*itv,Common::Misc::utf8stdstring2limastring("SemanticAnnotation"))
          .pointerValue<SemanticAnnotation>();
      }
      catch (const boost::bad_any_cast& e)
      {
        SEMLOGINIT;
        LERROR << "This annotation is not a SemanticRelation" << LENDL;
        continue;
      }

      // output
      out << "<annotation type=\"" << annot->getType() << "\">" << endl
          << vertexStringForSemanticAnnotation("vertex",*itv,tokenMap,annotationData,offset)
          << "</annotation>" << endl;
    }
  }

  // look at all edges for relations
  AnnotationGraphEdgeIt it,it_end;
  const AnnotationGraph& annotGraph=annotationData->getGraph();
  boost::tie(it, it_end) = edges(annotGraph);
  for (; it != it_end; it++) {
    LDEBUG << "SemanticRelationsXmlLogger on annotation edge " 
           << source(*it,annotGraph) << "->" << target(*it,annotationData->getGraph()) << LENDL;
    if (annotationData->hasAnnotation(*it,Common::Misc::utf8stdstring2limastring("SemanticRelation")))
    {
      SEMLOGINIT;
      LDEBUG << "found semantic relation" << LENDL;
      const SemanticRelationAnnotation* annot = 0;
      try
      {
        annot = annotationData->annotation(*it,Common::Misc::utf8stdstring2limastring("SemanticRelation"))
          .pointerValue<SemanticRelationAnnotation>();
      }
      catch (const boost::bad_any_cast& e)
      {
        SEMLOGINIT;
        LERROR << "This annotation is not a SemanticAnnotation" << LENDL;
        continue;
      }

      //output
      out << "<relation type=\"" << annot->type() << "\">" << endl
          << vertexStringForSemanticAnnotation("source",source(*it,annotGraph),tokenMap,annotationData,offset)
          << vertexStringForSemanticAnnotation("target",target(*it,annotGraph),tokenMap,annotationData,offset)
          << "</relation>" << endl;
      
    }
  }

//   LDEBUG << "    all vertices done" << LENDL;
  out << "</relations>" << endl;
  out.close();

  TimeUtils::logElapsedTime("SemanticRelationsXmlLogger");
  return SUCCESS_ID;
}
      LimaStatusCode CorefSolvingNormalizedXmlLogger::process(
        AnalysisContent& analysis) const
      {
//         COREFSOLVERLOGINIT;
        TimeUtils::updateCurrentTime();
        AnnotationData* annotationData = static_cast<AnnotationData*>(analysis.getData("AnnotationData"));
        const LinguisticAnalysisStructure::AnalysisGraph& graph = *(static_cast<LinguisticAnalysisStructure::AnalysisGraph*>(analysis.getData(m_graph)));

//         LinguisticGraph* lingGraph = const_cast<LinguisticGraph*>(graph.getGraph());
        LinguisticMetaData* metadata=static_cast<LinguisticMetaData*>(analysis.getData("LinguisticMetaData"));
        if (metadata == 0)
        {
          COREFSOLVERLOGINIT;
          LERROR << "no LinguisticMetaData ! abort" << LENDL;
          return MISSING_DATA;
        }

        ofstream out;
        if (!openLogFile(out,metadata->getMetaData("FileName")))
        {
          COREFSOLVERLOGINIT;
          LERROR << "Can't open log file " << LENDL;
          return UNKNOWN_ERROR;
        }

        out << "<coreferences>" << endl;


        //   LDEBUG << "CorefSolvingNormalizedXmlLogger on graph " << m_graph << LENDL;
        AnnotationGraphVertexIt itv, itv_end;
        boost::tie(itv, itv_end) = vertices(annotationData->getGraph());
        for (; itv != itv_end; itv++)
        {
          // process
          //LDEBUG << "CorefSolvingNormalizedXmlLogger on annotation vertex " << *itv << LENDL;
          if (annotationData->hasAnnotation(*itv,utf8stdstring2limastring("Coreferent")))
            //if (annotationData->hasAnnotation(*itv,utf8stdstring2limastring("Coreferent")))
          {
            CoreferentAnnotation* annot ;
            try
            {
              annot = annotationData->annotation(*itv,utf8stdstring2limastring("Coreferent"))
                      .pointerValue<CoreferentAnnotation>();
            }
            catch (const boost::bad_any_cast& )
            {
              COREFSOLVERLOGINIT;
              LERROR << "One annotation on vertex " << *itv << " you are trying to cast is not a Coreference; Coreference not logged" << LENDL;
              for (int i = 0; i < 19 ; i++)
              {
                LERROR << "annot "<< i << " : " << limastring2utf8stdstring(annotationData->annotationName(i)) << LENDL ;
              }
              continue;
            }
            LinguisticProcessing::LinguisticAnalysisStructure::Token* token = get(vertex_token, *graph.getGraph(), annot->morphVertex());
            if (token == 0)
            {
              COREFSOLVERLOGINIT;
              LERROR << "Vertex " << *itv << " has no entry in the analysis graph token map. This should not happen !!" << LENDL;
            }
            else
            {
              CoreferentAnnotation* antecedent;
//               bool hasAntecedent = false;
              AnnotationGraphOutEdgeIt it, it_end;
              boost::tie(it, it_end) = boost::out_edges(static_cast<AnnotationGraphVertex>(*itv), annotationData->getGraph());

              for (; it != it_end; it++)
              {
                if (annotationData->hasAnnotation(target(*it,annotationData->getGraph()),utf8stdstring2limastring("Coreferent")))
                {
                  try
                  {
                    antecedent = annotationData->annotation(target(*it, annotationData->getGraph()), utf8stdstring2limastring("Coreferent")).pointerValue<CoreferentAnnotation>();
//                     hasAntecedent = true;
                  }
                  catch (const boost::bad_any_cast& )
                  {
                    COREFSOLVERLOGINIT;
                    LERROR << "One annotation on vertex you are trying to cast resulting from an edge out of " << *itv << " is not a Coreference; Coreference not logged" << LENDL;
                    continue;
                  }
                }
              }
              out << "  <reference>\n"
              << "    <pos>" << get(vertex_token,*graph.getGraph(),annot->morphVertex())->position() << "</pos>\n"
              << "    <len>" << token->stringForm().length() << "</len>\n"
              << "    <string>"<< limastring2utf8stdstring(transcodeToXmlEntities(token->stringForm())) << "</string>\n"
              << "    <npId>" << annot->id() << "</npId>\n"
              << "    <posVertex>" << annot->morphVertex() << "</posVertex>\n";
              //if (hasAntecedent)
              if (false)
              {
                out << "    <npRef>" << antecedent->id() << "</npRef>\n";
                out << "    <refPosVertex>" << antecedent->morphVertex() << "</refPosVertex>\n";
              }
              out << "    <categ>" << annot->categ() << "</categ>\n"
                    << "  </reference>\n"
              << endl;
            }
          }
        }
        out << "</coreferences>" << endl;
        out.close();

        TimeUtils::logElapsedTime("CorefSolvingNormalizedXmlLogger");
        return SUCCESS_ID;

      }