Esempio n. 1
0
/**
 * Tests whether the vertex @p v carries a specific-entity annotation whose
 * entity type belongs to the group m_entityGroupId.
 *
 * The annotation vertex is looked up through the "annot" matching of the
 * analysis graph; when several annotation vertices match, only the first one
 * of the returned set is considered.
 *
 * @param graph    analysis graph the vertex belongs to (only its id is used)
 * @param v        vertex of @p graph to test
 * @param analysis analysis content expected to hold "AnnotationData"
 * @return true when the annotation m_entityAnnotation exists on the matching
 *         annotation vertex and its type's group id equals m_entityGroupId;
 *         false in every other case (missing data, no match, no annotation,
 *         annotation value not readable as a SpecificEntityAnnotation)
 */
bool EntityGroupTransition::
compare(const LinguisticAnalysisStructure::AnalysisGraph& graph,
        const LinguisticGraphVertex& v,
        AnalysisContent& analysis,
        const LinguisticAnalysisStructure::Token* /*token*/,
        const LinguisticAnalysisStructure::MorphoSyntacticData* /*data*/) const
{
  // one logger declaration for the whole function instead of one per branch
  AULOGINIT;

  // should compare to vertex ?
  AnnotationData* annotationData = static_cast< AnnotationData* >(analysis.getData("AnnotationData"));
  if (annotationData==0)
  {
    LDEBUG << "EntityGroupTransition::compare: no annotation graph available !";
    return false;
  }

  // find annotationGraphVertex matching the vertex of the current graph
  std::set<AnnotationGraphVertex> matches = annotationData->matches(graph.getGraphId(), v, "annot");
  if (matches.empty())
  {
    // the previous message claimed the annotation was "available" on this path
    LDEBUG << "EntityGroupTransition::compare: no annotation ("<<graph.getGraphId()<<", "<<v<<", \"annot\") available";
    return false;
  }
  AnnotationGraphVertex annotVertex = *(matches.begin());

  if (!annotationData->hasAnnotation(annotVertex, m_entityAnnotation))
  {
    LDEBUG << "EntityGroupTransition::compare: No " << m_entityAnnotation << " annotation available on " << v;
    return false;
  }

  const SpecificEntityAnnotation* se =
    annotationData->annotation(annotVertex, m_entityAnnotation).
    pointerValue<SpecificEntityAnnotation>();
  if (se==0)
  {
    // guard against an annotation whose value cannot be read as a
    // SpecificEntityAnnotation rather than dereferencing a null pointer
    LDEBUG << "EntityGroupTransition::compare: " << m_entityAnnotation << " annotation on " << v
           << " is not a SpecificEntityAnnotation";
    return false;
  }

  Common::MediaticData::EntityType type = se->getType();
  const bool sameGroup = ( m_entityGroupId == type.getGroupId() );
  LDEBUG << "EntityGroupTransition::compare: type = " << type << ", groupId = " << type.getGroupId();
  LDEBUG << "EntityGroupTransition::compare: m_entityGroupId = " << m_entityGroupId;
  LDEBUG << "EntityGroupTransition::compare: tests m_entityGroupId == type.getGroupId() = " << sameGroup;
  return sameGroup;
}
Esempio n. 2
0
/**
 * Collects the tokens of the analysis graph into a CoreferenceEngine, then
 * searches co-references for every token that carries a "SpecificEntity"
 * annotation and stores the non-empty results in a new "CoreferenceData"
 * entry of @p analysis.
 *
 * @param analysis analysis content; must provide "LinguisticMetaData",
 *                 "AnalysisGraph" and "AnnotationData"
 * @return MISSING_DATA when one of the required data is absent,
 *         SUCCESS_ID otherwise
 */
LimaStatusCode EntityTracker::process(AnalysisContent& analysis) const
{
  TimeUtils::updateCurrentTime();
  SELOGINIT;

  // only the presence of the metadata is checked; it is not used afterwards
  LinguisticMetaData* metadata=static_cast<LinguisticMetaData*>(analysis.getData("LinguisticMetaData"));
  if (metadata == 0)
  {
    LERROR << "no LinguisticMetaData ! abort" << LENDL;
    return MISSING_DATA;
  }

  AnalysisGraph* anagraph=static_cast<AnalysisGraph*>(analysis.getData("AnalysisGraph"));
  if (anagraph==0)
  {
    LERROR << "no graph 'AnaGraph' available !" << LENDL;
    return MISSING_DATA;
  }

  AnnotationData* annotationData = static_cast< AnnotationData* >(analysis.getData("AnnotationData"));
  if (annotationData==0)
  {
    LERROR << "no annotation graph available !" << LENDL;
    return MISSING_DATA;
  }

  // add new data to store co-references
  CoreferenceData* corefData = new CoreferenceData;
  analysis.setData("CoreferenceData",corefData);

  CoreferenceEngine ref;
  LinguisticGraph* graph=anagraph->getGraph();
  LinguisticGraphVertex lastVertex=anagraph->lastVertex();
  LinguisticGraphVertex firstVertex=anagraph->firstVertex();

  std::queue<LinguisticGraphVertex> toVisit;
  std::set<LinguisticGraphVertex> visited;

  LinguisticGraphOutEdgeIt outItr,outItrEnd;

  // Breadth-first walk from the first vertex. The first vertex is expanded
  // but not inspected; the last vertex is neither expanded nor inspected.
  // NOTE(review): an earlier comment said the end vertex should be included,
  // but the code has always skipped it; the flag that was meant to handle it
  // could never be set and has been removed.
  toVisit.push(firstVertex);

  bool first=true;
  while (!toVisit.empty())
  {
    LinguisticGraphVertex v=toVisit.front();
    toVisit.pop();
    if (v == lastVertex)
    {
      continue;
    }

    // enqueue every successor that has not been seen yet
    for (boost::tie(outItr,outItrEnd)=out_edges(v,*graph); outItr!=outItrEnd; ++outItr)
    {
      LinguisticGraphVertex next=target(*outItr,*graph);
      if (visited.find(next)==visited.end())
      {
        visited.insert(next);
        toVisit.push(next);
      }
    }

    if (first)
    {
      first=false;
      continue;
    }

    // check if the vertex corresponds to a specific entity
    std::set< AnnotationGraphVertex > matches = annotationData->matches("AnalysisGraph",v,"annot");
    for (std::set< AnnotationGraphVertex >::const_iterator matchIt = matches.begin();
         matchIt != matches.end(); ++matchIt)
    {
      AnnotationGraphVertex vx=*matchIt;
      // NOTE(review): vx is an annotation-graph vertex but is used here to
      // index the linguistic graph; confirm whether v was intended.
      Token* t=get(vertex_token,*graph,vx);
      if (t == 0)
      {
        // nothing can be stored without a token; the annotation branch used
        // to dereference t even when it was null
        continue;
      }

      /* keep every token */
      ref.storeAllToken(*t);

      if (annotationData->hasAnnotation(vx, Common::Misc::utf8stdstring2limastring("SpecificEntity")))
      {
        ref.storeAnnot(*t);
      }
    }
  }

  /* search co-references between the named entities detected above */

  // Bind the container once: taking begin() and end() from two separate
  // getAnnotations() calls is only valid if it returns a reference.
  const std::vector<Token>& annotations = ref.getAnnotations();
  std::vector<Token>::const_iterator annotIt = annotations.begin();
  const std::vector<Token>::const_iterator annotEnd = annotations.end();
  for (; annotIt != annotEnd; ++annotIt)
  {
    std::vector<Token> coreferences = ref.searchCoreference(*annotIt);
    if (!coreferences.empty())
    {
      corefData->push_back(coreferences);
    }
    // NOTE(review): second call whose result is discarded; kept in case
    // searchCoreference has side effects - confirm and remove if it has none.
    ref.searchCoreference(*annotIt);
  }

  return SUCCESS_ID;
}
/**
 * Recognizer action that inserts a recognized idiomatic expression into the
 * analysis graph as an alternative path.
 *
 * A new token and vertex are created for the whole match and registered in
 * the annotation graph with an "IdiomExpr" annotation. For a contiguous match
 * the alternative is that single vertex. Otherwise the non-kept vertices of
 * the match are duplicated and chained, in match order, with the idiomatic
 * vertex inserted where the head vertex is met (or first when the head is
 * not found among the kept vertices). When the match is not contextual,
 * removeEdges() is called on the original match.
 *
 * @param result   the recognized match
 * @param analysis analysis content expected to hold "AnnotationData" and
 *                 "RecognizerData"
 * @return false when the match is empty, when required data is missing, when
 *         recoData->matchOnRemovedVertices() reports an overlap, or when the
 *         created token has empty morphosyntactic data; true otherwise
 */
bool CreateIdiomaticAlternative::operator()(Automaton::RecognizerMatch& result,
                                            AnalysisContent& analysis) const
{
#ifdef DEBUG_LP
    MORPHOLOGINIT;
    LDEBUG << "CreateIdiomaticAlternative, match is " << result;
    LDEBUG << "    expression is " << (result.isContiguous()?"":"non") <<
     " contiguous and" << (result.isContextual()?" non":"") << " absolute";
#endif
  if (result.empty()) return false;
  const LinguisticAnalysisStructure::AnalysisGraph& graph = *(result.getGraph());

  AnnotationData* annotationData = static_cast< AnnotationData* >(analysis.getData("AnnotationData"));
  if (annotationData == 0)
  {
    // was dereferenced unchecked
    MORPHOLOGINIT;
    LERROR << "CreateIdiomaticAlternative::operator() no annotation graph available !";
    return false;
  }
  if (annotationData->dumpFunction("IdiomExpr") == 0)
  {
    annotationData->dumpFunction("IdiomExpr", new DumpIdiomaticExpressionAnnotation());
  }

  RecognizerData* recoData=static_cast<RecognizerData*>(analysis.getData("RecognizerData"));
  if (recoData == 0)
  {
    // was dereferenced unchecked
    MORPHOLOGINIT;
    LERROR << "CreateIdiomaticAlternative::operator() no RecognizerData available !";
    return false;
  }

  // The match only exposes a const graph, but this action has to modify it;
  // the const is cast away once here instead of at every call site.
  LinguisticGraph* const lingGraphPtr = const_cast<LinguisticGraph*>(graph.getGraph());
  LinguisticGraph& lingGraph = *lingGraphPtr;

  // evaluated before any modification, as in the original control flow
  const bool contiguous = result.isContiguous();

  // ---- steps shared by the contiguous and non contiguous cases ----

  // check if there is an overlap first
  if (recoData->matchOnRemovedVertices(result))
  {
    // ignore current idiomatic expression, continue
    MORPHOLOGINIT;
    LWARN << "idiomatic expression ignored: " << Common::Misc::limastring2utf8stdstring(result.concatString())
        << ": overlapping with a previous one";
    return false;
  }

  // create the new token
  std::pair<Token*,MorphoSyntacticData*> newToken = createAlternativeToken(result);
  if (newToken.second->empty())
  {
    // ignore current idiomatic expression, continue
    MORPHOLOGINIT;
    LERROR << "CreateIdiomaticAlternative::operator() Got empty morphosyntactic data. Abort";
    delete newToken.first;
    delete newToken.second;
    return false;
  }

  // add the vertex and register it in the annotation graph
  LinguisticGraphVertex idiomaticVertex =
      addAlternativeVertex(newToken.first, newToken.second, lingGraphPtr);
  AnnotationGraphVertex agv =  annotationData->createAnnotationVertex();
  annotationData->addMatching("AnalysisGraph", idiomaticVertex, "annot", agv);
  annotationData->annotate(agv, Common::Misc::utf8stdstring2limastring("AnalysisGraph"), idiomaticVertex);
  IdiomaticExpressionAnnotation annot(result);
  GenericAnnotation ga(annot);
  annotationData->annotate(agv, Common::Misc::utf8stdstring2limastring("IdiomExpr"), ga);

  if (contiguous)
  {
    // only one part : terms in expression are adjacent -> easy part

    // create the alternative with this only vertex
    createBeginAlternative(result.front().getVertex(),
                            idiomaticVertex,lingGraph);
    attachEndOfAlternative(idiomaticVertex,
                            result.back().getVertex(),lingGraph);

    // if expression is not contextual, only keep alternative
    if (! result.isContextual())
    {
      recoData->storeVerticesToRemove(result,lingGraphPtr);
      removeEdges(lingGraph, result, analysis);
      // if match was on single token, use next vertices (to avoid loops)
      if (result.size() > 1)
      {
        recoData->setNextVertex(idiomaticVertex);
      }
      else
      {
        LinguisticGraphOutEdgeIt outItr,outItrEnd;
        boost::tie(outItr,outItrEnd) = out_edges(idiomaticVertex,lingGraph);
        for (;outItr!=outItrEnd;++outItr)
        {
          recoData->setNextVertex(target(*outItr, lingGraph));
        }
      }
    }
  }
  else
  {
    // several parts : tough case

    // create the alternative with this vertex and duplicate of other vertices
    std::deque<LinguisticGraphVertex> idiomAlternative;
    LinguisticGraphVertex headVertex=result.getHead();
#ifdef DEBUG_LP
   LDEBUG << "headVertex = " << headVertex;
    if (headVertex!=0)
    {
      LDEBUG << "=> " << Common::Misc::limastring2utf8stdstring(get(vertex_token,lingGraph,headVertex)->stringForm());
    }
#endif
    bool foundHead=false;
    // true while iterating over a run of consecutive kept vertices
    bool keeping = false;
    // (first, last) vertex of the current run of kept vertices
    std::pair< LinguisticGraphVertex, LinguisticGraphVertex > idiomPartBounds;
    std::set< std::pair< LinguisticGraphVertex, LinguisticGraphVertex > > edgesToRemove;
    RecognizerMatch::const_iterator matchItr=result.begin();
    for (; matchItr!=result.end(); matchItr++)
    {
      if (!matchItr->isKept())
      {
        if (keeping)
        {
          // a run of kept vertices ended on the previous element
          RecognizerMatch::const_iterator prevItr = matchItr - 1;
          idiomPartBounds.second = prevItr->getVertex();
          keeping = false;
#ifdef DEBUG_LP
          LDEBUG << "adding " << idiomPartBounds.first << " -> " << idiomPartBounds.second << " in edgesToRemove";
#endif
          edgesToRemove.insert(idiomPartBounds);
        }
        // duplicate this vertex: the token pointer is shared with the
        // original vertex, the morphosyntactic data is copied
#ifdef DEBUG_LP
        LDEBUG << "duplication of vertex " << matchItr->getVertex();;
#endif
        Token* token=get(vertex_token,lingGraph,matchItr->getVertex());
        MorphoSyntacticData* data =
          new MorphoSyntacticData(*get(vertex_data,lingGraph,matchItr->getVertex()));
        LinguisticGraphVertex dupVx = add_vertex(lingGraph);
        put(vertex_token,lingGraph,dupVx,token);
        put(vertex_data,lingGraph,dupVx,data);
        idiomAlternative.push_back(dupVx);

        // give the duplicate its own annotation vertex and copy onto it the
        // annotations of the original, except the "AnalysisGraph" one
        AnnotationGraphVertex dupAgv =  annotationData->createAnnotationVertex();
        annotationData->addMatching("AnalysisGraph", dupVx, "annot", dupAgv);
        annotationData->annotate(dupAgv, Common::Misc::utf8stdstring2limastring("AnalysisGraph"), dupVx);
        std::set< AnnotationGraphVertex > annotMatches =
          annotationData->matches("AnalysisGraph",matchItr->getVertex(),"annot");
        for (std::set< AnnotationGraphVertex >::const_iterator annotIt(annotMatches.begin());
              annotIt != annotMatches.end(); ++annotIt)
        {
          std::set< std::string > excepted;
          excepted.insert("AnalysisGraph");
          annotationData->cloneAnnotations(*annotIt, dupAgv, excepted);
        }
      }
      else
      {
        if (!keeping)
        {
          // start of a new run of kept vertices
          idiomPartBounds.first = matchItr->getVertex();
          keeping = true;
        }
#ifdef DEBUG_LP
         LDEBUG << "kept vertex " << matchItr->getVertex();
#endif
        if (matchItr->getVertex()==headVertex)
        {
          foundHead=true;
#ifdef DEBUG_LP
           LDEBUG << "add head vertex " << idiomaticVertex;
#endif
          idiomAlternative.push_back(idiomaticVertex);
        }
      }
    }
    if (!foundHead)
    {
      MORPHOLOGINIT;
      LWARN << "head token has not been found in non contiguous expression. "
          << "Idiomatic token is placed first";
      idiomAlternative.push_front(idiomaticVertex);
    }
    if (keeping)
    {
      // the match ended inside a run of kept vertices: close it
      RecognizerMatch::const_iterator prevItr = matchItr - 1;
      idiomPartBounds.second = prevItr->getVertex();
      keeping = false;
#ifdef DEBUG_LP
      LDEBUG << "adding " << idiomPartBounds.first << " -> " << idiomPartBounds.second << " in edgesToRemove";
#endif
      edgesToRemove.insert(idiomPartBounds);
    }

    // link alternatives; idiomAlternative always holds at least the
    // idiomatic vertex at this point
#ifdef DEBUG_LP
     LDEBUG << "idiomAlternative has " << idiomAlternative.size() << " vertex";
#endif
    createBeginAlternative(result.front().getVertex(),
                            idiomAlternative.front(),lingGraph);
    {
      std::deque<LinguisticGraphVertex>::const_iterator idItr=idiomAlternative.begin();
      LinguisticGraphVertex lastIdiomVx=*idItr;
      ++idItr;
      while (idItr!=idiomAlternative.end())
      {
        LinguisticGraphEdge newEdge;
        bool ok;
        boost::tie(newEdge, ok) = add_edge(lastIdiomVx,*idItr,lingGraph);
#ifdef DEBUG_LP
         LDEBUG << "added new edge in alternatives linking: " << newEdge.m_source << " -> " << newEdge.m_target;
#endif
        lastIdiomVx=*idItr;
        ++idItr;
      }
    }
    attachEndOfAlternative(idiomAlternative.back(),
                            result.back().getVertex(),lingGraph);

    // if expression is not contextual, only keep alternative
    if (! result.isContextual())
    {
#ifdef DEBUG_LP
      LDEBUG << "expression is not contextual, only keep alternative";
#endif
      // NOTE(review): removeEdges() receives the same arguments on every
      // iteration; the bounds stored in edgesToRemove are only used for the
      // debug trace. Call count kept as is - confirm whether one call, or a
      // per-bounds call, was intended.
      std::set< std::pair< LinguisticGraphVertex, LinguisticGraphVertex > >::const_iterator edgesToRemoveIt, edgesToRemoveIt_end;
      edgesToRemoveIt = edgesToRemove.begin(); edgesToRemoveIt_end = edgesToRemove.end();
      for (; edgesToRemoveIt != edgesToRemoveIt_end; ++edgesToRemoveIt)
      {
#ifdef DEBUG_LP
         LDEBUG << "Removing edge " << (*edgesToRemoveIt).first << " -> " << (*edgesToRemoveIt).second;
#endif
        removeEdges(lingGraph, result, analysis);
      }

      // no need to check size: if several parts, more than one vertex
      recoData->setNextVertex(idiomaticVertex);
    }
  }

  // finally let the recognizer data clean up around every vertex of the match
  RecognizerMatch::const_iterator matchItr=result.begin();
  for (; matchItr!=result.end(); matchItr++)
  {
    recoData->clearUnreachableVertices( analysis, (*matchItr).getVertex());
  }
  return true;
}