/**
 * Dumps the coreference data of an analysis to an XML log file.
 *
 * Reads "LinguisticMetaData" and "CoreferenceData" from @p analysis, opens
 * the log file associated with the "FileName" metadata entry and writes one
 * <entity> element per entry of the coreference data, each containing one
 * <entity_mention> per token of that entry.
 *
 * @param analysis the analysis content to read the data from
 * @return SUCCESS_ID on success, MISSING_DATA if one of the two required
 *         data objects is absent, UNKNOWN_ERROR if the log file cannot be
 *         opened.
 */
LimaStatusCode EntityTrackerXmlLogger::process(
  AnalysisContent& analysis) const
{
  SELOGINIT;
  LDEBUG << "EntityTrackerXmlLogger::process";
  TimeUtils::updateCurrentTime();
  /* allows retrieving the annotations */
  //AnnotationData* annotationData = static_cast< AnnotationData* >(analysis.getData("AnnotationData"));

  /* retrieve the graph after the analysis */
  //const LinguisticAnalysisStructure::AnalysisGraph& graph = *(static_cast<LinguisticAnalysisStructure::AnalysisGraph*>(analysis.getData(m_graph)));

  LinguisticMetaData* metadata=static_cast<LinguisticMetaData*>(analysis.getData("LinguisticMetaData"));
  if (metadata == 0) {
      LERROR << "no LinguisticMetaData ! abort";
      return MISSING_DATA;
  }

  CoreferenceData* corefData=static_cast<CoreferenceData*>(analysis.getData("CoreferenceData"));
  if (corefData == 0) {
      LERROR << "no CoreferenceData ! abort";
      return MISSING_DATA;
  }

  ofstream out;
  if (!openLogFile(out,metadata->getMetaData("FileName"))) {
    LERROR << "Can't open log file '" << metadata->getMetaData("FileName") << "'";
    return UNKNOWN_ERROR;
  }

  out << "<coreference>" << endl;
  for (CoreferenceData::const_iterator it=corefData->begin(),
         it_end=corefData->end(); it != it_end; ++it)
  {
    out << "<entity mentions=\"" << it->size() << "\">" << endl;
    for (vector<Token>::const_iterator it2=it->begin(), it2_end=it->end();
         it2 != it2_end; ++it2)
    {
      // NOTE(review): the mention text is written verbatim; if a token can
      // contain '<', '&' or similar it should be XML-escaped here.
      out << "  <entity_mention>"
          << limastring2utf8stdstring(it2->stringForm())
          <<"</entity_mention>";
    }
    // Close the element opened above (the previous code emitted a second
    // opening <entity> tag here, producing malformed XML).
    out << "</entity>" <<endl;
  }
  // Close the root element so that the log file is a well-formed document.
  out << "</coreference>" << endl;
  out.close();

  return SUCCESS_ID;
}
// Example #2
// 0
/**
 * Handles one chunk of text found in the structured document being read.
 *
 * Text made only of characters from m_emptyTextChars is ignored. Otherwise
 * the document metadata ("StartOffset", "ElementName", "Lang",
 * "StartOffsetIndexingNode") is updated and the text is forwarded, encoded
 * as UTF-8, to m_handler->handleProc().
 *
 * @param contentDocument the current document structure; its last element
 *        must be an IndexingDocumentElement
 * @param text the text to analyze
 * @param offset offset of the text, stored as the "StartOffset" metadata
 * @param tagName name of the tag, stored as the "ElementName" metadata
 */
void CoreXmlReaderClient::handle(const DocumentsReader::ContentStructuredDocument &contentDocument,
                                 const Lima::LimaString &text, unsigned long int offset, const string  tagName)
{
#ifdef DEBUG_LP
  XMLREADERCLIENTLOGINIT;
#endif

    // Nothing to analyze if the text contains only "empty" characters.
    if(std::string(text.toUtf8().constData()).find_first_not_of(m_emptyTextChars.toUtf8().constData()) == string::npos) {
#ifdef DEBUG_LP
        LDEBUG << "CoreXmlReaderClient::empty text, not analyzed";
#endif
        return;
    }

    // NOTE(review): back() is called without checking that contentDocument
    // is non-empty -- confirm callers guarantee at least one element.
    AbstractStructuredDocumentElement* absElement = contentDocument.back();
    DocumentsReader::IndexingDocumentElement* element  =
        dynamic_cast<DocumentsReader::IndexingDocumentElement*>(absElement);
    if (element == 0)
    {
      // dynamic_cast yields a null pointer when the current element is not
      // an IndexingDocumentElement; dereferencing it below would crash.
      XMLREADERCLIENTLOGINIT;
      LERROR << "CoreXmlReaderClient::handle current element is not an IndexingDocumentElement. Text will not be analyzed.";
      return;
    }
    std::string elementName = element->getElementName().toUtf8().constData();

    // In a DEBUG_LP build the full text is logged at debug level and the
    // truncated one only at info level; in other builds the info message
    // is always emitted.
#ifdef DEBUG_LP
    if( logger.loggingLevel() == QsLogging::DebugLevel )
    {
      LDEBUG << "CoreXmlReaderClient::handle"
            << "[" << text << "], offset =" << offset
            << ", tagName =" << tagName << ", element name =" << elementName ;
    }
    else if( logger.loggingLevel() == QsLogging::InfoLevel )
#endif
    {
      XMLREADERCLIENTLOGINIT;
      LINFO << "CoreXmlReaderClient::handle"
            << "[" << text.left(50) << "], offset =" << offset
            << ", tagName =" << tagName ;
    }
    ostringstream os;
    os << offset;
    m_docMetaData["StartOffset"] = os.str();
    m_docMetaData["ElementName"] = tagName;
    // Set the language to the one associated at init time to the current tag
    if (m_mapTagMedia.find(elementName) != m_mapTagMedia.end())
    {
#ifdef DEBUG_LP
      LDEBUG << "CoreXmlReaderClient::handle using media" << m_mapTagMedia[elementName];
#endif
      m_docMetaData["Lang"] =  m_mapTagMedia[elementName];
    }
    else if (!m_defaultMedia.empty())
    {
#ifdef DEBUG_LP
      LDEBUG << "CoreXmlReaderClient::handle using default media" << m_defaultMedia;
#endif
      m_docMetaData["Lang"] =  m_defaultMedia;
    }
    else
    {
      XMLREADERCLIENTLOGINIT;
      LERROR << "CoreXmlReaderClient::handle no media associated to tag"
          << elementName
          << "and no default media is set. metadata Lang will not be set.";
    }
    // cast element to GenericDocumentProperties
//     Common::Misc::GenericDocumentProperties &props = *element;
    // Store the value of the element's "offBegPrpty" integer property.
    unsigned long offsetIndexingNode = element->getIntValue("offBegPrpty").first;
    ostringstream os2;
    os2 << offsetIndexingNode;
    m_docMetaData["StartOffsetIndexingNode"] = os2.str();

    string strText = limastring2utf8stdstring(text);

//     size_t posEmptyTextChars = strText.find_first_not_of(m_emptyTextChars.toUtf8().constData());
//     if (posEmptyTextChars!=string::npos)
//       strText=strText.substr(posEmptyTextChars,strText.length()-posEmptyTextChars);

    m_handler->handleProc(
        tagName,
        strText,
        m_docMetaData,
        m_docMetaData["pipeline"],
        m_mapHandlers,
        std::set<std::string>());
}
// Example #3
// 0
/**
 * Writes the XML description of one vertex of the linguistic graph.
 *
 * The first and last vertices of the syntactic data iterator are written as
 * an empty <vertex/> element. Any other vertex is written with its form and
 * position, followed by its chains, its dependents in the dependency graph,
 * its morphosyntactic data and a <ref> holding the token identifier.
 *
 * @param v the vertex to dump
 * @param graph the graph holding @p v
 * @param offsetBegin offset passed to getPosition() together with the token
 *        position
 * @param syntacticData the syntactic data giving access to the dependency
 *        graph
 * @param xmlStream the stream to write to
 * @param tokens maps each token to its identifier
 * @param alreadyDumpedTokens flags, indexed by token identifier, set to
 *        true for the token dumped here
 */
void SyntacticAnalysisXmlLogger::outputVertex(const LinguisticGraphVertex v,
        const LinguisticGraph& graph,
        const uint64_t offsetBegin,
        const SyntacticData* syntacticData,
        std::ostream& xmlStream,
        std::map< LinguisticAnalysisStructure::Token*, uint64_t >& tokens,
        std::vector< bool >& alreadyDumpedTokens) const
{
    if (v == syntacticData->iterator()->firstVertex() ||
            v == syntacticData->iterator()->lastVertex())
    {
        xmlStream << "<vertex id=\"" << v << "\" />" << std::endl;
        return;
    }
    Token* token = get(vertex_token, graph, v);

    // Dereferencing the result of find() without comparing it to end() is
    // undefined behaviour; a vertex without an indexed token is written as
    // an empty element instead.
    std::map< LinguisticAnalysisStructure::Token*, uint64_t >::const_iterator tokenIt = tokens.find(token);
    if (token == 0 || tokenIt == tokens.end())
    {
        xmlStream << "<vertex id=\"" << v << "\" />" << std::endl;
        return;
    }
    const uint64_t tokenId = tokenIt->second;

    xmlStream << "<vertex id=\"" << v << "\" form=\"" << limastring2utf8stdstring(token->stringForm()) << "\" pos=\"" << getPosition(token->position(),offsetBegin) << "\" ";
    const VertexChainIdProp& chains = get(vertex_chain_id, graph,v);
    xmlStream << " >" << std::endl;
    if (chains.size() > 0)
    {
        xmlStream << "<chains>" << std::endl;
        for (VertexChainIdProp::const_iterator itChains = chains.begin(), itChains_end = chains.end();
                itChains != itChains_end; ++itChains)
        {
            const ChainIdStruct& ids = (*itChains);
            // Chain type is encoded as "0" (none), "N" (nominal) or "V"
            // (any other type).
            xmlStream << "<chain type=\"";
            if (ids.chainType() == Common::MediaticData::NO_CHAIN_TYPE)
                xmlStream << "0";
            else if (ids.chainType() == Common::MediaticData::NOMINAL)
                xmlStream << "N";
            else
                xmlStream << "V";
            xmlStream << "\" id=\"" << (ids.chainId()) << "\" />" << std::endl;
        }
        xmlStream << "</chains>" << std::endl;
    }

    const DependencyGraph* depGraph = syntacticData->dependencyGraph();
    DependencyGraphVertex depV = syntacticData->depVertexForTokenVertex(v);
    if (out_degree(depV, *depGraph) > 0)
    {
        // Both lookups are the same for every edge: do them once.
        CEdgeDepRelTypePropertyMap relTypeMap = get(edge_deprel_type, *depGraph);
        const Common::MediaticData::LanguageData& languageData =
            static_cast<const Common::MediaticData::LanguageData&>(Common::MediaticData::MediaticData::single().mediaData(m_language));

        xmlStream << "<dependents>" << std::endl;
        DependencyGraphOutEdgeIt depIt, depIt_end;
        boost::tie(depIt, depIt_end) = out_edges(depV, *depGraph);
        for (; depIt != depIt_end; ++depIt)
        {
            DependencyGraphVertex depTargV = target(*depIt, *depGraph);
            LinguisticGraphVertex targV = syntacticData-> tokenVertexForDepVertex(depTargV);
            xmlStream << "<dep v=\"" << targV;
            std::string relName=languageData.getSyntacticRelationName(relTypeMap[*depIt]);
            if (relName.empty())
            {
                relName="UNKNOWN";
            }
            xmlStream << "\" t=\"" << relName << "\" />" << std::endl;
        }
        xmlStream << "</dependents>" << std::endl;
    }

    const FsaStringsPool& sp=Common::MediaticData::MediaticData::single().stringsPool(m_language);

    MorphoSyntacticData* word = get(vertex_data, graph, v);
    word->outputXml(xmlStream,*m_propertyCodeManager,sp);
    xmlStream << "<ref>" << tokenId << "</ref>" << std::endl;
    // Guard against an identifier lying outside the flags vector.
    if (tokenId < alreadyDumpedTokens.size())
    {
        alreadyDumpedTokens[tokenId] = true;
    }
    xmlStream << "</vertex>" << std::endl;

}
      /**
       * Dumps the "Coreferent" annotations of an analysis to an XML log file.
       *
       * For every vertex of the annotation graph carrying a "Coreferent"
       * annotation, a <reference> element is written with the position,
       * length and form of the matching token, the annotation id, its
       * morphological vertex and its category.
       *
       * @param analysis the analysis content to read the data from
       * @return SUCCESS_ID on success, MISSING_DATA if the annotation data,
       *         the analysis graph or the metadata is absent, UNKNOWN_ERROR
       *         if the log file cannot be opened.
       */
      LimaStatusCode CorefSolvingNormalizedXmlLogger::process(
        AnalysisContent& analysis) const
      {
//         COREFSOLVERLOGINIT;
        TimeUtils::updateCurrentTime();

        AnnotationData* annotationData = static_cast<AnnotationData*>(analysis.getData("AnnotationData"));
        if (annotationData == 0)
        {
          COREFSOLVERLOGINIT;
          LERROR << "no AnnotationData ! abort" << LENDL;
          return MISSING_DATA;
        }

        // Check the pointer before binding a reference to it: dereferencing
        // a missing graph is undefined behaviour.
        LinguisticAnalysisStructure::AnalysisGraph* graphData = static_cast<LinguisticAnalysisStructure::AnalysisGraph*>(analysis.getData(m_graph));
        if (graphData == 0)
        {
          COREFSOLVERLOGINIT;
          LERROR << "no AnalysisGraph ! abort" << LENDL;
          return MISSING_DATA;
        }
        const LinguisticAnalysisStructure::AnalysisGraph& graph = *graphData;

//         LinguisticGraph* lingGraph = const_cast<LinguisticGraph*>(graph.getGraph());
        LinguisticMetaData* metadata=static_cast<LinguisticMetaData*>(analysis.getData("LinguisticMetaData"));
        if (metadata == 0)
        {
          COREFSOLVERLOGINIT;
          LERROR << "no LinguisticMetaData ! abort" << LENDL;
          return MISSING_DATA;
        }

        ofstream out;
        if (!openLogFile(out,metadata->getMetaData("FileName")))
        {
          COREFSOLVERLOGINIT;
          LERROR << "Can't open log file " << LENDL;
          return UNKNOWN_ERROR;
        }

        out << "<coreferences>" << endl;

        //   LDEBUG << "CorefSolvingNormalizedXmlLogger on graph " << m_graph << LENDL;
        AnnotationGraphVertexIt itv, itv_end;
        boost::tie(itv, itv_end) = vertices(annotationData->getGraph());
        for (; itv != itv_end; ++itv)
        {
          //LDEBUG << "CorefSolvingNormalizedXmlLogger on annotation vertex " << *itv << LENDL;
          if (!annotationData->hasAnnotation(*itv,utf8stdstring2limastring("Coreferent")))
          {
            continue;
          }

          CoreferentAnnotation* annot = 0;
          try
          {
            annot = annotationData->annotation(*itv,utf8stdstring2limastring("Coreferent"))
                    .pointerValue<CoreferentAnnotation>();
          }
          catch (const boost::bad_any_cast& )
          {
            COREFSOLVERLOGINIT;
            LERROR << "One annotation on vertex " << *itv << " you are trying to cast is not a Coreference; Coreference not logged" << LENDL;
            // NOTE(review): the bound 19 is hard-coded; confirm it matches
            // the number of registered annotation names.
            for (int i = 0; i < 19 ; i++)
            {
              LERROR << "annot "<< i << " : " << limastring2utf8stdstring(annotationData->annotationName(i)) << LENDL ;
            }
            continue;
          }

          LinguisticProcessing::LinguisticAnalysisStructure::Token* token = get(vertex_token, *graph.getGraph(), annot->morphVertex());
          if (token == 0)
          {
            COREFSOLVERLOGINIT;
            LERROR << "Vertex " << *itv << " has no entry in the analysis graph token map. This should not happen !!" << LENDL;
            continue;
          }

          // Look for an antecedent among the targets of the out edges. The
          // result is currently only used by the disabled output below.
          CoreferentAnnotation* antecedent = 0;
//           bool hasAntecedent = false;
          AnnotationGraphOutEdgeIt it, it_end;
          boost::tie(it, it_end) = boost::out_edges(static_cast<AnnotationGraphVertex>(*itv), annotationData->getGraph());

          for (; it != it_end; ++it)
          {
            if (annotationData->hasAnnotation(target(*it,annotationData->getGraph()),utf8stdstring2limastring("Coreferent")))
            {
              try
              {
                antecedent = annotationData->annotation(target(*it, annotationData->getGraph()), utf8stdstring2limastring("Coreferent")).pointerValue<CoreferentAnnotation>();
//                 hasAntecedent = true;
              }
              catch (const boost::bad_any_cast& )
              {
                COREFSOLVERLOGINIT;
                LERROR << "One annotation on vertex you are trying to cast resulting from an edge out of " << *itv << " is not a Coreference; Coreference not logged" << LENDL;
                continue;
              }
            }
          }
          out << "  <reference>\n"
          << "    <pos>" << token->position() << "</pos>\n"
          << "    <len>" << token->stringForm().length() << "</len>\n"
          << "    <string>"<< limastring2utf8stdstring(transcodeToXmlEntities(token->stringForm())) << "</string>\n"
          << "    <npId>" << annot->id() << "</npId>\n"
          << "    <posVertex>" << annot->morphVertex() << "</posVertex>\n";
          // The antecedent output is deliberately disabled (the condition
          // used to be hasAntecedent).
          //if (hasAntecedent)
          if (false)
          {
            out << "    <npRef>" << antecedent->id() << "</npRef>\n";
            out << "    <refPosVertex>" << antecedent->morphVertex() << "</refPosVertex>\n";
          }
          out << "    <categ>" << annot->categ() << "</categ>\n"
                << "  </reference>\n"
          << endl;
        }
        out << "</coreferences>" << endl;
        out.close();

        TimeUtils::logElapsedTime("CorefSolvingNormalizedXmlLogger");
        return SUCCESS_ID;

      }