/**
 * Writes the content of the "CoreferenceData" analysis data to an XML log
 * file.
 *
 * The output has a <coreference> root element holding one <entity> element
 * per entry of the CoreferenceData; each <entity> lists the string form of
 * every token of that entry inside an <entity_mention> element.
 *
 * @param analysis analysis content providing "LinguisticMetaData" (whose
 *                 "FileName" value is handed to openLogFile) and
 *                 "CoreferenceData".
 * @return MISSING_DATA if either data is absent, UNKNOWN_ERROR if the log
 *         file cannot be opened, SUCCESS_ID otherwise.
 */
LimaStatusCode EntityTrackerXmlLogger::process(
  AnalysisContent& analysis) const
{
  SELOGINIT;
  LDEBUG << "EntityTrackerXmlLogger::process";
  TimeUtils::updateCurrentTime();

  LinguisticMetaData* metadata = static_cast<LinguisticMetaData*>(analysis.getData("LinguisticMetaData"));
  if (metadata == 0)
  {
    LERROR << "no LinguisticMetaData ! abort";
    return MISSING_DATA;
  }

  CoreferenceData* corefData = static_cast<CoreferenceData*>(analysis.getData("CoreferenceData"));
  if (corefData == 0)
  {
    LERROR << "no CoreferenceData ! abort";
    return MISSING_DATA;
  }

  std::ofstream out;
  if (!openLogFile(out, metadata->getMetaData("FileName")))
  {
    LERROR << "Can't open log file '" << metadata->getMetaData("FileName") << "'";
    return UNKNOWN_ERROR;
  }

  out << "<coreference>" << std::endl;
  for (CoreferenceData::const_iterator it = corefData->begin(), it_end = corefData->end();
       it != it_end; ++it)
  {
    out << "<entity mentions=\"" << (*it).size() << "\">" << std::endl;
    for (std::vector<Token>::const_iterator it2 = (*it).begin(), it2_end = (*it).end();
         it2 != it2_end; ++it2)
    {
      // NOTE(review): the mention text is written without XML escaping, so a
      // token containing '<' or '&' still yields invalid XML - confirm
      // whether an escaping helper is available in this translation unit.
      out << " <entity_mention>" << limastring2utf8stdstring((*it2).stringForm()) << "</entity_mention>";
    }
    // Close the entity: a second opening <entity> tag used to be written
    // here, which made the whole document malformed.
    out << "</entity>" << std::endl;
  }
  // Close the root element, which was previously left open.
  out << "</coreference>" << std::endl;
  out.close();
  return SUCCESS_ID;
}
void CoreXmlReaderClient::handle(const DocumentsReader::ContentStructuredDocument &contentDocument, const Lima::LimaString &text, unsigned long int offset, const string tagName) { #ifdef DEBUG_LP XMLREADERCLIENTLOGINIT; #endif if(std::string(text.toUtf8().constData()).find_first_not_of(m_emptyTextChars.toUtf8().constData()) == string::npos) { #ifdef DEBUG_LP LDEBUG << "CoreXmlReaderClient::empty text, not analyzed"; #endif return; } AbstractStructuredDocumentElement* absElement = contentDocument.back(); DocumentsReader::IndexingDocumentElement* element = dynamic_cast<DocumentsReader::IndexingDocumentElement*>(absElement); std::string elementName = element->getElementName().toUtf8().constData(); #ifdef DEBUG_LP if( logger.loggingLevel() == QsLogging::DebugLevel ) { LDEBUG << "CoreXmlReaderClient::handle" << "[" << text << "], offset =" << offset << ", tagName =" << tagName << ", element name =" << elementName ; } else if( logger.loggingLevel() == QsLogging::InfoLevel ) #endif { // Chercher les analyses diponibles XMLREADERCLIENTLOGINIT; LINFO << "CoreXmlReaderClient::handle" << "[" << text.left(50) << "], offset =" << offset << ", tagName =" << tagName ; } ostringstream os; os << offset; m_docMetaData["StartOffset"] = os.str(); m_docMetaData["ElementName"] = tagName; // Set the language to the one associated at init time to the current tag if (m_mapTagMedia.find(elementName) != m_mapTagMedia.end()) { #ifdef DEBUG_LP LDEBUG << "CoreXmlReaderClient::handle using media" << m_mapTagMedia[elementName]; #endif m_docMetaData["Lang"] = m_mapTagMedia[elementName]; } else if (!m_defaultMedia.empty()) { #ifdef DEBUG_LP LDEBUG << "CoreXmlReaderClient::handle using default media" << m_defaultMedia; #endif m_docMetaData["Lang"] = m_defaultMedia; } else { XMLREADERCLIENTLOGINIT; LERROR << "CoreXmlReaderClient::handle no media associated to tag" << elementName << "and no default media is set. 
metadata Lang will not be set."; } // cast element to GenericDocumentProperties // Common::Misc::GenericDocumentProperties &props = *element; // get byte offset after end of element unsigned long offsetIndexingNode = element->getIntValue("offBegPrpty").first; ostringstream os2; os2 << offsetIndexingNode; m_docMetaData["StartOffsetIndexingNode"] = os2.str(); string strText = limastring2utf8stdstring(text); // size_t posEmptyTextChars = strText.find_first_not_of(m_emptyTextChars.toUtf8().constData()); // if (posEmptyTextChars!=string::npos) // strText=strText.substr(posEmptyTextChars,strText.length()-posEmptyTextChars); m_handler->handleProc( tagName, strText, m_docMetaData, m_docMetaData["pipeline"], m_mapHandlers, std::set<std::string>()); }
void SyntacticAnalysisXmlLogger::outputVertex(const LinguisticGraphVertex v, const LinguisticGraph& graph, const uint64_t offsetBegin, const SyntacticData* syntacticData, std::ostream& xmlStream, std::map< LinguisticAnalysisStructure::Token*, uint64_t >& tokens, std::vector< bool >& alreadyDumpedTokens) const { if (v == syntacticData->iterator()->firstVertex() || v == syntacticData->iterator()->lastVertex()) { xmlStream << "<vertex id=\"" << v << "\" />" << std::endl; return; } Token* token = get(vertex_token, graph, v); uint64_t tokenId = (*(tokens.find(token))).second; // bool alreadyDumped = alreadyDumpedTokens[tokenId]; xmlStream << "<vertex id=\"" << v << "\" form=\"" << limastring2utf8stdstring(token->stringForm()) << "\" pos=\"" << getPosition(token->position(),offsetBegin) << "\" "; const VertexChainIdProp& chains = get(vertex_chain_id, graph,v); xmlStream << " >" << std::endl; if (chains.size() > 0) { xmlStream << "<chains>" << std::endl; VertexChainIdProp::const_iterator itChains, itChains_end; itChains = chains.begin(); itChains_end = chains.end(); for (; itChains != itChains_end; itChains++) { const ChainIdStruct& ids = (*itChains); xmlStream << "<chain type=\""; if (ids.chainType() == Common::MediaticData::NO_CHAIN_TYPE) xmlStream << "0"; else if (ids.chainType() == Common::MediaticData::NOMINAL) xmlStream << "N"; else xmlStream << "V"; xmlStream << "\" id=\"" << (ids.chainId()) << "\" />" << std::endl; } xmlStream << "</chains>" << std::endl; } const DependencyGraph* depGraph = syntacticData->dependencyGraph(); DependencyGraphVertex depV = syntacticData->depVertexForTokenVertex(v); if (out_degree(depV, *depGraph) > 0) { xmlStream << "<dependents>" << std::endl; DependencyGraphOutEdgeIt depIt, depIt_end; boost::tie(depIt, depIt_end) = out_edges(depV, *depGraph); for (; depIt != depIt_end; depIt++) { DependencyGraphVertex depTargV = target(*depIt, *depGraph); LinguisticGraphVertex targV = syntacticData-> tokenVertexForDepVertex(depTargV); // 
CEdgeDepChainIdPropertyMap chainsMap = get(edge_depchain_id, *depGraph); CEdgeDepRelTypePropertyMap relTypeMap = get(edge_deprel_type, *depGraph); xmlStream << "<dep v=\"" << targV; // xmlStream << "\" c=\"" << chainsMap[*depIt]; std::string relName=static_cast<const Common::MediaticData::LanguageData&>(Common::MediaticData::MediaticData::single().mediaData(m_language)).getSyntacticRelationName(relTypeMap[*depIt]); if (relName.empty()) { relName="UNKNOWN"; } xmlStream << "\" t=\"" << relName << "\" />" << std::endl; } xmlStream << "</dependents>" << std::endl; } const FsaStringsPool& sp=Common::MediaticData::MediaticData::single().stringsPool(m_language); MorphoSyntacticData* word = get(vertex_data, graph, v); word->outputXml(xmlStream,*m_propertyCodeManager,sp); xmlStream << "<ref>" << tokenId << "</ref>" << std::endl; alreadyDumpedTokens[tokenId] = true; xmlStream << "</vertex>" << std::endl; }
/**
 * Writes every "Coreferent" annotation of the annotation graph to an XML log
 * file.
 *
 * The output has a <coreferences> root element with one <reference> element
 * per annotation vertex carrying a "Coreferent" annotation; each reference
 * gives the position, length and string form of the annotated token, the
 * annotation id, its morphological vertex and its category.
 *
 * @param analysis analysis content providing "AnnotationData", the analysis
 *                 graph named by m_graph and "LinguisticMetaData" (whose
 *                 "FileName" value is handed to openLogFile).
 * @return MISSING_DATA if one of these data is absent, UNKNOWN_ERROR if the
 *         log file cannot be opened, SUCCESS_ID otherwise.
 */
LimaStatusCode CorefSolvingNormalizedXmlLogger::process(
  AnalysisContent& analysis) const
{
  TimeUtils::updateCurrentTime();

  // Both pointers below used to be dereferenced without any check.
  AnnotationData* annotationData = static_cast<AnnotationData*>(analysis.getData("AnnotationData"));
  if (annotationData == 0)
  {
    COREFSOLVERLOGINIT;
    LERROR << "no AnnotationData ! abort" << LENDL;
    return MISSING_DATA;
  }

  LinguisticAnalysisStructure::AnalysisGraph* graphData =
    static_cast<LinguisticAnalysisStructure::AnalysisGraph*>(analysis.getData(m_graph));
  if (graphData == 0)
  {
    COREFSOLVERLOGINIT;
    LERROR << "no AnalysisGraph ! abort" << LENDL;
    return MISSING_DATA;
  }
  const LinguisticAnalysisStructure::AnalysisGraph& graph = *graphData;

  LinguisticMetaData* metadata = static_cast<LinguisticMetaData*>(analysis.getData("LinguisticMetaData"));
  if (metadata == 0)
  {
    COREFSOLVERLOGINIT;
    LERROR << "no LinguisticMetaData ! abort" << LENDL;
    return MISSING_DATA;
  }

  ofstream out;
  if (!openLogFile(out, metadata->getMetaData("FileName")))
  {
    COREFSOLVERLOGINIT;
    LERROR << "Can't open log file " << LENDL;
    return UNKNOWN_ERROR;
  }

  out << "<coreferences>" << endl;

  AnnotationGraphVertexIt itv, itv_end;
  boost::tie(itv, itv_end) = vertices(annotationData->getGraph());
  for (; itv != itv_end; itv++)
  {
    if (!annotationData->hasAnnotation(*itv, utf8stdstring2limastring("Coreferent")))
    {
      continue;
    }

    CoreferentAnnotation* annot = 0;
    try
    {
      annot = annotationData->annotation(*itv, utf8stdstring2limastring("Coreferent"))
              .pointerValue<CoreferentAnnotation>();
    }
    catch (const boost::bad_any_cast& )
    {
      COREFSOLVERLOGINIT;
      LERROR << "One annotation on vertex " << *itv << " you are trying to cast is not a Coreference; Coreference not logged" << LENDL;
      // NOTE(review): the bound 19 is hard-coded - presumably the number of
      // annotation names registered when this was written; confirm that
      // annotationName(i) is valid for every i in [0, 19).
      for (int i = 0; i < 19 ; i++)
      {
        LERROR << "annot "<< i << " : " << limastring2utf8stdstring(annotationData->annotationName(i)) << LENDL ;
      }
      continue;
    }

    LinguisticProcessing::LinguisticAnalysisStructure::Token* token =
      get(vertex_token, *graph.getGraph(), annot->morphVertex());
    if (token == 0)
    {
      COREFSOLVERLOGINIT;
      LERROR << "Vertex " << *itv << " has no entry in the analysis graph token map. This should not happen !!" << LENDL;
      continue;
    }

    // Look for an antecedent among the targets of the outgoing edges. The
    // result is currently not written (see the disabled block below); the
    // loop still reports annotations that cannot be cast.
    CoreferentAnnotation* antecedent = 0;
    AnnotationGraphOutEdgeIt it, it_end;
    boost::tie(it, it_end) = boost::out_edges(static_cast<AnnotationGraphVertex>(*itv), annotationData->getGraph());
    for (; it != it_end; it++)
    {
      if (annotationData->hasAnnotation(target(*it, annotationData->getGraph()), utf8stdstring2limastring("Coreferent")))
      {
        try
        {
          antecedent = annotationData->annotation(target(*it, annotationData->getGraph()), utf8stdstring2limastring("Coreferent")).pointerValue<CoreferentAnnotation>();
        }
        catch (const boost::bad_any_cast& )
        {
          COREFSOLVERLOGINIT;
          LERROR << "One annotation on vertex you are trying to cast resulting from an edge out of " << *itv << " is not a Coreference; Coreference not logged" << LENDL;
          continue;
        }
      }
    }

    out << " <reference>\n"
        << " <pos>" << token->position() << "</pos>\n"
        << " <len>" << token->stringForm().length() << "</len>\n"
        << " <string>"<< limastring2utf8stdstring(transcodeToXmlEntities(token->stringForm())) << "</string>\n"
        << " <npId>" << annot->id() << "</npId>\n"
        << " <posVertex>" << annot->morphVertex() << "</posVertex>\n";
    // Antecedent output is deliberately disabled; the condition was
    // originally "hasAntecedent".
    if (false)
    {
      out << " <npRef>" << antecedent->id() << "</npRef>\n";
      out << " <refPosVertex>" << antecedent->morphVertex() << "</refPosVertex>\n";
    }
    out << " <categ>" << annot->categ() << "</categ>\n"
        << " </reference>\n" << endl;
  }

  out << "</coreferences>" << endl;
  out.close();
  TimeUtils::logElapsedTime("CorefSolvingNormalizedXmlLogger");
  return SUCCESS_ID;
}