// Each token of the specified path is // searched into the specified dictionary. LimaStatusCode FullTokenXmlLogger::process( AnalysisContent& analysis) const { TimeUtils::updateCurrentTime(); LinguisticMetaData* metadata=static_cast<LinguisticMetaData*>(analysis.getData("LinguisticMetaData")); if (metadata == 0) { DICTIONARYLOGINIT; LERROR << "no LinguisticMetaData ! abort" << LENDL; return MISSING_DATA; } AnalysisGraph* tokenList=static_cast<AnalysisGraph*>(analysis.getData(m_graphId)); std::ofstream fout; if (!openLogFile(fout,metadata->getMetaData("FileName"))) { MORPHOLOGINIT; LERROR << "Error: cannot open log file" << LENDL; return CANNOT_OPEN_FILE_ERROR; } dump(fout, *tokenList); fout.close(); TimeUtils::logElapsedTime("FullTokenXmlLogger"); return SUCCESS_ID; }
/**
 * Transfers the relations stored in m_relations to the annotation graph as
 * "SemanticRelation" annotations, then empties m_relations.
 *
 * When a "SemanticRelation" annotation already exists between the two
 * annotation vertices, the new type is merged into the existing
 * comma-separated, sorted, duplicate-free type list.
 *
 * @param analysis analysis content holding "AnnotationData" and "RecognizerData"
 * @return false when one of the required data is missing (m_relations is then
 *         left untouched), true otherwise
 */
bool SemanticRelationData::addRelations(AnalysisContent& analysis)
{
#ifdef DEBUG_LP
  SEMANTICANALYSISLOGINIT;
#endif
  auto annotationData = static_cast< AnnotationData* >(
    analysis.getData("AnnotationData"));
  if (annotationData == nullptr)
  {
    SEMANTICANALYSISLOGINIT;
    LERROR << "SemanticRelationData::addRelations no AnnotationData available";
    return false;
  }

  // Register the dump function once
  if (annotationData->dumpFunction("SemanticRelation") == 0)
  {
    annotationData->dumpFunction("SemanticRelation",
                                 new DumpSemanticRelation());
  }

  auto recoData=static_cast<RecognizerData*>(
    analysis.getData("RecognizerData"));
  if (recoData == nullptr)
  {
    SEMANTICANALYSISLOGINIT;
    LERROR << "SemanticRelationData::addRelations no RecognizerData available";
    return false;
  }

  for (const auto& relation : m_relations)
  {
    LinguisticGraphVertex vertex1 = relation.get<0>();
    LinguisticGraphVertex vertex2 = relation.get<1>();

    auto matchesVtx1 = annotationData->matches(recoData->getGraphId(),
                                               vertex1,
                                               "annot");
    auto matchesVtx2 = annotationData->matches(recoData->getGraphId(),
                                               vertex2,
                                               "annot");
    // Without an annotation vertex on both sides there is nothing to
    // annotate; dereferencing begin() of an empty set would be undefined.
    if (matchesVtx1.empty() || matchesVtx2.empty())
    {
      SEMANTICANALYSISLOGINIT;
      LWARN << "SemanticRelationData::addRelations no annotation vertex for"
            << vertex1 << "or" << vertex2 << "; relation ignored";
      continue;
    }
    const auto source = *(matchesVtx1.begin());
    const auto target = *(matchesVtx2.begin());

    if (!annotationData->hasAnnotation(source, target, "SemanticRelation"))
    {
      SemanticRelationAnnotation annot(relation.get<2>());
      GenericAnnotation ga(annot);
      annotationData->annotate(source, target, "SemanticRelation", ga);
    }
    else
    {
      auto annot = annotationData->annotation(source, target,
                                              "SemanticRelation").pointerValue<SemanticRelationAnnotation>();
      SEMANTICANALYSISLOGINIT;
      LWARN << "SemanticRelationData::addRelations There is already a SemanticRelation between"
            << source << "and" << target << annot->type();
      LWARN << "Adding new type" << relation.get<2>();
      // Merge the new type into the existing comma-separated list
      QString type = QString::fromUtf8(annot->type().c_str());
      QStringList typeList = type.split(',');
      typeList << relation.get<2>().c_str();
      typeList.sort();
      typeList.removeDuplicates();
      annot->type(typeList.join(',').toUtf8().constData());
      LWARN << "Adding type is now" << annot->type();
    }
  }

  m_relations.clear();
  return true;
}
// Each token of the specified path is // searched into the specified dictionary. LimaStatusCode DotGraphWriter::process(AnalysisContent& analysis) const { TimeUtils::updateCurrentTime(); AnalysisGraph* anagraph=static_cast<AnalysisGraph*>(analysis.getData(m_graphId)); LinguisticMetaData* metadata=static_cast<LinguisticMetaData*>(analysis.getData("LinguisticMetaData")); if (metadata == 0) { PTLOGINIT; LERROR << "no LinguisticMetaData ! abort"; return MISSING_DATA; } if (anagraph == 0) { PTLOGINIT; LERROR << "no AnalysisGraph named " << m_graphId << " ! "; return MISSING_DATA; } string outputFileName=metadata->getMetaData("FileName") + m_outputSuffix; PosTagger::PosTaggingGraphWriter gw( anagraph->getGraph(), m_language, m_trigramMatrix, m_bigramMatrix); gw.setOptions(m_graphDotOptions,m_nodeDotOptions,m_edgeDotOptions); gw.writeToDotFile(outputFileName,m_vertexDisplay); TimeUtils::logElapsedTime("DotGraphWriter"); return SUCCESS_ID; }
/**
 * Logs the content of "CoreferenceData" as XML in a log file opened through
 * openLogFile() from the "FileName" metadata.
 *
 * Output layout: a <coreference> root containing one
 * <entity mentions="N"> element per item of the coreference data, each
 * holding one <entity_mention> element per token.
 *
 * @param analysis analysis content holding "LinguisticMetaData" and
 *        "CoreferenceData"
 * @return MISSING_DATA when one of the data is absent, UNKNOWN_ERROR when the
 *         log file cannot be opened, SUCCESS_ID otherwise
 */
LimaStatusCode EntityTrackerXmlLogger::process(
  AnalysisContent& analysis) const
{
  SELOGINIT;
  LDEBUG << "EntityTrackerXmlLogger::process";
  TimeUtils::updateCurrentTime();

  /* gives access to the annotations */
  //AnnotationData* annotationData = static_cast< AnnotationData* >(analysis.getData("AnnotationData"));

  /* fetch the graph after the analysis */
  //const LinguisticAnalysisStructure::AnalysisGraph& graph = *(static_cast<LinguisticAnalysisStructure::AnalysisGraph*>(analysis.getData(m_graph)));

  LinguisticMetaData* metadata=static_cast<LinguisticMetaData*>(analysis.getData("LinguisticMetaData"));
  if (metadata == 0)
  {
    LERROR << "no LinguisticMetaData ! abort";
    return MISSING_DATA;
  }

  CoreferenceData* corefData=static_cast<CoreferenceData*>(analysis.getData("CoreferenceData"));
  if (corefData == 0)
  {
    LERROR << "no CoreferenceData ! abort";
    return MISSING_DATA;
  }

  ofstream out;
  if (!openLogFile(out,metadata->getMetaData("FileName")))
  {
    LERROR << "Can't open log file '" << metadata->getMetaData("FileName") << "'";
    return UNKNOWN_ERROR;
  }

  out << "<coreference>" << endl;
  for (CoreferenceData::const_iterator it=corefData->begin(),
       it_end=corefData->end(); it != it_end; it++)
  {
    out << "<entity mentions=\"" << (*it).size() << "\">" << endl;
    for (vector<Token>::const_iterator it2=(*it).begin(),
         it2_end=(*it).end(); it2 != it2_end; it2++)
    {
      // NOTE(review): the mention text is written without XML escaping
      out << "  <entity_mention>"
          << limastring2utf8stdstring((*it2).stringForm())
          << "</entity_mention>";
    }
    // Close the element opened above (an opening tag was emitted here before,
    // which produced malformed XML)
    out << "</entity>" << endl;
  }
  // Close the root element so the document is well-formed
  out << "</coreference>" << endl;
  out.close();

  return SUCCESS_ID;
}
/**
 * Builds the linear text representation (LTR_Text) of the "PosGraph" graph
 * and writes it, in binary form, to the analysis handler named m_handler.
 *
 * @param analysis analysis content holding "LinguisticMetaData", "PosGraph",
 *        "SentenceBoundaries" and "AnalysisHandlerContainer"
 * @return MISSING_DATA when one of these data or the handler is absent,
 *         SUCCESS_ID otherwise
 */
LimaStatusCode LinearTextRepresentationDumper::process(
  AnalysisContent& analysis) const
{
  DUMPERLOGINIT;

  // get metadata
  LinguisticMetaData* metadata=dynamic_cast<LinguisticMetaData*>(analysis.getData("LinguisticMetaData"));
  if (metadata == 0)
  {
    LERROR << "LinearTextRepresentationDumper::process: no LinguisticMetaData ! abort" << LENDL;
    return MISSING_DATA;
  }

  // get the analysis graph
  AnalysisGraph* anaGraph = dynamic_cast<AnalysisGraph*>(analysis.getData("PosGraph"));
  if (anaGraph == 0)
  {
    LERROR << "LinearTextRepresentationDumper::process: no AnalysisGraph ! abort" << LENDL;
    return MISSING_DATA;
  }

  // get sentence boundaries
  SegmentationData* sb = dynamic_cast<SegmentationData*>(analysis.getData("SentenceBoundaries"));
  if (sb == 0)
  {
    LERROR << "LinearTextRepresentationDumper::process: no SentenceBounds ! abort" << LENDL;
    return MISSING_DATA;
  }

  // build LTRText
  LTR_Text textRep;
  LTRTextBuilder builder(m_language, m_stopList);
  builder.buildLTRTextFrom(
    *(anaGraph->getGraph()),
    sb,
    anaGraph->lastVertex(),
    &textRep,
    metadata->getStartOffset());

  // write LTR_Text
  LDEBUG << "handler will be: " << m_handler << LENDL;
  // MediaId langid = static_cast<const Common::MediaticData::LanguageData&>(Common::MediaticData::MediaticData::single().mediaData(metadata->getMetaData("Lang"))).getMedia();
  AnalysisHandlerContainer* h = static_cast<AnalysisHandlerContainer*>(analysis.getData("AnalysisHandlerContainer"));
  // The container is dereferenced right below: treat its absence as missing data
  if (h == 0)
  {
    LERROR << "LinearTextRepresentationDumper::process: no AnalysisHandlerContainer ! abort" << LENDL;
    return MISSING_DATA;
  }
  AbstractTextualAnalysisHandler* handler = static_cast<AbstractTextualAnalysisHandler*>(h->getHandler(m_handler));
  if (handler == 0)
  {
    LERROR << "LinearTextRepresentationDumper::process: handler " << m_handler << " has not been given to the core client" << LENDL;
    return MISSING_DATA;
  }

  handler->startAnalysis();
  HandlerStreamBuf hsb(handler);
  ostream out(&hsb);
  LDEBUG << textRep << LENDL;
  textRep.binaryWriteOn(out);
  out.flush();
  handler->endAnalysis();

  return SUCCESS_ID;
}
/**
 * For every vertex of the "AnalysisGraph" graph that carries a token whose
 * morphosyntactic data is empty, asks the private implementation to set
 * spelling alternatives.
 *
 * @param analysis analysis content holding "AnalysisGraph"
 * @return MISSING_DATA when the graph is absent, SUCCESS_ID otherwise
 */
LimaStatusCode EnchantSpellingAlternatives::process(AnalysisContent& analysis) const
{
  TimeUtils::updateCurrentTime();
  MORPHOLOGINIT;
  LINFO << "MorphologicalAnalysis: starting process EnchantSpellingAlternatives";

  FsaStringsPool& sp=Common::MediaticData::MediaticData::changeable().stringsPool(m_d->m_language);

  AnalysisGraph* tokenList=static_cast<AnalysisGraph*>(analysis.getData("AnalysisGraph"));
  // The graph is dereferenced below: report its absence instead of crashing
  if (tokenList == 0)
  {
    LERROR << "no AnalysisGraph ! abort";
    return MISSING_DATA;
  }
  LinguisticGraph* g=tokenList->getGraph();
  if (g == 0)
  {
    LERROR << "no graph in AnalysisGraph ! abort";
    return MISSING_DATA;
  }

  VertexDataPropertyMap dataMap=get(vertex_data,*g);
  VertexTokenPropertyMap tokenMap=get(vertex_token,*g);
  LinguisticGraphVertexIt it,itEnd;
  for (boost::tie(it,itEnd)=vertices(*g) ; it != itEnd ; it++)
  {
    LDEBUG << "EnchantSpellingAlternatives::process processing vertex " << *it;
    Token* currentToken=tokenMap[*it];
    MorphoSyntacticData* msd=dataMap[*it];

    // Only tokens without any linguistic information get alternatives;
    // vertices lacking a token or a data holder are skipped.
    if (currentToken!=0 && msd!=0 && msd->empty())
    {
      m_d->setEnchantSpellingAlternatives(
        currentToken,
        msd,
        sp);
    }
  }

  LINFO << "MorphologicalAnalysis: ending process EnchantSpellingAlternatives";
  return SUCCESS_ID;
}
/**
 * Adds orthographic alternatives to the tokens of the "AnalysisGraph" graph.
 *
 * For each vertex with a token and morphosyntactic data:
 *  - in confident mode, tokens that already have linguistic information are
 *    skipped;
 *  - alternatives listed as accented forms in the token's dictionary entry
 *    are created;
 *  - in confident mode, processing stops there if this produced information;
 *  - otherwise setOrthographicAlternatives() is applied.
 *
 * @param analysis analysis content holding "AnalysisGraph"
 * @return MISSING_DATA when the graph is absent, SUCCESS_ID otherwise
 */
LimaStatusCode OrthographicAlternatives::process(
  AnalysisContent& analysis) const
{
  TimeUtils::updateCurrentTime();
  MORPHOLOGINIT;
  LINFO << "MorphologicalAnalysis: starting process OrthographicAlternatives";

  StringsPool& sp=Common::LinguisticData::LinguisticData::changeable().stringsPool(m_language);

  AnalysisGraph* tokenList=static_cast<AnalysisGraph*>(analysis.getData("AnalysisGraph"));
  // The graph is dereferenced below: report its absence instead of crashing
  if (tokenList == 0)
  {
    LERROR << "no AnalysisGraph ! abort";
    return MISSING_DATA;
  }
  LinguisticGraph* g=tokenList->getGraph();
  if (g == 0)
  {
    LERROR << "no graph in AnalysisGraph ! abort";
    return MISSING_DATA;
  }

  LinguisticGraphVertexIt it,itEnd;
  VertexDataPropertyMap dataMap=get(vertex_data,*g);
  VertexTokenPropertyMap tokenMap=get(vertex_token,*g);
  boost::tie(it,itEnd)=vertices(*g);
  for (;it!=itEnd;it++)
  {
    LDEBUG << "processing vertex " << *it;
    MorphoSyntacticData* currentTokenData=dataMap[*it];
    Token* tok=tokenMap[*it];

    // Vertices without data or without token cannot receive alternatives
    if (currentTokenData==0 || tok==0)
    {
      continue;
    }

    // if in confidentMode and token has already ling infos, skip
    if ( m_confidentMode && (currentTokenData->size()>0) )
    {
      continue;
    }

    // set orthographic alternatives given by dictionary
    // using the alternatives directly given by the morphosyntactic data
    {
      LDEBUG << "processing alternatives from dico";
      DictionaryEntry* entry=tok->dictionaryEntry();
      if (entry!=0)
      {
        entry->reset();
        if (entry->hasAccented())
        {
          // nextAccented() yields an empty string once exhausted
          LimaString oa = entry->nextAccented();
          while ( oa.size() > 0 )
          {
            createAlternative(tok,currentTokenData,oa,m_dictionary,sp);
            oa = entry->nextAccented();
          }
        }
      }
    }

    // if in confidentMode and token has already ling infos, skip
    // (the dictionary alternatives above may just have added some)
    if (m_confidentMode && (currentTokenData->size() > 0) )
    {
      continue;
    }

    // if no ling infos, then lower and unmark string
    LDEBUG << "set unmark alternatives";
    setOrthographicAlternatives(
      tok,
      currentTokenData,
      m_dictionary,
      m_charChart,
      sp);
  }

  LINFO << "MorphologicalAnalysis: ending process OrthographicAlternatives";
  TimeUtils::logElapsedTime("OrthographicAlternatives");
  return SUCCESS_ID;
}
/**
 * Parses the XML input file returned by getInputFile() with m_parser, using
 * an XMLHandler bound to the graph named m_graph.
 *
 * A temporary "RecognizerData" is registered in the analysis content for the
 * duration of the parsing (as ApplyRecognizer does) so that
 * CreateSpecificEntity actions can be used, and is removed afterwards.
 *
 * @param analysis analysis content holding the graph named m_graph
 * @return MISSING_DATA when the graph is absent; SUCCESS_ID otherwise, even
 *         when the file could not be opened or parsed (an error is logged)
 */
LimaStatusCode SpecificEntitiesLoader::process(AnalysisContent& analysis) const
{
  // get analysis graph
  AnalysisGraph* anaGraph=static_cast<AnalysisGraph*>(analysis.getData(m_graph));
  if (!anaGraph)
  {
    LOGINIT("LP::SpecificEntities");
    LERROR << "no graph '" << m_graph << "' available !";
    return MISSING_DATA;
  }

  // temporary recognizer data, needed by the entity creation actions
  RecognizerData* recognizerData=new RecognizerData;
  analysis.setData("RecognizerData",recognizerData);
  RecognizerResultData* recognizerResults=new RecognizerResultData(m_graph);
  recognizerData->setResultData(recognizerResults);

  try
  {
    SpecificEntitiesLoader::XMLHandler xmlHandler(m_language,analysis,anaGraph);
    m_parser->setContentHandler(&xmlHandler);
    m_parser->setErrorHandler(&xmlHandler);

    QFile inputFile(getInputFile(analysis).c_str());
    const bool opened=inputFile.open(QIODevice::ReadOnly | QIODevice::Text);
    // parsing is attempted only when the file could be opened
    if (!opened || !m_parser->parse( QXmlInputSource(&inputFile)))
    {
      throw XMLException();
    }
  }
  catch (const XMLException& )
  {
    LOGINIT("LP::SpecificEntities");
    LERROR << "Error: failed to parse XML input file";
  }

  // the recognizer data is internal to this process unit: drop it
  recognizerData->deleteResultData();
  analysis.removeData("RecognizerData");

  return SUCCESS_ID;
}
/**
 * Builds the linear text representation (LTR_Text) of the "PosGraph" graph
 * and writes it in binary form to the file named after the "FileName"
 * metadata followed by m_outputSuffix.
 *
 * Missing sentence boundaries are tolerated: a null pointer is then handed
 * to the LTRTextBuilder.
 *
 * @param analysis analysis content holding "LinguisticMetaData", "PosGraph"
 *        and optionally "SentenceBoundaries"
 * @return MISSING_DATA when the metadata or the graph is absent,
 *         SUCCESS_ID otherwise
 * @throws runtime_error when the output file cannot be opened
 */
LimaStatusCode LinearTextRepresentationLogger::process(
  AnalysisContent& analysis) const
{
  DUMPERLOGINIT;

  // metadata
  LinguisticMetaData* metaData=dynamic_cast<LinguisticMetaData*>(analysis.getData("LinguisticMetaData"));
  if (!metaData)
  {
    LERROR << "no LinguisticMetaData ! abort";
    return MISSING_DATA;
  }

  // analysis graph
  AnalysisGraph* posGraph = dynamic_cast<AnalysisGraph*>(analysis.getData("PosGraph"));
  if (!posGraph)
  {
    LERROR << "no AnalysisGraph ! abort";
    return MISSING_DATA;
  }

  // sentence boundaries (optional)
  SegmentationData* sentenceBounds = dynamic_cast<SegmentationData*>(analysis.getData("SentenceBoundaries"));
  if (!sentenceBounds)
  {
    // sentence bounds ignored: the null pointer passed to LTRTextBuilder
    // will be handled there
    LDEBUG << "LinearTextRepresentationDumper::process: no SentenceBounds available: ignored";
  }

  // build the representation
  LTR_Text ltrText;
  LTRTextBuilder ltrBuilder(m_language, m_stopList);
  ltrBuilder.buildLTRTextFrom(
    *(posGraph->getGraph()),
    sentenceBounds,
    posGraph->firstVertex(),
    posGraph->lastVertex(),
    &ltrText,
    metaData->getStartOffset());

  // write it
  string sourceFileName = metaData->getMetaData("FileName");
  string targetFileName = sourceFileName + m_outputSuffix;
  ofstream output(targetFileName.c_str(), std::ofstream::binary);
  if (!output.good())
  {
    throw runtime_error("can't open file " + targetFileName);
  }
  ltrText.binaryWriteOn(output);
  output.flush();
  output.close();

  return SUCCESS_ID;
}
/**
 * Records a relation of type m_semanticRelationType between two vertices in
 * the "SemanticRelationData" of the analysis content, creating and
 * registering that data on first use.
 *
 * @param anagraph unused
 * @param vertex1 first vertex of the relation
 * @param vertex2 second vertex of the relation
 * @param analysis analysis content holding (or receiving) the relation data
 * @return the value returned by SemanticRelationData::relation()
 */
bool CreateSemanticRelation::
operator()(const LinguisticAnalysisStructure::AnalysisGraph& anagraph,
           const LinguisticGraphVertex& vertex1,
           const LinguisticGraphVertex& vertex2,
           AnalysisContent& analysis ) const
{
  LIMA_UNUSED(anagraph);

  SemanticRelationData* relations=
    static_cast<SemanticRelationData*>(analysis.getData("SemanticRelationData"));
  if (!relations)
  {
    // first relation of this analysis: create the container
    relations=new SemanticRelationData();
    analysis.setData("SemanticRelationData",relations);
  }

  return relations->relation(vertex1,vertex2,m_semanticRelationType);
}
/**
 * Refreshes m_annotationData from the "AnnotationData" entry of the analysis
 * content; an error is logged when that entry is absent (the member is then
 * null).
 *
 * @param analysis analysis content to read from
 */
void SegmentFeatureEntity::
update(const AnalysisContent& analysis)
{
  m_annotationData =
    static_cast<const AnnotationData*>(analysis.getData("AnnotationData"));

  if (!m_annotationData)
  {
    LOGINIT("LP::Segmentation");
    LERROR << "no annotation graph available !" << LENDL;
  }
}
void SegmentFeatureRank:: update(const AnalysisContent& analysis) { // store information from segmentation data to know at which segment we are const AnalysisData* data=analysis.getData(m_data); if (data==0) { return; } m_segmData=static_cast<const SegmentationData*>(data); }
/**
 * Converts the EventTemplateData named m_eventData to Events and writes them
 * on the dumper output stream.
 *
 * @param analysis analysis content holding "AnnotationData" and the data
 *        named m_eventData
 * @return MISSING_DATA when the annotation data is absent or when the event
 *         data is not an EventTemplateData; SUCCESS_ID otherwise (including
 *         when m_eventData is empty or names no data, the latter being
 *         logged as an error)
 */
LimaStatusCode EventTemplateDataDumper::process(AnalysisContent& analysis) const
{
  LOGINIT("LP::EventAnalysis");
  LDEBUG << "EventTemplateDataDumper::process" << LENDL;
  TimeUtils::updateCurrentTime();

  // initialize output
  DumperStream* dstream=AbstractTextualAnalysisDumper::initialize(analysis);
  ostream& out=dstream->out();

  const AnnotationData* annotationData = static_cast< AnnotationData* >(analysis.getData("AnnotationData"));
  if (annotationData==0)
  {
    LERROR << "no annotation graph available !" << LENDL;
    // the stream is owned by this function: release it on every exit path
    delete dstream;
    return MISSING_DATA;
  }

  if (! m_eventData.empty())
  {
    const AnalysisData* data =analysis.getData(m_eventData);
    if (data!=0)
    {
      // see if the data is of type Events
      const EventTemplateData* eventData=dynamic_cast<const EventTemplateData*>(data);
      if (eventData==0)
      {
        LERROR << "data '" << m_eventData << "' is neither of type EventData nor Events" << LENDL;
        delete dstream;
        return MISSING_DATA;
      }
      // NOTE(review): ownership of the object returned by convertToEvents()
      // is not visible from here; it is not deleted - confirm it is not leaked.
      Events *events=eventData->convertToEvents(annotationData);
      events->write(out);
    }
    else
    {
      LERROR << "no data of name " << m_eventData << LENDL;
    }
  }

  delete dstream;
  TimeUtils::logElapsedTime("EventTemplateDataDumper");
  return SUCCESS_ID;
}
/**
 * Asks the "SemanticRelationData" of the analysis content to add its
 * relations.
 *
 * @param analysis analysis content holding "SemanticRelationData"
 * @return false when no "SemanticRelationData" is available, otherwise the
 *         value returned by SemanticRelationData::addRelations()
 */
bool SaveSemanticRelation::operator()(AnalysisContent& analysis ) const
{
  SemanticRelationData* relations=
    static_cast<SemanticRelationData*>(analysis.getData("SemanticRelationData"));
  if (!relations)
  {
    return false;
  }
  return relations->addRelations(analysis);
}
void SegmentFeatureInSegment:: update(const AnalysisContent& analysis) { const AnalysisData* data=analysis.getData(m_segmentData); if (data==0) { LOGINIT("LP::Segmentation"); LERROR << SegmentFeatureInSegment_ID << ": No data " << m_segmentData << LENDL; m_data=0; } else { m_data=static_cast<const SegmentationData*>(data); } }
// Datas are extracted from word sense annotations and written on the xml file according to the given dtd format LimaStatusCode WordSenseXmlLogger::process( AnalysisContent& analysis) const { TimeUtils::updateCurrentTime(); LinguisticMetaData* metadata=static_cast<LinguisticMetaData*>(analysis.getData("LinguisticMetaData")); if (metadata == 0) { LOGINIT("WordSenseDisambiguator"); LERROR << "no LinguisticMetaData ! abort"; return MISSING_DATA; } string textFileName = metadata->getMetaData("FileName"); string outputFile = textFileName + m_outputSuffix; ofstream out(outputFile.c_str(), std::ofstream::binary); if (!out.good()) { throw runtime_error("can't open file " + outputFile); } AnalysisGraph* /*anagraph=static_cast<AnalysisGraph*>(analysis.getData("SimpleGraph")); if (anagraph==0)*/ anagraph=static_cast<AnalysisGraph*>(analysis.getData("PosGraph")); if (anagraph==0) { LOGINIT("WordSenseDisambiguator"); LERROR << "no AnalysisGraph ! abort"; return MISSING_DATA; } dump(out, anagraph,/* static_cast<SyntacticData*>(analysis.getData("SyntacticData")),*/ static_cast<AnnotationData*>(analysis.getData("AnnotationData"))); out.flush(); out.close(); TimeUtils::logElapsedTime("WordSenseDisambiguatorXmlLogger"); return SUCCESS_ID; }
/**
 * Asks the "SemanticRelationData" of the analysis content to add its
 * relations.
 *
 * @param analysis analysis content holding "SemanticRelationData"
 * @return false when no "SemanticRelationData" is available, otherwise the
 *         value returned by SemanticRelationData::addRelations()
 */
bool SaveSemanticRelation::operator()(AnalysisContent& analysis ) const
{
#ifdef DEBUG_LP
  SEMLOGINIT;
  LDEBUG << "SaveSemanticRelation::operator()";
#endif

  SemanticRelationData* relations=
    static_cast<SemanticRelationData*>(analysis.getData("SemanticRelationData"));
  if (!relations)
  {
    return false;
  }
  return relations->addRelations(analysis);
}
void SegmentFeatureEntityInData:: update(const AnalysisContent& analysis) { // get result data const AnalysisData* resultData=analysis.getData(m_dataName); if (resultData == 0) { LOGINIT("LP::Segmentation"); LERROR << "no data " << m_data << "in AnalysisContent" << LENDL; } m_data=dynamic_cast<const ApplyRecognizer::RecognizerResultData*>(resultData); if (m_data == 0) { LOGINIT("LP::Segmentation"); LERROR << "data " << m_data << "in AnalysisContent is not a RecognizerResultData" << LENDL; } }
void SegmentFeaturePosition:: update(const AnalysisContent& analysis) { // update offset from metadata const LinguisticMetaData* metadata=static_cast<const LinguisticMetaData*>(analysis.getData("LinguisticMetaData")); if (metadata == 0) { LOGINIT("LP::Segmentation"); LWARN << "no LinguisticMetaData ! abort" << LENDL; } else { try { m_offset=atoi(metadata->getMetaData("StartOffset").c_str()); } catch (LinguisticProcessingException& ) { // do nothing: not set in analyzeText (only in analyzeXmlDocuments) } } }
/**
 * Tells whether the vertex v carries an m_entityAnnotation annotation whose
 * entity type belongs to the group m_entityGroupId.
 *
 * @param graph graph the vertex belongs to
 * @param v vertex to test
 * @param analysis analysis content holding "AnnotationData"
 * @return false when the annotation data is absent, when the vertex has no
 *         "annot" match in the annotation graph or when the matching
 *         annotation vertex has no m_entityAnnotation annotation; otherwise
 *         whether the group id of the entity type equals m_entityGroupId
 */
bool EntityGroupTransition::
compare(const LinguisticAnalysisStructure::AnalysisGraph& graph,
        const LinguisticGraphVertex& v,
        AnalysisContent& analysis,
        const LinguisticAnalysisStructure::Token* /*token*/,
        const LinguisticAnalysisStructure::MorphoSyntacticData* /*data*/) const
{
  // should compare to vertex ?
  AnnotationData* annotationData = static_cast< AnnotationData* >(analysis.getData("AnnotationData"));
  if (annotationData==0)
  {
    AULOGINIT;
    LDEBUG << "EntityGroupTransition::compare: no annotation graph available !";
    return false;
  }

  // find annotationGraphVertex matching the vertex of the current graph
  std::set<AnnotationGraphVertex> matches = annotationData->matches(graph.getGraphId(), v, "annot");
  if (matches.empty())
  {
    AULOGINIT;
    // this branch is the "not found" case: say so in the message
    LDEBUG << "no annotation ("<<graph.getGraphId()<<", "<<v<<", \"annot\") available";
    return false;
  }

  // only the first matching annotation vertex is considered
  AnnotationGraphVertex annotVertex = *(matches.begin());
  if (!annotationData->hasAnnotation(annotVertex, m_entityAnnotation))
  {
    AULOGINIT;
    LDEBUG << "EntityGroupTransition::compare: No " << m_entityAnnotation << " annotation available on " << v;
    return false;
  }

  const SpecificEntityAnnotation* se =
    annotationData->annotation(annotVertex, m_entityAnnotation).
    pointerValue<SpecificEntityAnnotation>();
  Common::MediaticData::EntityType type = se->getType();

  AULOGINIT;
  LDEBUG << "EntityGroupTransition::compare: type = " << type << ", groupId = " << type.getGroupId();
  LDEBUG << "EntityGroupTransition::compare: m_entityGroupId = " << m_entityGroupId;
  LDEBUG << "EntityGroupTransition::compare: tests m_entityGroupId == type.getGroupId() = " << (m_entityGroupId == type.getGroupId());
  return( m_entityGroupId == type.getGroupId() );
}
/**
 * Stores the relation m_relation between v1 and v2 in the "SyntacticData"
 * for later use as a selectional constraint.
 *
 * @param graph graph the vertices belong to
 * @param v1 first vertex of the relation
 * @param v2 second vertex of the relation
 * @param analysis analysis content holding "SyntacticData"
 * @return false when one of the vertices is the first or last vertex of the
 *         graph, or when no "SyntacticData" is available; true once the
 *         relation has been stored
 */
bool StoreForDisambiguation::operator()(
  const LinguisticAnalysisStructure::AnalysisGraph& graph,
  const LinguisticGraphVertex& v1,
  const LinguisticGraphVertex& v2,
  AnalysisContent& analysis ) const
{
  /* Critical Function : comment logging messages */
  SyntacticData* syntacticData=static_cast<SyntacticData*>(analysis.getData("SyntacticData"));

  // the graph's first and last vertices never take part in a relation
  if (v1 == graph.firstVertex() || v1 == graph.lastVertex() ||
      v2 == graph.firstVertex() || v2 == graph.lastVertex() )
  {
    return false;
  }

  // the data is dereferenced below: fail cleanly when it is missing
  if (syntacticData == 0)
  {
    SAPLOGINIT;
    LERROR << "StoreForDisambiguation: no SyntacticData available" << LENDL;
    return false;
  }

  SAPLOGINIT;
  LDEBUG << "StoreForDisambiguation " << v1 << ", " << v2 << ", " << m_relation << LENDL;
  syntacticData->storeRelationForSelectionalConstraint(v1, v2, m_relation);
  return true;
}
/**
 * Logs the specific entities found in the annotation graph as XML on the
 * dumper output stream.
 *
 * The root element is <entities docid=... offsetNode=... offset=...> in
 * compact format and <specific_entities> otherwise. Entities are collected
 * either by following the graph named m_graph from its first vertex
 * (m_followGraph, nested entities are then not included) or by scanning all
 * annotation vertices carrying a "SpecificEntity" annotation.
 *
 * @param analysis analysis content holding "AnnotationData", the graph named
 *        m_graph and "LinguisticMetaData"
 * @return MISSING_DATA when one of these data is absent, SUCCESS_ID otherwise
 */
LimaStatusCode SpecificEntitiesXmlLogger::process(
  AnalysisContent& analysis) const
{
  SELOGINIT;
  LDEBUG << "SpecificEntitiesXmlLogger::process";
  TimeUtils::updateCurrentTime();

  AnnotationData* annotationData = static_cast< AnnotationData* >(analysis.getData("AnnotationData"));
  if (annotationData == 0)
  {
    SELOGINIT;
    LERROR << "no annotationData ! abort";
    return MISSING_DATA;
  }

  LinguisticAnalysisStructure::AnalysisGraph* graphp = static_cast<LinguisticAnalysisStructure::AnalysisGraph*>(analysis.getData(m_graph));
  if (graphp == 0)
  {
    SELOGINIT;
    LERROR << "no graph "<< m_graph <<" ! abort";
    return MISSING_DATA;
  }
  const LinguisticAnalysisStructure::AnalysisGraph& graph = *graphp;
  LinguisticGraph* lingGraph = const_cast<LinguisticGraph*>(graph.getGraph());
  VertexTokenPropertyMap tokenMap = get(vertex_token, *lingGraph);

  LinguisticMetaData* metadata=static_cast<LinguisticMetaData*>(analysis.getData("LinguisticMetaData"));
  if (metadata == 0)
  {
    SELOGINIT;
    LERROR << "no LinguisticMetaData ! abort";
    return MISSING_DATA;
  }

  DumperStream* dstream=initialize(analysis);
  ostream& out=dstream->out();

  uint64_t offset(0);
  try
  {
    offset=atoi(metadata->getMetaData("StartOffset").c_str());
  }
  catch (LinguisticProcessingException& )
  {
    // do nothing: not set in analyzeText (only in analyzeXmlDocuments)
  }

  uint64_t offsetIndexingNode(0);
  try
  {
    offsetIndexingNode=atoi(metadata->getMetaData("StartOffsetIndexingNode").c_str());
  }
  catch (LinguisticProcessingException& )
  {
    // do nothing: not set in analyzeText (only in analyzeXmlDocuments)
  }

  std::string docId("");
  try
  {
    docId=metadata->getMetaData("DocId");
  }
  catch (LinguisticProcessingException& )
  {
    // do nothing: not set in analyzeText (only in analyzeXmlDocuments)
  }

  if (m_compactFormat)
  {
    out << "<entities docid=\"" << docId
        << "\" offsetNode=\"" << offsetIndexingNode
        << "\" offset=\"" << offset
        << "\">" << endl;
  }
  else
  {
    out << "<specific_entities>" << endl;
  }

  if (m_followGraph)
  {
    // instead of looking to all annotations, follow the graph (in
    // morphological graph, some vertices are not related to main graph:
    // idiomatic expressions parts and named entity parts)
    // -> this will not include nested entities
    AnalysisGraph* tokenList=static_cast<AnalysisGraph*>(analysis.getData(m_graph));
    if (tokenList==0)
    {
      LERROR << "graph " << m_graph << " has not been produced: check pipeline";
      // the stream was already created: release it before leaving
      delete dstream;
      return MISSING_DATA;
    }
    LinguisticGraph* followedGraph=tokenList->getGraph();

    // breadth-first traversal from the first vertex
    std::queue<LinguisticGraphVertex> toVisit;
    std::set<LinguisticGraphVertex> visited;
    toVisit.push(tokenList->firstVertex());

    LinguisticGraphOutEdgeIt outItr,outItrEnd;
    while (!toVisit.empty())
    {
      LinguisticGraphVertex v=toVisit.front();
      toVisit.pop();
      if (v == tokenList->lastVertex())
      {
        continue;
      }

      for (boost::tie(outItr,outItrEnd)=out_edges(v,*followedGraph); outItr!=outItrEnd; outItr++)
      {
        LinguisticGraphVertex next=target(*outItr,*followedGraph);
        if (visited.find(next)==visited.end())
        {
          visited.insert(next);
          toVisit.push(next);
        }
      }

      const SpecificEntityAnnotation* annot=getSpecificEntityAnnotation(v,annotationData);
      if (annot != 0)
      {
        outputEntity(out,v,annot,tokenMap,offset);
      }
    }
  }
  else
  {
    // take all annotations
    AnnotationGraphVertexIt itv, itv_end;
    boost::tie(itv, itv_end) = vertices(annotationData->getGraph());
    for (; itv != itv_end; itv++)
    {
      if (annotationData->hasAnnotation(*itv,Common::Misc::utf8stdstring2limastring("SpecificEntity")))
      {
        const SpecificEntityAnnotation* annot = 0;
        try
        {
          annot = annotationData->annotation(*itv,Common::Misc::utf8stdstring2limastring("SpecificEntity"))
                  .pointerValue<SpecificEntityAnnotation>();
        }
        catch (const boost::bad_any_cast& )
        {
          SELOGINIT;
          LERROR << "This annotation is not a SpecificEntity; SE not logged";
          continue;
        }

        // retrieve the id of the created morphological vertex; annotation
        // vertices without an m_graph annotation are skipped
        LinguisticGraphVertex v;
        if (!annotationData->hasIntAnnotation(*itv,Common::Misc::utf8stdstring2limastring(m_graph)))
        {
          continue;
        }
        v = annotationData->intAnnotation(*itv,Common::Misc::utf8stdstring2limastring(m_graph));
        outputEntity(out,v,annot,tokenMap,offset);
      }
    }
  }

  if (m_compactFormat)
  {
    out << "</entities>" << endl;
  }
  else
  {
    out << "</specific_entities>" << endl;
  }

  delete dstream;
  TimeUtils::logElapsedTime("SpecificEntitiesXmlLogger");
  return SUCCESS_ID;
}
/**
 * Builds an "EventData" from the entities found in the segments of type
 * m_segmentType of the segmentation data named m_segmData, and marks the
 * first entity of each type as the main one.
 *
 * @param analysis analysis content holding "AnnotationData", the
 *        segmentation data and the graph it relies on; receives "EventData"
 * @return MISSING_DATA when one of the required data is absent or when the
 *         data named m_segmData is not a SegmentationData, SUCCESS_ID otherwise
 */
LimaStatusCode SimpleEventBuilder::process(AnalysisContent& analysis) const
{
  EVENTANALYZERLOGINIT;
  TimeUtils::updateCurrentTime();
  LDEBUG << "start SimpleEventBuilder" << LENDL;

  // get annotation data (for entities)
  AnnotationData* annotationData = static_cast< AnnotationData* >(analysis.getData("AnnotationData"));
  if (annotationData==0)
  {
    LERROR << "no annotation graph available !" << LENDL;
    return MISSING_DATA;
  }

  // get segmentation data
  AnalysisData* data=analysis.getData(m_segmData);
  if (data==0)
  {
    LERROR << "Missing data '" << m_segmData << "'" << LENDL;
    return MISSING_DATA;
  }
  // checked cast: a static_cast of a non-null pointer can never yield null,
  // which made the type check below dead code
  SegmentationData* segmData=dynamic_cast<SegmentationData*>(data);
  if (segmData==0)
  {
    LERROR << "Failed to interpret data '" << m_segmData << "' as SegmentationData" << LENDL;
    return MISSING_DATA;
  }

  // get graph on which the segmentation data relies
  string graphId=segmData->getGraphId();
  AnalysisGraph* graph=static_cast<AnalysisGraph*>(analysis.getData(graphId));
  if (graph==0)
  {
    LERROR << "Cannot get graph '" << graphId << "' (from segmentation data)" << LENDL;
    return MISSING_DATA;
  }

  EventData* eventData=new EventData;
  LDEBUG << "set new data EventData of type EventData" << LENDL;
  analysis.setData("EventData", eventData);

  // get entities
  map<Common::MediaticData::EntityType,vector<Entity> >& entities=eventData->getEntities();

  // Fetch the segments once: taking begin() and end() from two separate
  // getSegments() calls would compare iterators of different containers if
  // it returns by value.
  const std::vector<Segment>& segments=segmData->getSegments();
  for (std::vector<Segment>::const_iterator it=segments.begin(),it_end=segments.end();
       it!=it_end; it++)
  {
    if ((*it).getType()==m_segmentType)
    {
      LDEBUG << "in segment " << m_segmentType << " [" << (*it).getPosBegin() << "," << (*it).getLength() << "]" << LENDL;
      // get entities in this segment
      getEntitiesFromSegment(entities,graph,(*it).getFirstVertex(),(*it).getLastVertex(),annotationData);
      LDEBUG << "found " << entities.size() << " entities" << LENDL;
    }
    else
    {
      LDEBUG << "ignored segment " << (*it).getType() << LENDL;
    }
  }

  // choose main entities : take first
  for (map<Common::MediaticData::EntityType,vector<Entity> >::iterator it=entities.begin(),
       it_end=entities.end(); it!=it_end; it++)
  {
    if ((*it).second.size()!=0)
    {
      LDEBUG << "set main for entity of type " << (*it).first << " at pos " << (*it).second[0].getPosition() << LENDL;
      (*it).second[0].setMain(true);
    }
  }

  TimeUtils::logElapsedTime("SimpleEventBuilder");
  return SUCCESS_ID;
}
/**
 * Logs the segments of the segmentation data named m_data as XML, in a log
 * file opened through openLogFile() from the "FileName" metadata.
 *
 * Each segment is written as
 * <segment><pos>..</pos><len>..</len><type>..</type></segment> inside a
 * <segmentation> root; when the data is absent the root is written empty.
 *
 * @param analysis analysis content holding "LinguisticMetaData" and the
 *        segmentation data
 * @return MISSING_DATA when the metadata is absent, UNKNOWN_ERROR when the
 *         log file cannot be opened, SUCCESS_ID otherwise
 */
LimaStatusCode SegmentationDataXmlLogger::process(
  AnalysisContent& analysis) const
{
  LOGINIT("LP::Segmentation");
  LDEBUG << "SegmentationDataXmlLogger::process" << LENDL;
  TimeUtils::updateCurrentTime();

  LinguisticMetaData* metaData=static_cast<LinguisticMetaData*>(analysis.getData("LinguisticMetaData"));
  if (!metaData)
  {
    LOGINIT("LP::Segmentation");
    LERROR << "no LinguisticMetaData ! abort" << LENDL;
    return MISSING_DATA;
  }

  // open output file
  ofstream output;
  if (!openLogFile(output,metaData->getMetaData("FileName")))
  {
    LOGINIT("LP::Segmentation");
    LERROR << "Can't open log file '" << metaData->getMetaData("FileName") << "'" << LENDL;
    return UNKNOWN_ERROR;
  }

  // document id (useful for XML documents); the offset lookups are disabled
  std::string docId("");
  try
  {
    docId=metaData->getMetaData("DocId");
  }
  catch (LinguisticProcessingException& )
  {
    // do nothing: not set in analyzeText (only in analyzeXmlDocuments)
  }

  // log
  output << "<segmentation>" << endl;
  const AnalysisData* segmentation=analysis.getData(m_data);
  if (segmentation==0)
  {
    LOGINIT("LP::Segmentation");
    LDEBUG << "no SegmentationData of name " << m_data << LENDL;
  }
  else
  {
    const SegmentationData* segmentationData=static_cast<const SegmentationData*>(segmentation);
    const vector<Segment> segments=segmentationData->getSegments();
    vector<Segment>::const_iterator segment=segments.begin();
    const vector<Segment>::const_iterator lastSegment=segments.end();
    while (segment!=lastSegment)
    {
      output << "<segment>"
             << "<pos>" << segment->getPosBegin() << "</pos>"
             << "<len>" << segment->getLength() << "</len>"
             << "<type>" << segment->getType() << "</type>"
             << "</segment>" << endl;
      segment++;
    }
  }
  output << "</segmentation>" << endl;
  output.close();

  TimeUtils::logElapsedTime("SegmentationDataXmlLogger");
  return SUCCESS_ID;
}
/**
 * Computes the feature value of a vertex: the name of its specific entity
 * type when it has a "SpecificEntity" annotation, else the first normalized
 * form of its morphosyntactic data, else (when that form is empty) the
 * surface form of its token.
 *
 * @param graph graph the vertex belongs to
 * @param v vertex to describe
 * @param analysis analysis content holding "AnnotationData"
 * @return the value described above; "NAN" when the vertex has neither a
 *         specific entity nor any morphosyntactic data
 */
std::string FeatureLemmaSpecificEntity::
getValue(const LinguisticAnalysisStructure::AnalysisGraph* graph,
         LinguisticGraphVertex v,
         AnalysisContent &analysis
        ) const
{
  std::string mxvalue("NAN");

  Common::AnnotationGraphs::AnnotationData *annot =
    static_cast< Common::AnnotationGraphs::AnnotationData* >(analysis.getData("AnnotationData"));

  // Without annotation data there is no entity to look for: go straight to
  // the lemma fallback instead of dereferencing a null pointer.
  if (annot != 0)
  {
    const LimaString specificEntityKey=Common::Misc::utf8stdstring2limastring("SpecificEntity");
    std::set< AnnotationGraphVertex > matches = annot->matches(graph->getGraphId(),v,"annot");
    // when several matches carry an entity, the last one wins
    for (std::set< AnnotationGraphVertex >::const_iterator it = matches.begin();
         it != matches.end(); it++)
    {
      if (annot->hasAnnotation(*it, specificEntityKey))
      {
        AnnotationGraphVertex vx=*it;
        const SpecificEntityAnnotation* se =
          annot->annotation(vx, specificEntityKey).
          pointerValue<SpecificEntityAnnotation>();
        LimaString str= Common::MediaticData::MediaticData::single().getEntityName(se->getType());
        mxvalue=Common::Misc::limastring2utf8stdstring(str);
      }
    }
  }

  // replace NAN values by lemmas
  if (mxvalue == "NAN")
  {
    MorphoSyntacticData* data=get(vertex_data,*(graph->getGraph()),v);
    if (data != 0)
    {
      // take first
      MorphoSyntacticData::const_iterator first=data->begin();
      if (first != data->end())
      {
        mxvalue = Common::Misc::limastring2utf8stdstring(
          Common::MediaticData::MediaticData::single().stringsPool(m_language)[(*first).normalizedForm]);
      }
    }
  }

  // replace empty lemma values by tokens
  if (mxvalue == "" )
  {
    Token* token=get(vertex_token,*(graph->getGraph()),v);
    if (token != 0)
    {
      mxvalue = Common::Misc::limastring2utf8stdstring(token->stringForm());
    }
  }

  return mxvalue;
}
/**
 * Finds paragraph boundaries: locates the occurrences of
 * m_paragraphSeparator in the "Text" data, then walks the graph named
 * m_graph breadth-first and adds a "paragraph" segment to the
 * "ParagraphBoundaries" data each time a token lies beyond the next
 * separator position.
 *
 * @param analysis analysis content holding the graph and "Text"; receives
 *        "ParagraphBoundaries"
 * @return MISSING_DATA when the graph or the text is absent,
 *         SUCCESS_ID otherwise (including when no paragraph is found)
 */
LimaStatusCode ParagraphBoundariesFinder::process(
  AnalysisContent& analysis) const
{
  TimeUtils::updateCurrentTime();
  SENTBOUNDLOGINIT;
  LINFO << "start finding paragraph founds";

  // find paragraphs in text (positions of double carriage returns),
  // then find corresponding vertices in graph
  AnalysisGraph* graph=static_cast<AnalysisGraph*>(analysis.getData(m_graph));
  if (graph==0)
  {
    LERROR << "no graph '" << m_graph << "' available !";
    return MISSING_DATA;
  }

  SegmentationData* boundaries=new SegmentationData(m_graph);
  analysis.setData("ParagraphBoundaries",boundaries);

  LimaStringText* text=static_cast<LimaStringText*>(analysis.getData("Text"));
  // The text is dereferenced below: report its absence instead of crashing
  if (text==0)
  {
    LERROR << "no text available !";
    return MISSING_DATA;
  }

  // matches the first character that is not part of the separator
  const QRegExp nonSeparator(QString(QLatin1String("[^%1]")).arg(m_paragraphSeparator));

  std::vector<uint64_t> paragraphPositions;
  int currentPos=0;
  int i=text->indexOf(m_paragraphSeparator,currentPos);
  while (i!=-1)
  {
    paragraphPositions.push_back(static_cast<uint64_t>(i));
    // goto next char that is not a carriage return
    currentPos=text->indexOf(nonSeparator,i+1);
    if (currentPos==-1)
    {
      // Only separators remain up to the end of the text. Searching from -1
      // would restart from the last character, find the same separator
      // again and never terminate.
      break;
    }
    i=text->indexOf(m_paragraphSeparator,currentPos);
  }

  if (paragraphPositions.empty())
  {
    LWARN << "no paragraph found";
    return SUCCESS_ID;
  }

  // find vertices related to positions in graph
  uint64_t parNum=0;
  std::deque<LinguisticGraphVertex> toVisit;
  std::set<LinguisticGraphVertex> visited;

  LinguisticGraphVertex beginParagraph=graph->firstVertex();
  toVisit.push_back(graph->firstVertex());
  visited.insert(graph->firstVertex());

  while (!toVisit.empty())
  {
    LinguisticGraphVertex currentVertex=toVisit.front();
    toVisit.pop_front();

    if (currentVertex == graph->lastVertex())
    {
      // end of the graph
      continue; // may be other nodes to test in queue
    }

    if (currentVertex != graph->firstVertex())
    {
      Token* t = get(vertex_token,*(graph->getGraph()),currentVertex);
      uint64_t position=t->position();
      if (position >= (paragraphPositions[parNum]+1))
      {
        boundaries->add(Segment("paragraph",beginParagraph,currentVertex,graph));
        beginParagraph=currentVertex;
        parNum++;
        if (parNum >= paragraphPositions.size())
        {
          // every separator position has been consumed
          break;
        }
      }
    }

    // store following nodes to test
    LinguisticGraphOutEdgeIt outEdge,outEdge_end;
    boost::tie(outEdge,outEdge_end)=out_edges(currentVertex,*(graph->getGraph()));
    for (; outEdge!=outEdge_end; outEdge++)
    {
      LinguisticGraphVertex next=target(*outEdge,*(graph->getGraph()));
      if (visited.find(next)==visited.end())
      {
        toVisit.push_back(next);
        visited.insert(next);
      }
    }
  }

  TimeUtils::logElapsedTime("ParagraphBoundariesFinder");
  return SUCCESS_ID;
}
/**
 * Reloads part-of-speech tags from an XML file and applies them to the
 * "PosGraph" analysis graph.
 *
 * The XML file is parsed with m_parser through an ExampleLoader::XMLHandler,
 * whose m_tagIndex is then looked up by token position. For every vertex
 * carrying a token, the symbolic "MICRO" value read from the handler is
 * compared with the one of the vertex; when they differ, the properties of
 * the first element of the vertex morphosyntactic data are replaced by the
 * handler's value. Progress is traced on standard output.
 *
 * NOTE(review): the input file path is hard-coded.
 * NOTE(review): if m_tagIndex is a std::map, operator[] inserts a default
 * entry for positions absent from the file - confirm this is intended.
 *
 * @param analysis the analysis content holding the "PosGraph" graph
 * @return MISSING_DATA if the "PosGraph" graph is not available; SUCCESS_ID
 *         otherwise, even when the XML file cannot be opened or parsed (the
 *         failure is only logged)
 */
LimaStatusCode ExampleLoader::process(AnalysisContent& analysis) const
{
  // get linguistic graph; the analysis graph itself may be missing, so it
  // must be checked before asking it for its underlying graph
  AnalysisGraph* anaGraph=static_cast<AnalysisGraph*>(analysis.getData("PosGraph"));
  LinguisticGraph* lingGraph=(anaGraph==0) ? 0 : anaGraph->getGraph();
  if (lingGraph==0)
  {
    PROCESSORSLOGINIT;
    LERROR << "no graph 'PosGraph' available !";
    return MISSING_DATA;
  }

  try
  {
    ExampleLoader::XMLHandler handler(m_language,analysis,anaGraph);
    m_parser->setContentHandler(&handler);
    m_parser->setErrorHandler(&handler);
    QFile file("/tmp/mm-lp.morphoSyntacticalAnalysis-changed.tmp");
    if (!file.open(QIODevice::ReadOnly | QIODevice::Text))
    {
      throw XMLException();
    }
    if (!m_parser->parse( QXmlInputSource(&file)))
    {
      throw XMLException();
    }

    // These lookups do not depend on the vertex: resolve them once instead
    // of once (or more) per token.
    const Common::PropertyCode::PropertyCodeManager& codeManager=
      static_cast<const Common::MediaticData::LanguageData&>(
        Common::MediaticData::MediaticData::single().mediaData(m_language)).getPropertyCodeManager();
    const auto& microManager=codeManager.getPropertyManager("MICRO");
    const Common::PropertyCode::PropertyAccessor microAccessor=codeManager.getPropertyAccessor("MICRO");

    LinguisticGraph::vertex_iterator vxItr,vxItrEnd;
    boost::tie(vxItr,vxItrEnd) = boost::vertices(*lingGraph);
    for (; vxItr!=vxItrEnd; ++vxItr)
    {
      MorphoSyntacticData* morphoData=get(vertex_data,*lingGraph, *vxItr);
      Token* ft=get(vertex_token,*lingGraph,*vxItr);
      if (ft==0 || morphoData==0)
      {
        // nothing to compare or update on a vertex lacking token or data
        continue;
      }

      const QString tag=QString::fromStdString(
        microManager.getPropertySymbolicValue(handler.m_tagIndex[ft->position()]));
      const QString graphTag=QString::fromStdString(
        microManager.getPropertySymbolicValue(morphoData->firstValue(microAccessor)));
      cout << " la premiere categorie de " << ft->stringForm() << " est " << graphTag << endl;

      // if the value from the map differs from the one of the graph node at
      // position n, replace the node's value with the value from the map
      if (tag!=graphTag)
      {
        const QString tagBefore=QString::fromStdString(
          microManager.getPropertySymbolicValue(morphoData->at(0).properties));
        cout << "le token a la position " << ft->position() << " passe de " << morphoData->at(0).properties << endl;
        morphoData->at(0).properties=handler.m_tagIndex[ft->position()];
        cout << " a la position " << morphoData->at(0).properties << endl;
        const QString tagAfter=QString::fromStdString(
          microManager.getPropertySymbolicValue(morphoData->at(0).properties));
        cout << "Et la chaîne passe de " << tagBefore << " à " << tagAfter << endl;
        put(vertex_data, *lingGraph, *vxItr, morphoData);
        cout << " a la position " << morphoData->at(0).properties << endl;
      }
    }
  }
  catch (const XMLException& )
  {
    // the failure is reported but does not abort the pipeline
    PROCESSORSLOGINIT;
    LERROR << "Error: failed to parse XML input file";
  }
  return SUCCESS_ID;
}
/**
 * Dumps the syntactic analysis as XML on the stream obtained from
 * initialize(): an XML declaration, then a <syntactic_analysis_dump> element
 * filled by one dumpLimaData() call per segment of "SentenceBoundaries".
 *
 * @param analysis the analysis content; must provide "LinguisticMetaData",
 *        "SyntacticData", "PosGraph" and "SentenceBoundaries"
 * @return MISSING_DATA if one of the required data is absent, SUCCESS_ID
 *         otherwise
 */
LimaStatusCode SyntacticAnalysisXmlLogger::process(
  AnalysisContent& analysis) const
{
  TimeUtils::updateCurrentTime();
  LinguisticMetaData* metadata=static_cast<LinguisticMetaData*>(analysis.getData("LinguisticMetaData"));
  if (metadata == 0)
  {
    SALOGINIT;
    LERROR << "no LinguisticMetaData ! abort" << LENDL;
    return MISSING_DATA;
  }

  // dstream is owned by this function: every exit path below this point
  // has to delete it
  DumperStream* dstream=initialize(analysis);
  std::ostream& outputStream=dstream->out();

  SALOGINIT;

  const SyntacticData* syntacticData=static_cast<const SyntacticData*>(analysis.getData("SyntacticData"));
  if (syntacticData==0)
  {
    LERROR << "no SyntacticData ! abort" << LENDL;
    delete dstream;
    return MISSING_DATA;
  }

  AnalysisGraph* anagraph=static_cast<AnalysisGraph*>(analysis.getData("PosGraph"));
  if (anagraph==0)
  {
    LERROR << "no AnalysisGraph ! abort" << LENDL;
    delete dstream;
    return MISSING_DATA;
  }

  SegmentationData* sb=static_cast<SegmentationData*>(analysis.getData("SentenceBoundaries"));
  if (sb==0)
  {
    LERROR << "no SentenceBounds ! abort" << LENDL;
    delete dstream;
    return MISSING_DATA;
  }

  outputStream << "<?xml version='1.0' encoding='UTF-8'?>" << std::endl;
  outputStream << "<syntactic_analysis_dump>" << std::endl;

  // dump each sentence in turn
  for (std::vector<Segment>::iterator sbItr=(sb->getSegments()).begin();
       sbItr!=(sb->getSegments()).end();
       ++sbItr)
  {
    LinguisticGraphVertex beginSentence=sbItr->getFirstVertex();
    LinguisticGraphVertex endSentence=sbItr->getLastVertex();
    dumpLimaData(outputStream, beginSentence, endSentence, anagraph, syntacticData);
  }

  outputStream << "</syntactic_analysis_dump>" << std::endl;
  delete dstream;

  TimeUtils::logElapsedTime("SyntacticAnalysisXmlLogger");
  return SUCCESS_ID;
}
/**
 * Writes the semantic annotations and semantic relations found in the
 * annotation graph to an XML log file.
 *
 * The output is a <relations> element (with the document id and the
 * indexing-node offset read from the metadata, when set) containing one
 * <annotation> element per annotation-graph vertex holding a
 * "SemanticAnnotation" and one <relation> element per annotation-graph edge
 * holding a "SemanticRelation".
 *
 * NOTE(review): attribute values are written as-is, without XML escaping.
 *
 * @param analysis the analysis content; must provide "AnnotationData",
 *        the graph named m_graph and "LinguisticMetaData"
 * @return MISSING_DATA if one of the required data is absent, UNKNOWN_ERROR
 *         if the log file cannot be opened, SUCCESS_ID otherwise
 */
LimaStatusCode SemanticRelationsXmlLogger::process(
  AnalysisContent& analysis) const
{
  TimeUtils::updateCurrentTime();
  SEMLOGINIT;
  // plain trace message: not an error condition
  LDEBUG << "SemanticRelationsXmlLogger" << LENDL;

  AnnotationData* annotationData = static_cast< AnnotationData* >(analysis.getData("AnnotationData"));
  if (annotationData == 0)
  {
    LERROR << "no AnnotationData ! abort" << LENDL;
    return MISSING_DATA;
  }

  // check the pointer before binding a reference to the pointed graph
  LinguisticAnalysisStructure::AnalysisGraph* anagraph =
    static_cast<LinguisticAnalysisStructure::AnalysisGraph*>(analysis.getData(m_graph));
  if (anagraph == 0)
  {
    LERROR << "no AnalysisGraph ! abort" << LENDL;
    return MISSING_DATA;
  }
  const LinguisticAnalysisStructure::AnalysisGraph& graph = *anagraph;
  LinguisticGraph* lingGraph = const_cast<LinguisticGraph*>(graph.getGraph());
  VertexTokenPropertyMap tokenMap = get(vertex_token, *lingGraph);

  LinguisticMetaData* metadata=static_cast<LinguisticMetaData*>(analysis.getData("LinguisticMetaData"));
  if (metadata == 0)
  {
    LERROR << "no LinguisticMetaData ! abort" << LENDL;
    return MISSING_DATA;
  }

  ofstream out;
  if (!openLogFile(out,metadata->getMetaData("FileName")))
  {
    LERROR << "Can't open log file " << LENDL;
    return UNKNOWN_ERROR;
  }

  uint64_t offset(0);
  try
  {
    offset=atoi(metadata->getMetaData("StartOffset").c_str());
  }
  catch (const LinguisticProcessingException& )
  {
    // do nothing: not set in analyzeText (only in analyzeXmlDocuments)
  }

  uint64_t offsetIndexingNode(0);
  try
  {
    offsetIndexingNode=atoi(metadata->getMetaData("StartOffsetIndexingNode").c_str());
  }
  catch (const LinguisticProcessingException& )
  {
    // do nothing: not set in analyzeText (only in analyzeXmlDocuments)
  }

  std::string docId("");
  try
  {
    docId=metadata->getMetaData("DocId");
  }
  catch (const LinguisticProcessingException& )
  {
    // do nothing: not set in analyzeText (only in analyzeXmlDocuments)
  }

  out << "<relations docid=\"" << docId
      << "\" offsetNode=\"" << offsetIndexingNode
      << "\">" << endl;

  // annotation names are the same for every vertex/edge: convert them once
  const auto semanticAnnotationKey = Common::Misc::utf8stdstring2limastring("SemanticAnnotation");
  const auto semanticRelationKey = Common::Misc::utf8stdstring2limastring("SemanticRelation");

  // look at all vertices for annotations
  AnnotationGraphVertexIt itv, itv_end;
  boost::tie(itv, itv_end) = vertices(annotationData->getGraph());
  for (; itv != itv_end; ++itv)
  {
    LDEBUG << "SemanticRelationsXmlLogger on annotation vertex " << *itv << LENDL;
    if (!annotationData->hasAnnotation(*itv,semanticAnnotationKey))
    {
      continue;
    }
    const SemanticAnnotation* annot = 0;
    try
    {
      annot = annotationData->annotation(*itv,semanticAnnotationKey)
        .pointerValue<SemanticAnnotation>();
    }
    catch (const boost::bad_any_cast& )
    {
      // the stored value is not of the type expected for vertices
      LERROR << "This annotation is not a SemanticAnnotation" << LENDL;
      continue;
    }

    // output
    out << "<annotation type=\"" << annot->getType() << "\">" << endl
        << vertexStringForSemanticAnnotation("vertex",*itv,tokenMap,annotationData,offset)
        << "</annotation>" << endl;
  }

  // look at all edges for relations
  AnnotationGraphEdgeIt it,it_end;
  const AnnotationGraph& annotGraph=annotationData->getGraph();
  boost::tie(it, it_end) = edges(annotGraph);
  for (; it != it_end; ++it)
  {
    LDEBUG << "SemanticRelationsXmlLogger on annotation edge "
           << source(*it,annotGraph) << "->"
           << target(*it,annotGraph) << LENDL;
    if (!annotationData->hasAnnotation(*it,semanticRelationKey))
    {
      continue;
    }
    LDEBUG << "found semantic relation" << LENDL;
    const SemanticRelationAnnotation* annot = 0;
    try
    {
      annot = annotationData->annotation(*it,semanticRelationKey)
        .pointerValue<SemanticRelationAnnotation>();
    }
    catch (const boost::bad_any_cast& )
    {
      // the stored value is not of the type expected for edges
      LERROR << "This annotation is not a SemanticRelation" << LENDL;
      continue;
    }

    // output
    out << "<relation type=\"" << annot->type() << "\">" << endl
        << vertexStringForSemanticAnnotation("source",source(*it,annotGraph),tokenMap,annotationData,offset)
        << vertexStringForSemanticAnnotation("target",target(*it,annotGraph),tokenMap,annotationData,offset)
        << "</relation>" << endl;
  }

  out << "</relations>" << endl;
  out.close();

  TimeUtils::logElapsedTime("SemanticRelationsXmlLogger");
  return SUCCESS_ID;
}
/**
 * Returns the name of the specific-entity type attached to a graph vertex.
 *
 * The annotation-graph vertices matching @p v (relation "annot") are
 * inspected; for each one holding a "SpecificEntity" annotation, the entity
 * type name is retrieved. When several matches carry such an annotation, the
 * last one visited wins.
 *
 * @param graph the analysis graph @p v belongs to
 * @param v the vertex to inspect
 * @param analysis the analysis content holding "AnnotationData"
 * @return the UTF-8 entity type name, or "NAN" when the annotation data is
 *         missing or no matching vertex holds a "SpecificEntity" annotation
 */
std::string FeatureSpecificEntity::getValue(
  const LinguisticAnalysisStructure::AnalysisGraph* graph,
  LinguisticGraphVertex v,
  AnalysisContent &analysis ) const
{
  std::string typeName("NAN");

  Common::AnnotationGraphs::AnnotationData *annot =
    static_cast< Common::AnnotationGraphs::AnnotationData* >(analysis.getData("AnnotationData"));
  if (annot == 0)
  {
    // no annotation data at all: the vertex cannot be a specific entity
    return typeName;
  }

  // the annotation name is the same for every match: convert it once
  const auto entityKey = Common::Misc::utf8stdstring2limastring("SpecificEntity");

  const std::set< AnnotationGraphVertex > matches = annot->matches(graph->getGraphId(),v,"annot");
  for (AnnotationGraphVertex vx : matches)
  {
    if (!annot->hasAnnotation(vx, entityKey))
    {
      continue;
    }
    const SpecificEntityAnnotation* se =
      annot->annotation(vx, entityKey).pointerValue<SpecificEntityAnnotation>();
    LimaString str = Common::MediaticData::MediaticData::single().getEntityName(se->getType());
    typeName = Common::Misc::limastring2utf8stdstring(str);
  }
  return typeName;
}