/**
 * Handles one edge of the linguistic graph by writing its target vertex to
 * the output stream.
 *
 * When the target vertex carries a "WordSense" annotation, the annotation
 * writes itself as XML. Otherwise the string form of the vertex token (if the
 * vertex has one) is written. In every case a single space follows.
 *
 * @param e edge being examined; only its target vertex is used
 * @param g graph the edge belongs to
 */
void DumpXMLAnnotationVisitor::examine_edge(LinguisticGraphEdge e,
                                            const LinguisticGraph& g)
{
  LinguisticGraphVertex targetVx = target(e, g);

  if (!m_ad->hasAnnotation(targetVx, Common::Misc::utf8stdstring2limastring("WordSense")))
  {
    // No word sense on this vertex: fall back to the plain token text.
    Token* tok = get(vertex_token, g, targetVx);
    if (tok != 0)
    {
      std::string form = Common::Misc::limastring2utf8stdstring(tok->stringForm());
      m_ostream << form;
    }
  }
  else
  {
    GenericAnnotation annotation = (m_ad->annotation(targetVx, utf8stdstring2limastring("WordSense")));
    Lima::LinguisticProcessing::WordSenseDisambiguation::WordSenseAnnotation senseAnnotation;
    try
    {
      senseAnnotation = annotation.value<Lima::LinguisticProcessing::WordSenseDisambiguation::WordSenseAnnotation>();
      senseAnnotation.outputXml(m_ostream, g);
    }
    catch (const boost::bad_any_cast&)
    {
      // The annotation named "WordSense" does not hold the expected type.
      LOGINIT("WordSenseDisambiguator");
      LERROR << "non word sense annotation";
    }
  }

  m_ostream << " ";
}
/**
 * Builds the client from the "lp-structuredXmlreaderclient" module section of
 * the given configuration.
 *
 * A DocumentReader is created from that section and this object is registered
 * on it as the linguistic XML document handler. m_emptyTextChars is set to
 * space, tab and newline.
 *
 * @param configuration parsed configuration file providing the module section
 */
CoreXmlReaderClient::CoreXmlReaderClient(
    Lima::Common::XMLConfigurationFiles::XMLConfigurationFileParser &configuration)
  : /*m_delegate(0),*/
    m_handler(0)
{
#ifdef DEBUG_LP
  XMLREADERCLIENTLOGINIT;
  LDEBUG << "CoreXmlReaderClient::CoreXmlReaderClient";
#endif

  // Module-specific part of the configuration that drives the document reader.
  ModuleConfigurationStructure &readerSection =
      configuration.getModuleConfiguration("lp-structuredXmlreaderclient");

  m_documentReader = new DocumentsReader::DocumentReader(readerSection);
  m_documentReader->setLinguisticXMLDocHandler(this);

  m_emptyTextChars = utf8stdstring2limastring(" \t\n");
}
// takes a token LimaString Text::token() { TOKENIZERLOGINIT; // Creates a new token uint64_t delta = _curPtr; if (m_text[_curPtr] >= 0xD800 || _curPtr == _debPtr) { delta++; } if (_debPtr >= m_text.size()) { LERROR << "Empty token !"; _debPtr = delta; _curSettings.reset(); return utf8stdstring2limastring(""); } LimaString str=m_text.mid( _debPtr, (delta-_debPtr)); LDEBUG << " Adding token '" << str << "'"; StringsPoolIndex form=(*_stringsPool)[str]; Token *tToken = new Token(form,str,_debPtr+1,(delta-_debPtr)); if (tToken == 0) throw MemoryErrorException(); // @todo: set default status here, according to structured status (alpha,numeric etc...) // instead of setting it at each change of status (setAlphaCapital, setNumeric etc...) tToken->setStatus(_curSettings); // LDEBUG << " _curSettings is " << _curSettings.toString(); LDEBUG << " status is " << tToken->status().toString(); // Adds on the path LinguisticGraphVertex newVx=add_vertex(*_tTokenGraph); put(vertex_token,*_tTokenGraph,newVx,tToken); put(vertex_data,*_tTokenGraph,newVx,new MorphoSyntacticData()); add_edge(_currentVx,newVx,*_tTokenGraph); _currentVx=newVx; _debPtr = delta; _curSettings.reset(); return str; }
void DumpXMLAnnotationVisitor::examine_edge(LinguisticGraphEdge e, const LinguisticGraph& g) { COREFSOLVERLOGINIT; LDEBUG << "DumpXMLAnnotationVisitor::examine_edge"; LinguisticGraphVertex v = target(e, g); // let process sentences like (...) have automatically tuned (...) where the graph has one token "have_tuned" with one branch "automatically" "tuned" and another one with the following of the sentence LinguisticGraphOutEdgeIt it, it_end; boost::tie(it, it_end) = boost::out_edges(v,g); if (it == it_end) return; // let process sentences where one tag has not been fully determined and there is still two (or more) tag options LinguisticGraphVertex v2 = target(m_lastEdge, g); if (v2==v) return; if (m_lastEdge!=LinguisticGraphEdge() && are_equivalent(e, v2, v, g)) return; // begin // store this edge for the future tests if (get(vertex_token, g,v)!=0) m_lastEdge = e; // const FsaStringsPool& stringsPool= Common::MediaticData::MediaticData::single().stringsPool(m_language); Token* token = get(vertex_token, g, v); // processing of cases like "s'y introduire", tokenized as "y s'introduire" if (token != 0 && (token->stringForm() == "en" || token->stringForm() =="y")) { LinguisticGraphOutEdgeIt it, it_end; boost::tie(it, it_end) = boost::out_edges(v,g); if (it != it_end) { Token* t = get(vertex_token, g,target(*it, g)); if (t!=0 && Common::Misc::limastring2utf8stdstring(t->stringForm()).substr(0,2)=="s'") { m_ostream << "s'"; } } } // process std::set< AnnotationGraphVertex > matches = m_ad->matches("PosGraph",v,"annot"); if (matches.empty()) { COREFSOLVERLOGINIT; LERROR << "DumpXMLAnnotationVisitor::examine_edge No annotation graph vertex matches PoS graph vertex " << v << ". 
This should not happen."; return; } AnnotationGraphVertex av = *matches.begin(); if (m_ad->hasAnnotation(av, Common::Misc::utf8stdstring2limastring("Coreferent"))) { GenericAnnotation ga = (m_ad->annotation(av,utf8stdstring2limastring("Coreferent"))); Lima::LinguisticProcessing::Coreferences::CoreferentAnnotation ca; try { ca = ga.value<Lima::LinguisticProcessing::Coreferences::CoreferentAnnotation>(); ca.outputXml(m_ostream,g,m_ad); } catch (const boost::bad_any_cast& ) { COREFSOLVERLOGINIT; LERROR << "non coreferent annotation"<< LENDL; } } else { Token* token = get(vertex_token, g, v); if (token != 0) { std::string s = Common::Misc::limastring2utf8stdstring(token->stringForm()); // processing of cases like "s'y introduire", tokenized as "y s'introduire" if (s.substr(0,2) == "s'") { Token* t = get(vertex_token,g,source(e, g)); if (t!=0 && (Common::Misc::limastring2utf8stdstring(t->stringForm()).substr(0,2)=="en" || Common::Misc::limastring2utf8stdstring(t->stringForm()).substr(0,2)=="y")) { s = s.substr(2,s.size()); } } // processing of cases like "le Canada a-t-il envisagé...", où le mot entre "a" et "envisagé" se retrouverait rejeté après "a_envisagé". Nécessaire de traiter car problématique pour l'évaluation quand il s'agit d'un pronom clitique comme dans ce cas-ci. std::string formerMemo = m_memo; match_results<std::string::const_iterator> what; string::const_iterator start = s.begin(); string::const_iterator end = s.end(); if (regex_search(s, what, regex("_"))) { m_memo = std::string(what[0].second,end) + " "; s = std::string(start,what[0].first); } else m_memo = ""; m_ostream << formerMemo << s; if (token->status().isAlphaPossessive()) { m_ostream << "'s "; } } } m_ostream << " "; }
/**
 * Writes every "Coreferent" annotation of the analysis to the log file as a
 * <coreferences> XML document.
 *
 * For each annotation graph vertex carrying a "Coreferent" annotation, one
 * <reference> element is written with the position, length and string of the
 * token found at the annotation's morphVertex(), plus the annotation id,
 * that vertex and the annotation category.
 *
 * @param analysis analysis content providing "AnnotationData",
 *                 "LinguisticMetaData" and the graph named by m_graph
 * @return SUCCESS_ID on success, MISSING_DATA when one of the required data
 *         is absent, UNKNOWN_ERROR when the log file cannot be opened
 */
LimaStatusCode CorefSolvingNormalizedXmlLogger::process(
  AnalysisContent& analysis) const
{
  COREFSOLVERLOGINIT;
  TimeUtils::updateCurrentTime();

  AnnotationData* annotationData = static_cast<AnnotationData*>(analysis.getData("AnnotationData"));
  if (annotationData == 0)
  {
    LERROR << "no AnnotationData ! abort" << LENDL;
    return MISSING_DATA;
  }

  LinguisticAnalysisStructure::AnalysisGraph* anagraph =
    static_cast<LinguisticAnalysisStructure::AnalysisGraph*>(analysis.getData(m_graph));
  if (anagraph == 0)
  {
    LERROR << "no analysis graph ! abort" << LENDL;
    return MISSING_DATA;
  }
  const LinguisticAnalysisStructure::AnalysisGraph& graph = *anagraph;

  LinguisticMetaData* metadata=static_cast<LinguisticMetaData*>(analysis.getData("LinguisticMetaData"));
  if (metadata == 0)
  {
    LERROR << "no LinguisticMetaData ! abort" << LENDL;
    return MISSING_DATA;
  }

  ofstream out;
  if (!openLogFile(out,metadata->getMetaData("FileName")))
  {
    LERROR << "Can't open log file " << LENDL;
    return UNKNOWN_ERROR;
  }

  out << "<coreferences>" << endl;

  AnnotationGraphVertexIt itv, itv_end;
  boost::tie(itv, itv_end) = vertices(annotationData->getGraph());
  for (; itv != itv_end; ++itv)
  {
    if (!annotationData->hasAnnotation(*itv,utf8stdstring2limastring("Coreferent")))
    {
      continue;
    }

    CoreferentAnnotation* annot;
    try
    {
      annot = annotationData->annotation(*itv,utf8stdstring2limastring("Coreferent"))
        .pointerValue<CoreferentAnnotation>();
    }
    catch (const boost::bad_any_cast& )
    {
      LERROR << "One annotation on vertex " << *itv << " you are trying to cast is not a Coreference; Coreference not logged" << LENDL;
      // NOTE(review): the bound 19 is hard-coded in the original diagnostic;
      // confirm it matches the number of registered annotation names.
      for (int i = 0; i < 19 ; i++)
      {
        LERROR << "annot "<< i << " : " << limastring2utf8stdstring(annotationData->annotationName(i)) << LENDL ;
      }
      continue;
    }

    LinguisticProcessing::LinguisticAnalysisStructure::Token* token =
      get(vertex_token, *graph.getGraph(), annot->morphVertex());
    if (token == 0)
    {
      LERROR << "Vertex " << *itv << " has no entry in the analysis graph token map. This should not happen !!" << LENDL;
      continue;
    }

    // Check the "Coreferent" annotations reachable through the outgoing
    // edges. Only the diagnostic is kept: writing the antecedent (<npRef> and
    // <refPosVertex>) was already disabled in this logger.
    AnnotationGraphOutEdgeIt it, it_end;
    boost::tie(it, it_end) = boost::out_edges(static_cast<AnnotationGraphVertex>(*itv), annotationData->getGraph());
    for (; it != it_end; ++it)
    {
      if (annotationData->hasAnnotation(target(*it,annotationData->getGraph()),utf8stdstring2limastring("Coreferent")))
      {
        try
        {
          annotationData->annotation(target(*it, annotationData->getGraph()), utf8stdstring2limastring("Coreferent")).pointerValue<CoreferentAnnotation>();
        }
        catch (const boost::bad_any_cast& )
        {
          LERROR << "One annotation on vertex you are trying to cast resulting from an edge out of " << *itv << " is not a Coreference; Coreference not logged" << LENDL;
          continue;
        }
      }
    }

    out << " <reference>\n"
        << " <pos>" << token->position() << "</pos>\n"
        << " <len>" << token->stringForm().length() << "</len>\n"
        << " <string>"<< limastring2utf8stdstring(transcodeToXmlEntities(token->stringForm())) << "</string>\n"
        << " <npId>" << annot->id() << "</npId>\n"
        << " <posVertex>" << annot->morphVertex() << "</posVertex>\n";
    out << " <categ>" << annot->categ() << "</categ>\n"
        << " </reference>\n" << endl;
  }

  out << "</coreferences>" << endl;
  out.close();

  TimeUtils::logElapsedTime("CorefSolvingNormalizedXmlLogger");
  return SUCCESS_ID;
}
/**
 * Dumps the tokens of the graph named by m_graph, one per line, through the
 * analysis handler named by m_handler.
 *
 * Vertices are first grouped by token (ordered by lTokenPosition). Each line
 * then holds, separated by m_sep:
 *  - the token position plus the metadata start offset (if m_printPosition);
 *  - the token string, with occurrences of m_sep replaced by '_';
 *  - the distinct symbolic values of the property read by m_propertyAccessor
 *    over all data attached to the token, separated by m_sepPOS;
 *  - when the matching annotation vertex has a "GeoEntity" annotation, its
 *    position and its geo classes (space separated).
 *
 * @param analysis analysis content providing "LinguisticMetaData",
 *                 "AnnotationData", "AnalysisHandlerContainer" and the graph
 * @return SUCCESS_ID on success, MISSING_DATA when a required data or the
 *         handler is absent
 */
LimaStatusCode GeoDumper::process(
  AnalysisContent& analysis) const
{
  DUMPERLOGINIT;
  LDEBUG << "Process GeoDumper ";

  LinguisticMetaData* metadata=static_cast<LinguisticMetaData*>(analysis.getData("LinguisticMetaData"));
  if (metadata == 0)
  {
    LERROR << "GeoDumper::process: no LinguisticMetaData ! abort";
    return MISSING_DATA;
  }

  Lima::Common::AnnotationGraphs::AnnotationData* annotationData =
    static_cast< Lima::Common::AnnotationGraphs::AnnotationData* >(analysis.getData("AnnotationData"));
  if (annotationData==0)
  {
    LERROR << "GeoDumper::process: no AnnotationData ! abort";
    return MISSING_DATA;
  }

  LDEBUG << "handler will be: " << m_handler;
  AnalysisHandlerContainer* h = static_cast<AnalysisHandlerContainer*>(analysis.getData("AnalysisHandlerContainer"));
  if (h == 0)
  {
    LERROR << "GeoDumper::process: no handler in analysisContent ! abort";
    return MISSING_DATA;
  }
  AbstractTextualAnalysisHandler* handler = static_cast<AbstractTextualAnalysisHandler*>(h->getHandler(m_handler));
  if (handler==0)
  {
    LERROR << "GeoDumper::process: handler " << m_handler << " has not been given to the core client";
    return MISSING_DATA;
  }

  // Fetched before the handler is started so that a missing graph does not
  // leave an analysis started but never ended.
  AnalysisGraph* anagraph=static_cast<AnalysisGraph*>(analysis.getData(m_graph));
  if (anagraph == 0)
  {
    LERROR << "GeoDumper::process: no analysis graph ! abort";
    return MISSING_DATA;
  }
  LinguisticGraph* graph=anagraph->getGraph();

  handler->startAnalysis();
  HandlerStreamBuf hsb(handler);
  std::ostream out(&hsb);

  typedef std::map<Token*, std::pair<LinguisticGraphVertex,std::vector<MorphoSyntacticData*> >, lTokenPosition > TokenDataMap;
  TokenDataMap categoriesMapping;

  ltNormProperty sorter(m_propertyAccessor);

  // Group the data of all vertices by token; the last vertex seen for a token
  // is the one remembered for the annotation lookup.
  LinguisticGraphVertexIt vxItr,vxItrEnd;
  boost::tie(vxItr,vxItrEnd) = vertices(*graph);
  for (;vxItr!=vxItrEnd;++vxItr)
  {
    Token* ft=get(vertex_token,*graph,*vxItr);
    if( ft!=0)
    {
      TokenDataMap::mapped_type& element = categoriesMapping[ft];
      element.second.push_back(get(vertex_data,*graph,*vxItr));
      element.first=*vxItr;
    }
  }

  for (TokenDataMap::const_iterator ftItr=categoriesMapping.begin();
       ftItr!=categoriesMapping.end();
       ++ftItr)
  {
    Token* ft=ftItr->first;
    std::ostringstream os;

    // get position
    uint64_t position=ft->position() + metadata->getStartOffset();

    // get string
    std::string str=Common::Misc::limastring2utf8stdstring(ft->stringForm());

    // replace separator in string by '_'
    // (an empty separator matches everywhere and would never terminate)
    const std::string::size_type sepLen=m_sep.size();
    if (sepLen > 0)
    {
      std::string::size_type p=0;
      while ( (p = str.find(m_sep, p)) != std::string::npos )
      {
        str.replace( p, sepLen, "_");
        p++;
      }
    }

    if (m_printPosition)
    {
      os << position << m_sep;
    }
    os << str << m_sep;

    // POS: every distinct property value once, m_sepPOS between values,
    // including between values coming from different data of the same token.
    std::set<LinguisticCode> props;
    const std::vector<MorphoSyntacticData*>& vt=ftItr->second.second;
    LinguisticCode prop(0);
    for (std::vector<MorphoSyntacticData*>::const_iterator dataItr=vt.begin();
         dataItr!=vt.end();
         ++dataItr)
    {
      MorphoSyntacticData* data=*dataItr;
      sort(data->begin(),data->end(),sorter);

      MorphoSyntacticData::const_iterator elemIt=data->begin(),elemIt_end=data->end();
      for (; elemIt!=elemIt_end; ++elemIt)
      {
        prop=m_propertyAccessor->readValue(elemIt->properties);
        if (props.insert(prop).second)
        {
          // More than one value recorded: this is not the first one written.
          if (props.size() > 1)
          {
            os << m_sepPOS;
          }
          os << m_propertyManager->getPropertySymbolicValue(prop);
        }
      }
    }

    std::set< AnnotationGraphVertex > matches =
      annotationData->matches(anagraph->getGraphId(),(ftItr->second).first,"annot");
    if (matches.empty())
    {
      // Nothing to look up; the line is still written without geo columns.
      LERROR << "GeoDumper::process: no annotation graph vertex matches graph vertex " << (ftItr->second).first;
    }
    else if (annotationData->hasAnnotation(*matches.begin(),utf8stdstring2limastring("GeoEntity")))
    {
      os << m_sep;
      os << annotationData->annotation(*matches.begin(),Common::Misc::utf8stdstring2limastring("GeoEntity"))
        .pointerValue<GeoEntityAnnotation>()->getPosition();
      os << m_sep;
      std::set<std::string> classes=annotationData->annotation(*matches.begin(),Common::Misc::utf8stdstring2limastring("GeoEntity"))
        .pointerValue<GeoEntityAnnotation>()->getGeoClasses();
      for (std::set<std::string>::iterator iT = classes.begin(); iT!=classes.end();)
      {
        os << *iT;
        ++iT;
        if (iT!=classes.end()) os << " ";
      }
    }

    out << os.str();
    out << std::endl;
  }

  out.flush();
  handler->endAnalysis();
  return SUCCESS_ID;
}