/**
 * Builds the linear text representation (LTR_Text) of the analysed text and
 * writes it in binary form to the analysis handler named by m_handler.
 *
 * The following entries are read from the analysis content:
 * "LinguisticMetaData", "PosGraph", "SentenceBoundaries" and
 * "AnalysisHandlerContainer".
 *
 * @param analysis the analysis content holding the data listed above
 * @return SUCCESS_ID once the representation has been written;
 *         MISSING_DATA when one of the required entries or the requested
 *         handler is not available
 */
LimaStatusCode LinearTextRepresentationDumper::process(
  AnalysisContent& analysis) const
{
  DUMPERLOGINIT;

  // get metadata
  LinguisticMetaData* metadata =
    dynamic_cast<LinguisticMetaData*>(analysis.getData("LinguisticMetaData"));
  if (metadata == 0)
  {
    LERROR << "LinearTextRepresentationDumper::process: no LinguisticMetaData ! abort" << LENDL;
    return MISSING_DATA;
  }

  // get the analysis graph
  AnalysisGraph* anaGraph =
    dynamic_cast<AnalysisGraph*>(analysis.getData("PosGraph"));
  if (anaGraph == 0)
  {
    LERROR << "LinearTextRepresentationDumper::process: no AnalysisGraph ! abort" << LENDL;
    return MISSING_DATA;
  }

  // get sentence boundaries
  SegmentationData* sb =
    dynamic_cast<SegmentationData*>(analysis.getData("SentenceBoundaries"));
  if (sb == 0)
  {
    LERROR << "LinearTextRepresentationDumper::process: no SentenceBounds ! abort" << LENDL;
    return MISSING_DATA;
  }

  // build LTRText
  LTR_Text textRep;
  LTRTextBuilder builder(m_language, m_stopList);
  builder.buildLTRTextFrom(
    *(anaGraph->getGraph()),
    sb,
    anaGraph->lastVertex(),
    &textRep,
    metadata->getStartOffset());

  // write LTR_Text
  LDEBUG << "handler will be: " << m_handler << LENDL;
  AnalysisHandlerContainer* h =
    static_cast<AnalysisHandlerContainer*>(analysis.getData("AnalysisHandlerContainer"));
  // The container itself may be absent from the analysis content; check it
  // before asking it for the handler instead of dereferencing a null pointer.
  if (h == 0)
  {
    LERROR << "LinearTextRepresentationDumper::process: no AnalysisHandlerContainer ! abort" << LENDL;
    return MISSING_DATA;
  }
  AbstractTextualAnalysisHandler* handler =
    static_cast<AbstractTextualAnalysisHandler*>(h->getHandler(m_handler));
  if (handler == 0)
  {
    LERROR << "LinearTextRepresentationDumper::process: handler " << m_handler
           << " has not been given to the core client" << LENDL;
    return MISSING_DATA;
  }

  handler->startAnalysis();
  HandlerStreamBuf hsb(handler);
  std::ostream out(&hsb);
  LDEBUG << textRep << LENDL;
  textRep.binaryWriteOn(out);
  // Push any buffered bytes to the handler before closing the analysis.
  out.flush();
  handler->endAnalysis();
  return SUCCESS_ID;
}
/**
 * Creates the stream this dumper writes to.
 *
 * The destination is chosen from the first of these that applies:
 *  1. m_handlerName is set and the matching handler is found in the
 *     "AnalysisHandlerContainer" of the analysis content;
 *  2. m_outputFile is set: that file is used;
 *  3. m_outputSuffix is set and "LinguisticMetaData" is available: the file
 *     named by the "FileName" metadata with the suffix appended;
 *  4. otherwise the default file "output".
 *
 * @param analysis the analysis content used to look up the handler container
 *                 and the metadata
 * @return a DumperStream allocated with new; this function never deletes it
 */
DumperStream* AbstractTextualAnalysisDumper::initialize(
  AnalysisContent& analysis) const
{
  DUMPERLOGINIT;
  LDEBUG << "AbstractTextualAnalysisDumper: initialize DumperStream" << LENDL;

  // if handler is defined, find handler
  if (!m_handlerName.empty())
  {
    LDEBUG << "AbstractTextualAnalysisDumper: initialize DumperStream with handler "
           << m_handlerName << LENDL;
    AnalysisHandlerContainer* h =
      static_cast<AnalysisHandlerContainer*>(analysis.getData("AnalysisHandlerContainer"));
    // A missing container is treated like a missing handler: warn and fall
    // back to the file-based destinations below.
    AbstractTextualAnalysisHandler* handler = 0;
    if (h != 0)
    {
      handler = static_cast<AbstractTextualAnalysisHandler*>(h->getHandler(m_handlerName));
    }
    if (handler != 0)
    {
      return new DumperStream(handler);
    }
    // Report the requested name; the handler pointer is null at this point
    // and carries no useful information.
    LWARN << "handler " << m_handlerName
          << " has not been given to the core client" << LENDL;
  }

  if (!m_outputFile.empty())
  {
    LDEBUG << "AbstractTextualAnalysisDumper: initialize DumperStream with output file "
           << m_outputFile << LENDL;
    return new DumperStream(m_outputFile, m_append);
  }

  if (!m_outputSuffix.empty())
  {
    LinguisticMetaData* metadata =
      static_cast<LinguisticMetaData*>(analysis.getData("LinguisticMetaData"));
    if (metadata == 0)
    {
      LERROR << "AbstractTextualAnalysisDumper::initialize: no LinguisticMetaData ! abort" << LENDL;
    }
    else
    {
      std::string sourceFile(metadata->getMetaData("FileName"));
      LDEBUG << "AbstractTextualAnalysisDumper: initialize DumperStream with output suffix "
             << m_outputSuffix << " on file " << sourceFile << LENDL;
      std::string outputFile = sourceFile + m_outputSuffix;
      return new DumperStream(outputFile, m_append);
    }
  }

  // Nothing usable was configured: fall back to the default file.
  LERROR << "AbstractTextualAnalysisDumper::initialize: missing parameters to initialize output stream: use default file 'output'" << LENDL;
  return new DumperStream("output", m_append);
}
/**
 * Dumps the content of the part-of-speech graph through the analysis handler
 * named by m_handler, delegating the actual output to dumpLimaData().
 *
 * The graph named by m_graph, the "SyntacticData" and the "AnnotationData"
 * are created and stored in the analysis content when they are missing.
 * Output is produced only when both "AnalysisGraph" and "PosGraph" are
 * present; the whole PosGraph (first to last vertex) is dumped as one span.
 *
 * @param analysis the analysis content to read from (and complete)
 * @return SUCCESS_ID on completion; MISSING_DATA when the metadata, the
 *         handler container or the requested handler is not available
 */
LimaStatusCode EasyXmlDumper::process(AnalysisContent& analysis) const
{
  TimeUtils::updateCurrentTime();
  DUMPERLOGINIT;

  LinguisticMetaData* metadata =
    static_cast<LinguisticMetaData*>(analysis.getData("LinguisticMetaData"));
  if (metadata == 0)
  {
    LERROR << "EasyXmlDumper::process no LinguisticMetaData ! abort";
    return MISSING_DATA;
  }
  std::string filename = metadata->getMetaData("FileName");
  LDEBUG << "EasyXmlDumper::process Filename: " << filename;

  LDEBUG << "handler will be: " << m_handler;
  AnalysisHandlerContainer* h =
    static_cast<AnalysisHandlerContainer*>(analysis.getData("AnalysisHandlerContainer"));
  // The container may be absent; check it before asking for the handler.
  if (h == 0)
  {
    LERROR << "EasyXmlDumper::process: no AnalysisHandlerContainer ! abort";
    return MISSING_DATA;
  }
  AbstractTextualAnalysisHandler* handler =
    static_cast<AbstractTextualAnalysisHandler*>(h->getHandler(m_handler));
  if (handler == 0)
  {
    LERROR << "EasyXmlDumper::process: handler " << m_handler
           << " has not been given to the core client";
    return MISSING_DATA;
  }

  // Make sure the graph named by m_graph exists in the analysis content.
  AnalysisGraph* graph = static_cast<AnalysisGraph*>(analysis.getData(m_graph));
  if (graph == 0)
  {
    graph = new AnalysisGraph(m_graph, m_language, true, true);
    analysis.setData(m_graph, graph);
  }

  // Make sure syntactic data exists; it is built on the graph above.
  SyntacticData* syntacticData =
    static_cast<SyntacticData*>(analysis.getData("SyntacticData"));
  if (syntacticData == 0)
  {
    syntacticData = new SyntacticAnalysis::SyntacticData(graph, 0);
    syntacticData->setupDependencyGraph();
    analysis.setData("SyntacticData", syntacticData);
  }

  // Looked up after the m_graph setup so that a graph created above under
  // the name "AnalysisGraph" is seen here as well.
  AnalysisGraph* anaGraph =
    static_cast<AnalysisGraph*>(analysis.getData("AnalysisGraph"));

  // Make sure annotation data exists, populated from the analysis graph
  // when there is one.
  AnnotationData* annotationData =
    static_cast<AnnotationData*>(analysis.getData("AnnotationData"));
  if (annotationData == 0)
  {
    annotationData = new AnnotationData();
    if (anaGraph != 0)
    {
      anaGraph->populateAnnotationGraph(annotationData, "AnalysisGraph");
    }
    analysis.setData("AnnotationData", annotationData);
  }

  handler->startAnalysis();
  HandlerStreamBuf hsb(handler);
  std::ostream outputStream(&hsb);
  LDEBUG << "EasyXmlDumper:: process before printing heading";

  AnalysisGraph* posGraph =
    static_cast<AnalysisGraph*>(analysis.getData("PosGraph"));
  if (anaGraph != 0 && posGraph != 0)
  {
    LDEBUG << "EasyXmlDumper:: begin of posgraph";

    // One "already dumped" flag per PosGraph vertex; resize() leaves every
    // flag at false.
    std::vector< bool > alreadyDumpedTokens;
    alreadyDumpedTokens.resize(num_vertices(*posGraph->getGraph()));

    // Map each token of the PosGraph to the rank of its vertex in
    // iteration order.
    std::map< LinguisticAnalysisStructure::Token*, uint64_t > fullTokens;
    LinguisticGraphVertexIt i, i_end;
    uint64_t id = 0;
    for (boost::tie(i, i_end) = vertices(*posGraph->getGraph()); i != i_end; ++i)
    {
      LDEBUG << "EasyXmlDumper:: examine posgraph for " << id;
      fullTokens[get(vertex_token, *posGraph->getGraph(), *i)] = id;
      id++;
    }

    // No need for sentence boundaries in Easy input: the whole graph is
    // handled as a single span.
    LinguisticGraphVertex sentenceBegin = posGraph->firstVertex();
    LinguisticGraphVertex sentenceEnd = posGraph->lastVertex();

    // Sentence identifiers are prefixed with the "docid" metadata when it is
    // available and non-empty, with "E" otherwise.
    std::string sentIdPrefix;
    try
    {
      sentIdPrefix = metadata->getMetaData("docid");
      LDEBUG << "EasyXmlDumper:: retrieve sentence id " << sentIdPrefix;
    }
    catch (LinguisticProcessingException& )
    {
      sentIdPrefix = "";
    }
    if (sentIdPrefix.empty())
    {
      sentIdPrefix = "E";
    }

    LDEBUG << "EasyXmlDumper:: inside posgraph while ";
    dumpLimaData(outputStream,
                 sentenceBegin,
                 sentenceEnd,
                 *anaGraph,
                 *posGraph,
                 *annotationData,
                 *syntacticData,
                 "PosGraph",
                 alreadyDumpedTokens,
                 fullTokens,
                 sentIdPrefix);

    LDEBUG << "EasyXmlDumper:: end of posgraph";
  }

  // Push buffered output to the handler and pair the startAnalysis() call
  // above with endAnalysis() so the handler is told the output is complete.
  outputStream.flush();
  handler->endAnalysis();
  return SUCCESS_ID;
}
/**
 * Writes one line per token of the graph named by m_graph to the analysis
 * handler named by m_handler.
 *
 * Each line holds, separated by m_sep: the token position (only when
 * m_printPosition is set), the token string with any occurrence of m_sep
 * replaced by '_', the distinct symbolic values of the property read by
 * m_propertyAccessor (joined with m_sepPOS) and, when the token carries a
 * "GeoEntity" annotation, the annotation position and its space-separated
 * geo classes.
 *
 * @param analysis the analysis content providing "LinguisticMetaData",
 *                 "AnnotationData", "AnalysisHandlerContainer" and the graph
 *                 named by m_graph
 * @return SUCCESS_ID once all tokens have been written; MISSING_DATA when one
 *         of the required entries or the requested handler is not available
 */
LimaStatusCode GeoDumper::process(AnalysisContent& analysis) const
{
  DUMPERLOGINIT;
  LDEBUG << "Process GeoDumper ";

  LinguisticMetaData* metadata =
    static_cast<LinguisticMetaData*>(analysis.getData("LinguisticMetaData"));
  if (metadata == 0)
  {
    LERROR << "GeoDumper::process: no LinguisticMetaData ! abort";
    return MISSING_DATA;
  }

  Lima::Common::AnnotationGraphs::AnnotationData* annotationData =
    static_cast< Lima::Common::AnnotationGraphs::AnnotationData* >(analysis.getData("AnnotationData"));
  if (annotationData == 0)
  {
    LERROR << "GeoDumper::process: no AnnotationData ! abort";
    return MISSING_DATA;
  }

  LDEBUG << "handler will be: " << m_handler;
  AnalysisHandlerContainer* h =
    static_cast<AnalysisHandlerContainer*>(analysis.getData("AnalysisHandlerContainer"));
  // The container may be absent; check it before asking for the handler.
  if (h == 0)
  {
    LERROR << "GeoDumper::process: no handler in analysisContent ! abort";
    return MISSING_DATA;
  }
  AbstractTextualAnalysisHandler* handler =
    static_cast<AbstractTextualAnalysisHandler*>(h->getHandler(m_handler));
  if (handler == 0)
  {
    LERROR << "GeoDumper::process: handler " << m_handler
           << " has not been given to the core client";
    return MISSING_DATA;
  }

  // Resolve the graph before starting the handler so that a missing graph
  // is reported without leaving an analysis started but never ended.
  AnalysisGraph* anagraph = static_cast<AnalysisGraph*>(analysis.getData(m_graph));
  if (anagraph == 0)
  {
    LERROR << "GeoDumper::process: no graph " << m_graph << " ! abort";
    return MISSING_DATA;
  }
  LinguisticGraph* graph = anagraph->getGraph();

  handler->startAnalysis();
  HandlerStreamBuf hsb(handler);
  std::ostream out(&hsb);

  // For every token: the last vertex seen for it and the morphosyntactic
  // data of all the vertices that carry it. Ordering is given by
  // lTokenPosition.
  typedef std::pair< LinguisticGraphVertex, std::vector<MorphoSyntacticData*> > VertexAndData;
  typedef std::map< Token*, VertexAndData, lTokenPosition > TokenCategories;
  TokenCategories categoriesMapping;

  ltNormProperty sorter(m_propertyAccessor);

  LinguisticGraphVertexIt vxItr, vxItrEnd;
  for (boost::tie(vxItr, vxItrEnd) = vertices(*graph); vxItr != vxItrEnd; ++vxItr)
  {
    Token* ft = get(vertex_token, *graph, *vxItr);
    if (ft != 0)
    {
      // Update the map entry in place (created on first access).
      VertexAndData& element = categoriesMapping[ft];
      element.second.push_back(get(vertex_data, *graph, *vxItr));
      element.first = *vxItr;
    }
  }

  for (TokenCategories::const_iterator ftItr = categoriesMapping.begin();
       ftItr != categoriesMapping.end();
       ++ftItr)
  {
    Token* ft = ftItr->first;
    std::ostringstream os;

    // get position
    uint64_t position = ft->position() + metadata->getStartOffset();

    // get string
    std::string str = Common::Misc::limastring2utf8stdstring(ft->stringForm());

    // replace separator in string by '_'
    // (skipped for an empty separator: find("") matches at every position,
    // so the loop below would keep inserting '_' forever)
    const std::string::size_type sepLen = m_sep.size();
    if (sepLen != 0)
    {
      std::string::size_type p = 0;
      while ((p = str.find(m_sep, p)) != std::string::npos)
      {
        str.replace(p, sepLen, "_");
        p++;
      }
    }

    if (m_printPosition)
    {
      os << position << m_sep;
    }
    os << str << m_sep;

    // POS: every distinct property value is printed once for the token
    std::set<LinguisticCode> props;
    const std::vector<MorphoSyntacticData*>& vt = ftItr->second.second;
    for (std::vector<MorphoSyntacticData*>::const_iterator dataItr = vt.begin();
         dataItr != vt.end();
         ++dataItr)
    {
      MorphoSyntacticData* data = *dataItr;
      std::sort(data->begin(), data->end(), sorter);

      MorphoSyntacticData::const_iterator elemIt = data->begin();
      MorphoSyntacticData::const_iterator elemIt_end = data->end();
      if (elemIt != elemIt_end)
      {
        // output first
        LinguisticCode prop = m_propertyAccessor->readValue(elemIt->properties);
        os << m_propertyManager->getPropertySymbolicValue(prop);
        props.insert(prop);

        // output rest, with separator
        for (++elemIt; elemIt != elemIt_end; ++elemIt)
        {
          prop = m_propertyAccessor->readValue(elemIt->properties);
          if (props.find(prop) == props.end())
          {
            os << m_sepPOS << m_propertyManager->getPropertySymbolicValue(prop);
            props.insert(prop);
          }
        }
      }
    }

    // Geo entity: only when the token vertex has a matching annotation
    // vertex. An empty match set must not be dereferenced.
    std::set< AnnotationGraphVertex > matches =
      annotationData->matches(anagraph->getGraphId(), (ftItr->second).first, "annot");
    if (!matches.empty())
    {
      const AnnotationGraphVertex annotVertex = *matches.begin();
      if (annotationData->hasAnnotation(annotVertex, Common::Misc::utf8stdstring2limastring("GeoEntity")))
      {
        os << m_sep;
        os << annotationData->annotation(annotVertex, Common::Misc::utf8stdstring2limastring("GeoEntity"))
                .pointerValue<GeoEntityAnnotation>()->getPosition();
        os << m_sep;
        std::set<std::string> classes =
          annotationData->annotation(annotVertex, Common::Misc::utf8stdstring2limastring("GeoEntity"))
            .pointerValue<GeoEntityAnnotation>()->getGeoClasses();
        // classes are separated by a single space, with no trailing space
        for (std::set<std::string>::const_iterator iT = classes.begin(); iT != classes.end();)
        {
          os << *iT;
          ++iT;
          if (iT != classes.end())
          {
            os << " ";
          }
        }
      }
    }

    out << os.str();
    out << std::endl;
  }

  out.flush();
  handler->endAnalysis();
  return SUCCESS_ID;
}