/**
 * Build the linear text representation (LTR_Text) of the analysed text and
 * write it, in binary form, to the handler registered under m_handler.
 *
 * Reads "LinguisticMetaData", "PosGraph", "SentenceBoundaries" and
 * "AnalysisHandlerContainer" from the analysis content.
 *
 * @param analysis the analysis content to read the data from
 * @return MISSING_DATA if one of the required data items or the handler is
 *         not available, SUCCESS_ID otherwise
 */
LimaStatusCode LinearTextRepresentationDumper::process(
    AnalysisContent& analysis) const {

    DUMPERLOGINIT;
    // get metadata
    LinguisticMetaData* metadata=dynamic_cast<LinguisticMetaData*>(analysis.getData("LinguisticMetaData"));
    if (metadata == 0) {
        LERROR << "LinearTextRepresentationDumper::process: no LinguisticMetaData ! abort" << LENDL;
        return MISSING_DATA;
    }
    // get the analysis graph
    AnalysisGraph* anaGraph = dynamic_cast<AnalysisGraph*>(analysis.getData("PosGraph"));
    if (anaGraph == 0) {
        LERROR << "LinearTextRepresentationDumper::process: no AnalysisGraph ! abort" << LENDL;
        return MISSING_DATA;
    }
    // get sentence boundaries
    SegmentationData* sb = dynamic_cast<SegmentationData*>(analysis.getData("SentenceBoundaries"));
    if (sb == 0) {
        LERROR << "LinearTextRepresentationDumper::process: no SentenceBounds ! abort" << LENDL;
        return MISSING_DATA;
    }
    // build LTRText; positions are shifted by the start offset of the metadata
    LTR_Text textRep;
    LTRTextBuilder builder(m_language, m_stopList);
    builder.buildLTRTextFrom(
        *(anaGraph->getGraph()),
        sb,
        anaGraph->lastVertex(),
        &textRep,
        metadata->getStartOffset());
    // write LTR_Text
    LDEBUG << "handler will be: " << m_handler << LENDL;
    AnalysisHandlerContainer* h = static_cast<AnalysisHandlerContainer*>(analysis.getData("AnalysisHandlerContainer"));
    // the container must be checked before it is asked for the handler:
    // getData gives a null pointer when the data item is absent (see the
    // other lookups above)
    if (h == 0) {
        LERROR << "LinearTextRepresentationDumper::process: no AnalysisHandlerContainer ! abort" << LENDL;
        return MISSING_DATA;
    }
    AbstractTextualAnalysisHandler* handler = static_cast<AbstractTextualAnalysisHandler*>(h->getHandler(m_handler));
    if (handler == 0) {
      LERROR << "LinearTextRepresentationDumper::process: handler " << m_handler << " has not been given to the core client" << LENDL;
      return MISSING_DATA;
    }
    handler->startAnalysis();
    // everything written to 'out' goes to the handler through the stream buffer
    HandlerStreamBuf hsb(handler);
    ostream out(&hsb);
    LDEBUG << textRep << LENDL;
    textRep.binaryWriteOn(out);
    // flush before closing the analysis so that no buffered output is left
    out.flush();
    handler->endAnalysis();
    return SUCCESS_ID;
}
/**
 * Create the output stream this dumper will write to.
 *
 * The destination is chosen with the following priority:
 *  -# the handler named m_handlerName, if the name is set and the handler
 *     is found in the "AnalysisHandlerContainer" of the analysis;
 *  -# the file m_outputFile, if set;
 *  -# the file named after the "FileName" metadata with m_outputSuffix
 *     appended, if the suffix is set and "LinguisticMetaData" is available;
 *  -# the file "output" as a last resort (an error is logged).
 *
 * @param analysis the analysis content holding the handlers and the metadata
 * @return a DumperStream allocated with new (never null); presumably the
 *         caller is responsible for deleting it -- confirm against callers
 */
DumperStream* AbstractTextualAnalysisDumper::
initialize(AnalysisContent& analysis) const
{
  DUMPERLOGINIT;
  LDEBUG << "AbstractTextualAnalysisDumper: initialize DumperStream" << LENDL;

  // if handler is defined, find handler
  if (! m_handlerName.empty()) {
    LDEBUG << "AbstractTextualAnalysisDumper: initialize DumperStream with handler "<< m_handlerName << LENDL;
    AnalysisHandlerContainer* h = static_cast<AnalysisHandlerContainer*>(analysis.getData("AnalysisHandlerContainer"));
    if (h == 0)
    {
      // no container at all: fall through to the file-based destinations
      LWARN << "no AnalysisHandlerContainer: handler " << m_handlerName << " cannot be used" << LENDL;
    }
    else
    {
      AbstractTextualAnalysisHandler* handler = static_cast<AbstractTextualAnalysisHandler*>(h->getHandler(m_handlerName));
      if (handler != 0)
      {
        return new DumperStream(handler);
      }
      // report the requested name (the handler pointer itself is null here)
      LWARN << "handler " << m_handlerName << " has not been given to the core client" << LENDL;
    }
  }

  if (! m_outputFile.empty()) {
    LDEBUG << "AbstractTextualAnalysisDumper: initialize DumperStream with output file "<< m_outputFile << LENDL;
    return new DumperStream(m_outputFile,m_append);
  }

  if (! m_outputSuffix.empty()) {
    LinguisticMetaData* metadata=static_cast<LinguisticMetaData*>(analysis.getData("LinguisticMetaData"));
    if (metadata == 0) {
      LERROR << "AbstractTextualAnalysisDumper::initialize: no LinguisticMetaData ! abort" << LENDL;
    }
    else {
      std::string sourceFile(metadata->getMetaData("FileName"));
      LDEBUG << "AbstractTextualAnalysisDumper: initialize DumperStream with output suffix "
             << m_outputSuffix << " on file " << sourceFile << LENDL;
      string outputFile=sourceFile + m_outputSuffix;
      return new DumperStream(outputFile,m_append);
    }
  }

  // nothing usable was configured: fall back on a default file name
  LERROR << "AbstractTextualAnalysisDumper::initialize: missing parameters to initialize output stream: use default file 'output'" << LENDL;
  return new DumperStream("output",m_append);
}
// Example #3
// 0
/**
 * Dump the content of the analysis through the handler registered under
 * m_handler, using dumpLimaData.
 *
 * Reads "LinguisticMetaData" and "AnalysisHandlerContainer" from the
 * analysis. The graph named m_graph, "SyntacticData" and "AnnotationData"
 * are created and stored in the analysis (setData) when they are absent.
 * The dump itself is only performed when both "AnalysisGraph" and
 * "PosGraph" are available; it covers the PosGraph from its first to its
 * last vertex in one call (no sentence segmentation is used).
 *
 * @param analysis the analysis content to dump
 * @return MISSING_DATA if the metadata, the handler container or the handler
 *         is not available, SUCCESS_ID otherwise
 */
LimaStatusCode EasyXmlDumper::process(AnalysisContent& analysis) const
{
  TimeUtils::updateCurrentTime();
  DUMPERLOGINIT;

  LinguisticMetaData* metadata = static_cast<LinguisticMetaData*>(analysis.getData("LinguisticMetaData"));
  if (metadata == 0) {
    LERROR << "EasyXmlDumper::process no LinguisticMetaData ! abort";
    return MISSING_DATA;
  }
  string filename = metadata->getMetaData("FileName");
  LDEBUG << "EasyXmlDumper::process Filename: " << filename;

  LDEBUG << "handler will be: " << m_handler;
  AnalysisHandlerContainer* h = static_cast<AnalysisHandlerContainer*>(analysis.getData("AnalysisHandlerContainer"));
  // the container must be checked before it is asked for the handler
  if (h == 0)
  {
    LERROR << "EasyXmlDumper::process: no AnalysisHandlerContainer ! abort";
    return MISSING_DATA;
  }
  AbstractTextualAnalysisHandler* handler = static_cast<AbstractTextualAnalysisHandler*>(h->getHandler(m_handler));
  if (handler==0)
  {
    LERROR << "EasyXmlDumper::process: handler " << m_handler << " has not been given to the core client";
    return MISSING_DATA;
  }

  // make sure the graph named m_graph exists in the analysis
  AnalysisGraph* graph = static_cast<AnalysisGraph*>(analysis.getData(m_graph));
  if (graph == 0)
  {
    graph = new AnalysisGraph(m_graph,m_language,true,true);
    analysis.setData(m_graph,graph);
  }

  // make sure syntactic data exists; when created here it is built on the
  // graph obtained (or created) just above
  SyntacticData* syntacticData = static_cast<SyntacticData*>(analysis.getData("SyntacticData"));
  if (syntacticData == 0)
  {
    syntacticData = new SyntacticAnalysis::SyntacticData(graph,0);
    syntacticData->setupDependencyGraph();
    analysis.setData("SyntacticData",syntacticData);
  }

  // fetched after the m_graph step above, which may have just created it
  AnalysisGraph* anaGraph = static_cast<AnalysisGraph*>(analysis.getData("AnalysisGraph"));

  // make sure annotation data exists; when created here it is populated
  // from the AnalysisGraph if there is one
  AnnotationData* annotationData = static_cast< AnnotationData* >(analysis.getData("AnnotationData"));
  if (annotationData == 0)
  {
    annotationData = new AnnotationData();
    if (anaGraph != 0)
    {
      anaGraph->populateAnnotationGraph(annotationData, "AnalysisGraph");
    }
    analysis.setData("AnnotationData",annotationData);
  }

  handler->startAnalysis();
  // everything written to outputStream goes to the handler
  HandlerStreamBuf hsb(handler);
  std::ostream outputStream(&hsb);

  LDEBUG << "EasyXmlDumper:: process before printing heading";
  AnalysisGraph* posGraph = static_cast<AnalysisGraph*>(analysis.getData("PosGraph"));
  if (anaGraph != 0 && posGraph != 0)
  {
    LDEBUG << "EasyXmlDumper:: begin of posgraph";
    // one "not yet dumped" flag per vertex of the PosGraph
    std::vector< bool > alreadyDumpedTokens(num_vertices(*posGraph->getGraph()), false);
    // maps each token to the rank of the (last) vertex carrying it, in
    // vertex iteration order
    std::map< LinguisticAnalysisStructure::Token*, uint64_t > fullTokens;
    LinguisticGraphVertexIt i, i_end;
    uint64_t id = 0;
    for (boost::tie(i, i_end) = vertices(*posGraph->getGraph()); i != i_end; ++i)
    {
      LDEBUG << "EasyXmlDumper:: examine posgraph for " << id;
      fullTokens[get(vertex_token, *posGraph->getGraph(), *i)] = id;
      id++;
    }

    // No need for sentence boundaries in Easy input: the whole PosGraph is
    // dumped as a single span
    LinguisticGraphVertex sentenceBegin = posGraph->firstVertex();
    LinguisticGraphVertex sentenceEnd = posGraph->lastVertex();

    // sentence id prefix: the "docid" metadata when available and not
    // empty, "E" otherwise
    string sentIdPrefix;
    try {
      sentIdPrefix = metadata->getMetaData("docid");
      LDEBUG << "EasyXmlDumper:: retrieve sentence id " << sentIdPrefix;
    }catch (LinguisticProcessingException& ) {
      sentIdPrefix = "";
    }
    if (sentIdPrefix.empty())
      sentIdPrefix = "E";

    LDEBUG << "EasyXmlDumper:: inside posgraph while ";
    dumpLimaData(outputStream,
                  sentenceBegin,
                  sentenceEnd,
                  *anaGraph,
                  *posGraph,
                  *annotationData,
                  *syntacticData,
                  "PosGraph",
                  alreadyDumpedTokens,
                  fullTokens,
                  sentIdPrefix);
    LDEBUG << "EasyXmlDumper:: end of posgraph";
  }

  // close what startAnalysis opened, after pushing buffered output to the
  // handler (same sequence as the other dumpers of this file)
  outputStream.flush();
  handler->endAnalysis();

  return SUCCESS_ID;
}
// Example #4
// 0
/**
 * Dump, one line per token, the tokens of the graph named m_graph through
 * the handler registered under m_handler.
 *
 * Each line contains, separated by m_sep:
 *  - the token position shifted by the metadata start offset (only when
 *    m_printPosition is set);
 *  - the token string, where occurrences of m_sep are replaced by '_';
 *  - the symbolic values of the property read by m_propertyAccessor on the
 *    morphosyntactic data of the vertices carrying the token (joined with
 *    m_sepPOS);
 *  - when the vertex has a "GeoEntity" annotation: the position and the
 *    space-separated geo classes of that annotation.
 *
 * Note that the morphosyntactic data of the graph are sorted in place.
 *
 * @param analysis the analysis content to dump
 * @return MISSING_DATA if the metadata, the annotation data, the handler
 *         container, the handler or the graph is not available, SUCCESS_ID
 *         otherwise
 */
LimaStatusCode GeoDumper::process(
  AnalysisContent& analysis) const
{
  DUMPERLOGINIT;
  LDEBUG << "Process GeoDumper ";
  LinguisticMetaData* metadata=static_cast<LinguisticMetaData*>(analysis.getData("LinguisticMetaData"));
  if (metadata == 0) {
      LERROR << "GeoDumper::process: no LinguisticMetaData ! abort";
      return MISSING_DATA;
  }

  Lima::Common::AnnotationGraphs::AnnotationData* annotationData = static_cast< Lima::Common::AnnotationGraphs::AnnotationData* >(analysis.getData("AnnotationData"));
  if (annotationData==0)
  {
    LERROR << "GeoDumper::process: no AnnotationData ! abort";
    return MISSING_DATA;
  }

  LDEBUG << "handler will be: " << m_handler;
  AnalysisHandlerContainer* h = static_cast<AnalysisHandlerContainer*>(analysis.getData("AnalysisHandlerContainer"));
  // the container must be checked before it is asked for the handler
  if (h == 0)
  {
    LERROR << "GeoDumper::process: no AnalysisHandlerContainer ! abort";
    return MISSING_DATA;
  }
  AbstractTextualAnalysisHandler* handler = static_cast<AbstractTextualAnalysisHandler*>(h->getHandler(m_handler));
  if (handler==0)
  {
    LERROR << "GeoDumper::process: handler " << m_handler << " has not been given to the core client";
    return MISSING_DATA;
  }

  // the graph is fetched before the analysis is started on the handler, so
  // that a missing graph does not leave a started analysis behind
  AnalysisGraph* anagraph=static_cast<AnalysisGraph*>(analysis.getData(m_graph));
  if (anagraph == 0)
  {
    LERROR << "GeoDumper::process: no graph " << m_graph << " ! abort";
    return MISSING_DATA;
  }
  LinguisticGraph* graph=anagraph->getGraph();
  if (graph == 0)
  {
    LERROR << "GeoDumper::process: graph " << m_graph << " is empty ! abort";
    return MISSING_DATA;
  }

  handler->startAnalysis();
  // everything written to 'out' goes to the handler
  HandlerStreamBuf hsb(handler);
  std::ostream out(&hsb);

  // for each token (ordered by lTokenPosition): the last vertex seen that
  // carries it and the morphosyntactic data of all vertices carrying it
  typedef map<Token*, pair<LinguisticGraphVertex,vector<MorphoSyntacticData*> >, lTokenPosition > TokenCategories;
  TokenCategories categoriesMapping;

  ltNormProperty sorter(m_propertyAccessor);

  LinguisticGraphVertexIt vxItr,vxItrEnd;
  boost::tie(vxItr,vxItrEnd) = vertices(*graph);
  for (;vxItr!=vxItrEnd;++vxItr)
  {
    Token* ft=get(vertex_token,*graph,*vxItr);
    if( ft!=0)
    {
      // update the map entry in place (created on first access)
      std::pair<LinguisticGraphVertex,vector<MorphoSyntacticData*> >& element = categoriesMapping[ft];
      element.second.push_back(get(vertex_data,*graph,*vxItr));
      element.first=*vxItr;
    }
  }

  for (TokenCategories::const_iterator ftItr=categoriesMapping.begin();
       ftItr!=categoriesMapping.end();
       ++ftItr)
  {

    Token* ft=ftItr->first;
    std::ostringstream os;
    // get position
    uint64_t position=ft->position() + metadata->getStartOffset();
    // get string
    std::string str=Common::Misc::limastring2utf8stdstring(ft->stringForm());
    // replace separator in string by '_'; an empty separator is skipped as
    // it would be found at every position and the loop would never end
    string::size_type sepLen=m_sep.size();
    if (sepLen > 0) {
      string::size_type p=0;
      while ( (p = str.find(m_sep, p)) != string::npos ) {
        str.replace( p, sepLen, "_");
        ++p;
      }
    }

    if (m_printPosition) {
      os << position << m_sep;
    }
    os << str << m_sep;
    // POS
    // property values already printed for this token
    std::set<LinguisticCode> props;
    const vector<MorphoSyntacticData*>& vt=ftItr->second.second;
    for (vector<MorphoSyntacticData*>::const_iterator dataItr=vt.begin();
           dataItr!=vt.end();
           ++dataItr)
    {
        MorphoSyntacticData* data=*dataItr;
        sort(data->begin(),data->end(),sorter);
        LinguisticCode prop(0);

        // output first
        MorphoSyntacticData::const_iterator elemIt=data->begin(),elemIt_end=data->end();
        if(elemIt != elemIt_end)
        {
          prop=m_propertyAccessor->readValue(elemIt->properties);
          os << m_propertyManager->getPropertySymbolicValue(prop);
          props.insert(prop);

          // output rest, with separator, skipping values already printed
          for ( ++elemIt; elemIt!=elemIt_end; ++elemIt)
          {
            prop=m_propertyAccessor->readValue(elemIt->properties);
            if (props.find(prop)==props.end()) {
              os << m_sepPOS << m_propertyManager->getPropertySymbolicValue(prop);
              props.insert(prop);
            }
          }
        }
    }
    // geo information, taken from the first annotation vertex matching the
    // graph vertex; there may be no match at all, in which case nothing
    // more is printed for this token
    std::set< AnnotationGraphVertex > matches = annotationData->matches(anagraph->getGraphId(),(ftItr->second).first,"annot");
    if (!matches.empty()
        && annotationData->hasAnnotation(*matches.begin(),utf8stdstring2limastring("GeoEntity")))
    {
      os << m_sep;
      os << annotationData->annotation(*matches.begin(),Common::Misc::utf8stdstring2limastring("GeoEntity"))
        .pointerValue<GeoEntityAnnotation>()->getPosition();
      os << m_sep;
      std::set<std::string> classes=annotationData->annotation(*matches.begin(),Common::Misc::utf8stdstring2limastring("GeoEntity"))
        .pointerValue<GeoEntityAnnotation>()->getGeoClasses();
      // classes separated by a single space
      for (std::set<std::string>::const_iterator iT = classes.begin(); iT!=classes.end(); ++iT)
      {
        if (iT!=classes.begin()) os << " ";
        os << *iT;
      }
    }
    out << os.str();
    out << endl;
  }

  out.flush();
  handler->endAnalysis();

  return SUCCESS_ID;
}