Exemplo n.º 1
0
/**
 * Writes the given analysis graph to @p xmlStream as an XML document.
 *
 * The output consists of the XML declaration, a comment carrying the
 * generation time, an xml-stylesheet processing instruction referencing
 * DataStructure.xslt, and a <data_structure> root element whose content is
 * produced by a DumpXMLVisitor driven by a breadth-first search starting at
 * the first vertex of the graph.
 *
 * @param xmlStream  stream receiving the XML text
 * @param tTokenList analysis graph to dump
 */
void FullTokenXmlLogger::dump(std::ostream& xmlStream,
                              AnalysisGraph& tTokenList) const
{
  xmlStream << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" << std::endl;
  xmlStream << "<!--generated by MM project on ";
  time_t aclock;
  time(&aclock);                   /* Get time in seconds */
  // ctime() can return a null pointer; building a std::string from it would
  // be undefined behaviour, so the timestamp is simply omitted in that case.
  // When present, the text already ends with a newline.
  const char* timestamp = ctime(&aclock);
  if (timestamp != 0)
  {
    xmlStream << timestamp;
  }
  xmlStream << "-->" << std::endl;
  xmlStream << "<?xml-stylesheet type=\"text/xsl\" href=\"DataStructure.xslt\"?>" << std::endl;
  xmlStream << "<data_structure xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"";
  xmlStream << " xsi:noNamespaceSchemaLocation=\"DataStructure.xsd\">" << std::endl;

  // dump the graph: the visitor writes to xmlStream while the search walks it
  const FsaStringsPool& sp=Common::MediaticData::MediaticData::single().stringsPool(m_language);
  DumpXMLVisitor vis(xmlStream,*m_propertyCodeManager,sp);
  breadth_first_search(*(tTokenList.getGraph()),
                       tTokenList.firstVertex(),
                       visitor(vis));

  xmlStream << "</data_structure>" << std::endl;
}
/**
 * Builds the linear text representation (LTR_Text) of the "PosGraph" graph
 * and writes it in binary form to the analysis handler named by m_handler.
 *
 * @param analysis analysis content providing "LinguisticMetaData",
 *                 "PosGraph", "SentenceBoundaries" and
 *                 "AnalysisHandlerContainer"
 * @return MISSING_DATA when one of these data, or the requested handler,
 *         is not available; SUCCESS_ID otherwise
 */
LimaStatusCode LinearTextRepresentationDumper::process(
    AnalysisContent& analysis) const {

    DUMPERLOGINIT;
    // get metadata
    LinguisticMetaData* metadata=dynamic_cast<LinguisticMetaData*>(analysis.getData("LinguisticMetaData"));
    if (metadata == 0) {
        LERROR << "LinearTextRepresentationDumper::process: no LinguisticMetaData ! abort" << LENDL;
        return MISSING_DATA;
    }
    // get the analysis graph
    AnalysisGraph* anaGraph = dynamic_cast<AnalysisGraph*>(analysis.getData("PosGraph"));
    if (anaGraph == 0) {
        LERROR << "LinearTextRepresentationDumper::process: no AnalysisGraph ! abort" << LENDL;
        return MISSING_DATA;
    }
    // get sentence boundaries
    SegmentationData* sb = dynamic_cast<SegmentationData*>(analysis.getData("SentenceBoundaries"));
    if (sb == 0) {
        LERROR << "LinearTextRepresentationDumper::process: no SentenceBounds ! abort" << LENDL;
        return MISSING_DATA;
    }
    // build LTRText
    LTR_Text textRep;
    LTRTextBuilder builder(m_language, m_stopList);
    builder.buildLTRTextFrom(
        *(anaGraph->getGraph()),
        sb,
        anaGraph->lastVertex(),
        &textRep,
        metadata->getStartOffset());
    // write LTR_Text
    LDEBUG << "handler will be: " << m_handler << LENDL;
    AnalysisHandlerContainer* h = static_cast<AnalysisHandlerContainer*>(analysis.getData("AnalysisHandlerContainer"));
    // The container is dereferenced just below: check it like the other data.
    if (h == 0) {
        LERROR << "LinearTextRepresentationDumper::process: no AnalysisHandlerContainer ! abort" << LENDL;
        return MISSING_DATA;
    }
    AbstractTextualAnalysisHandler* handler = static_cast<AbstractTextualAnalysisHandler*>(h->getHandler(m_handler));
    if (handler == 0) {
      LERROR << "LinearTextRepresentationDumper::process: handler " << m_handler << " has not been given to the core client" << LENDL;
      return MISSING_DATA;
    }
    handler->startAnalysis();
    // Route a standard output stream to the handler through its stream buffer.
    HandlerStreamBuf hsb(handler);
    ostream out(&hsb);
    LDEBUG << textRep << LENDL;
    textRep.binaryWriteOn(out);
    // Flush before ending the analysis so the handler receives all the data.
    out.flush();
    handler->endAnalysis();
    return SUCCESS_ID;
}
Exemplo n.º 3
0
/**
 * Builds a human-readable description of the check of this constraint on a
 * vertex, for debugging purposes.
 *
 * The description contains the action string, the vertex and the string form
 * of its token, the constraint index, the complement and the name of the
 * constraint function. When the function name cannot be resolved an error is
 * logged and the name and complement are left empty.
 *
 * @param graph  analysis graph holding the vertex
 * @param vertex vertex on which the constraint is checked
 * @return the description string
 */
std::string Constraint::
checkStringDebug(const AnalysisGraph& graph,
                 const LinguisticGraphVertex vertex) const {
  string functionName;
  LimaString complement;

  if (! ConstraintFunctionManager::single().
      getFunctionName(m_functionAddr,
                      functionName,
                      complement)) {
    AULOGINIT;
    LERROR << "constraint function "
           << m_functionAddr << " not availale" << LENDL;
  }

  // A vertex may carry no token (null pointer in the vertex_token map);
  // print an empty form in that case instead of dereferencing it.
  std::string tokenForm;
  if (get(vertex_token,*(graph.getGraph()),vertex) != 0) {
    tokenForm = Common::Misc::limastring2utf8stdstring((get(vertex_token,*(graph.getGraph()),vertex))->stringForm());
  }

  ostringstream oss;
  oss << "Constraint:" << actionString()
      << " vertex " << vertex
      << " (" << tokenForm << ")"
      << " in constraint " << m_index
      // the complement is now enclosed in a balanced pair of double quotes
      << ",compl=\""
      << Common::Misc::limastring2utf8stdstring(complement)
      << "\", using function " << functionName;
  return oss.str();
}
/**
 * Builds the linear text representation (LTR_Text) of the "PosGraph" graph
 * and writes it in binary form to a file whose name is the "FileName"
 * metadata followed by m_outputSuffix.
 *
 * Sentence boundaries are optional: when "SentenceBoundaries" is absent a
 * null pointer is passed to the builder.
 *
 * @param analysis analysis content providing "LinguisticMetaData" and
 *                 "PosGraph" (and optionally "SentenceBoundaries")
 * @return MISSING_DATA when the metadata or the graph is not available;
 *         SUCCESS_ID otherwise
 * @throws std::runtime_error when the output file cannot be opened
 */
LimaStatusCode LinearTextRepresentationLogger::process(
    AnalysisContent& analysis) const {

    DUMPERLOGINIT;
    // get metadata
    LinguisticMetaData* metadata=dynamic_cast<LinguisticMetaData*>(analysis.getData("LinguisticMetaData"));
    if (metadata == 0) {
        LERROR << "no LinguisticMetaData ! abort";
        return MISSING_DATA;
    }
    // get the analysis graph
    AnalysisGraph* anaGraph = dynamic_cast<AnalysisGraph*>(analysis.getData("PosGraph"));
    if (anaGraph == 0) {
        LERROR << "no AnalysisGraph ! abort";
        return MISSING_DATA;
    }
    // get sentence boundaries
    SegmentationData* sb = dynamic_cast<SegmentationData*>(analysis.getData("SentenceBoundaries"));
    if (sb == 0) {
      // message now names this class (it previously named the Dumper)
      LDEBUG << "LinearTextRepresentationLogger::process: no SentenceBounds available: ignored";
      // sentence bounds ignored: null pointer passed to LTRTextBuilder will be handled there
    }
    // build LTRText
    LTR_Text textRep;
    LTRTextBuilder builder(m_language, m_stopList);
    builder.buildLTRTextFrom(
        *(anaGraph->getGraph()),
        sb,
        anaGraph->firstVertex(),
        anaGraph->lastVertex(),
        &textRep,
        metadata->getStartOffset());

    // write LTR_Text
    string textFileName = metadata->getMetaData("FileName");
    string outputFile = textFileName + m_outputSuffix;
    ofstream out(outputFile.c_str(), std::ofstream::binary);
    if (!out.good()) {
        throw runtime_error("can't open file " + outputFile);
    }
    textRep.binaryWriteOn(out);
    out.flush();
    // Report a failed write instead of silently leaving a truncated file.
    // NOTE(review): the status code is left unchanged because no dedicated
    // write-failure code is visible from here - consider returning one.
    if (!out.good()) {
        LERROR << "LinearTextRepresentationLogger::process: error while writing file " << outputFile;
    }
    out.close();
    return SUCCESS_ID;
}
Exemplo n.º 5
0
//***********************************************************************
// Main function for outputting the graph.
//
// Extracts constituents and relations between the vertices `begin` and
// `end`, dumps them through an EasyDumper and, when the dump is not empty,
// writes it to `os` wrapped in an <E id="..."> element. The id is
// `sentIdPrefix`, followed by the first numeric suffix (starting at 1) not
// yet in m_sentIds when the prefix is already recorded there or is "E".
// The id used is appended to m_sentIds.
//***********************************************************************
void EasyXmlDumper::dumpLimaData(std::ostream& os,
                                  const LinguisticGraphVertex& begin,
                                  const LinguisticGraphVertex& end,
                                  const AnalysisGraph& anaGraph,
                                  const AnalysisGraph& posGraph,
                                  const AnnotationData& annotationData,
                                  const SyntacticData& syntacticData,
                                  const std::string& graphId,
                                  std::vector< bool >& alreadyDumpedTokens,
                                  std::map< LinguisticAnalysisStructure::Token*, uint64_t >& fullTokens,
                                  std::string sentIdPrefix) const
{
  DUMPERLOGINIT;
  LDEBUG << "EasyXmlDumper:: dumpLimaData parameters: ";
  LDEBUG << "EasyXmlDumper::   begin = " << begin;
  LDEBUG << "EasyXmlDumper::   end = " << end;
  LDEBUG << "EasyXmlDumper::   posgraph first vertex = " << posGraph.firstVertex();
  LDEBUG << "EasyXmlDumper::   posgraph last vertex = " << posGraph.lastVertex();
  LDEBUG << "EasyXmlDumper::   graphId = " << graphId;
  LDEBUG << "EasyXmlDumper::   sentIdPrefix = " << sentIdPrefix;

  // Trace the entries of the already-dumped array that are set.
  const uint64_t flagCount = alreadyDumpedTokens.size();
  for (uint64_t idx = 0; idx < flagCount; ++idx)
  {
    if (!alreadyDumpedTokens[idx])
    {
      continue;
    }
    LDEBUG << "EasyXmlDumper:: already_dumped_tokens[" << idx << "] =" << alreadyDumpedTokens[idx];
  }

  // Choose the sentence id: the bare prefix when it is free and not "E",
  // otherwise the prefix followed by the first free numeric suffix.
  std::string sentIdStr = sentIdPrefix;
  const bool prefixAlreadyUsed =
      find(m_sentIds.begin(), m_sentIds.end(), sentIdStr) != m_sentIds.end();
  if (prefixAlreadyUsed || sentIdStr == "E")
  {
    for (uint64_t suffix = 1; ; ++suffix)
    {
      std::stringstream candidate;
      candidate << sentIdPrefix << suffix;
      sentIdStr = candidate.str();
      if (find(m_sentIds.begin(), m_sentIds.end(), sentIdStr) == m_sentIds.end())
      {
        break;
      }
    }
  }

  LDEBUG << "EasyXmlDumper:: searching and extracting vertices and relations";
  // The extractor takes non-const graphs: drop the constness of both.
  LinguisticGraph* mutableAnaGraph = const_cast<LinguisticGraph*>(anaGraph.getGraph());
  LinguisticGraph* mutablePosGraph = const_cast<LinguisticGraph*>(posGraph.getGraph());
  ConstituantAndRelationExtractor care(m_propertyCodeManager);
  care.visitBoostGraph(begin, end,
                       *mutableAnaGraph, *mutablePosGraph,
                       annotationData, syntacticData,
                       fullTokens, alreadyDumpedTokens,
                       m_language);

  LDEBUG << "EasyXmlDumper:: all found vertices and relations extracted";
  // Post-processing steps of the extractor, in their original order.
  care.replaceSEWithCompounds();
  care.constructionDesRelationsEntrantes();
  care.splitCompoundTenses();
  care.constructionDesGroupes();
  care.addLastFormsInGroups();

  // Dump into a temporary buffer so that nothing is emitted for empty output.
  EasyDumper ed(care, m_typeMapping, m_srcTag, m_tgtTag, sentIdStr);
  std::stringstream sentEasyStream;
  ed.dump(sentEasyStream);
  const std::string dumpedSentence = sentEasyStream.str();
  if (dumpedSentence.length() > 0)
  {
    // This method is const: cast the constness away to record the id used.
    const_cast<EasyXmlDumper*>(this)->m_sentIds.push_back(sentIdStr);
    os << "<E id=\"" << sentIdStr << "\">" << std::endl;
    os << dumpedSentence;
    os << "</E>" << std::endl;
  }
}
Exemplo n.º 6
0
/**
 * Dumps the analysis in Easy XML format to the analysis handler named by
 * m_handler.
 *
 * The graph named m_graph, "SyntacticData" and "AnnotationData" are created
 * and stored in @p analysis when they are missing. When both "AnalysisGraph"
 * and "PosGraph" are available, the whole PosGraph (first to last vertex) is
 * dumped through dumpLimaData(), using the "docid" metadata as sentence id
 * prefix ("E" when it is missing or empty).
 *
 * @param analysis analysis content to dump
 * @return MISSING_DATA when the metadata, the handler container or the
 *         handler is not available; SUCCESS_ID otherwise
 */
LimaStatusCode EasyXmlDumper::process(AnalysisContent& analysis) const
{
  TimeUtils::updateCurrentTime();
  DUMPERLOGINIT;

  LinguisticMetaData* metadata = static_cast<LinguisticMetaData*>(analysis.getData("LinguisticMetaData"));
  if (metadata == 0) {
    LERROR << "EasyXmlDumper::process no LinguisticMetaData ! abort";
    return MISSING_DATA;
  }
  string filename = metadata->getMetaData("FileName");
  LDEBUG << "EasyXmlDumper::process Filename: " << filename;

  LDEBUG << "handler will be: " << m_handler;
  AnalysisHandlerContainer* h = static_cast<AnalysisHandlerContainer*>(analysis.getData("AnalysisHandlerContainer"));
  // The container is dereferenced just below: refuse to go on without it.
  if (h == 0)
  {
    LERROR << "EasyXmlDumper::process: no AnalysisHandlerContainer ! abort";
    return MISSING_DATA;
  }
  AbstractTextualAnalysisHandler* handler = static_cast<AbstractTextualAnalysisHandler*>(h->getHandler(m_handler));
  if (handler==0)
  {
    LERROR << "EasyXmlDumper::process: handler " << m_handler << " has not been given to the core client";
    return MISSING_DATA;
  }

  // Create the data that are missing; each new object is stored in `analysis`.
  AnalysisGraph* graph = static_cast<AnalysisGraph*>(analysis.getData(m_graph));
  if (graph == 0)
  {
    graph = new AnalysisGraph(m_graph,m_language,true,true);
    analysis.setData(m_graph,graph);
  }

  SyntacticData* syntacticData = static_cast<SyntacticData*>(analysis.getData("SyntacticData"));
  if (syntacticData == 0)
  {
    syntacticData = new SyntacticAnalysis::SyntacticData(static_cast<AnalysisGraph*>(analysis.getData(m_graph)),0);
    syntacticData->setupDependencyGraph();
    analysis.setData("SyntacticData",syntacticData);
  }

  AnnotationData* annotationData = static_cast< AnnotationData* >(analysis.getData("AnnotationData"));
  if (annotationData == 0)
  {
    annotationData = new AnnotationData();
    if (static_cast<AnalysisGraph*>(analysis.getData("AnalysisGraph")) != 0)
    {
      static_cast<AnalysisGraph*>(analysis.getData("AnalysisGraph"))->populateAnnotationGraph(annotationData, "AnalysisGraph");
    }
    analysis.setData("AnnotationData",annotationData);
  }

  handler->startAnalysis();
  // Route a standard output stream to the handler through its stream buffer.
  HandlerStreamBuf hsb(handler);
  std::ostream outputStream(&hsb);

  LDEBUG << "EasyXmlDumper:: process before printing heading";
  AnalysisGraph* anaGraph = static_cast<AnalysisGraph*>(analysis.getData("AnalysisGraph"));
  AnalysisGraph* posGraph = static_cast<AnalysisGraph*>(analysis.getData("PosGraph"));
  if (anaGraph != 0 && posGraph != 0)
  {
    LDEBUG << "EasyXmlDumper:: begin of posgraph";
    // Number the PosGraph vertices in iteration order: each token is mapped
    // to its number and every "already dumped" flag starts at false.
    std::vector< bool > alreadyDumpedTokens;
    std::map< LinguisticAnalysisStructure::Token*, uint64_t > fullTokens;
    LinguisticGraphVertexIt i, i_end;
    uint64_t id = 0;
    alreadyDumpedTokens.assign(num_vertices(*posGraph->getGraph()), false);
    for (boost::tie(i, i_end) = vertices(*posGraph->getGraph()); i != i_end; ++i)
    {
      LDEBUG << "EasyXmlDumper:: examine posgraph for " << id;
      fullTokens[get(vertex_token, *posGraph->getGraph(), *i)] = id;
      id++;
    }

    // Sentence boundaries are not used here: the whole PosGraph is dumped
    // as a single span.
    LinguisticGraphVertex sentenceBegin = posGraph->firstVertex();
    LinguisticGraphVertex sentenceEnd = posGraph->lastVertex();
    string sentIdPrefix;
    try {
      sentIdPrefix = metadata->getMetaData("docid");
      LDEBUG << "EasyXmlDumper:: retrieve sentence id " << sentIdPrefix;
    }catch (LinguisticProcessingException& ) {
      sentIdPrefix = "";
    }
    if(sentIdPrefix.empty())
      sentIdPrefix = "E";

    LDEBUG << "EasyXmlDumper:: inside posgraph while ";
    dumpLimaData(outputStream,
                  sentenceBegin,
                  sentenceEnd,
                  *anaGraph,
                  *posGraph,
                  *annotationData,
                  *syntacticData,
                  "PosGraph",
                  alreadyDumpedTokens,
                  fullTokens,
                  sentIdPrefix);
    LDEBUG << "EasyXmlDumper:: end of posgraph";
  }

  // startAnalysis() was called above: push any buffered output to the
  // handler and close the analysis, whether or not something was dumped.
  // NOTE(review): confirm no caller already ends the analysis itself.
  outputStream.flush();
  handler->endAnalysis();

  return SUCCESS_ID;
}
/**
 * Chooses between the dependency relation stored for the selectional
 * constraint and the candidate relation between @p v1 and @p v2, and removes
 * the stored one when the candidate is preferred.
 *
 * The probabilities of both relations are obtained from m_preferences, using
 * the first lemma and first "MACRO" value of each vertex; the candidate is
 * preferred when its probability is strictly greater.
 *
 * @param graph    graph whose first and last vertices are never accepted
 * @param v1       first vertex of the candidate relation
 * @param v2       second vertex of the candidate relation
 * @param analysis analysis content providing "SyntacticData" and "PosGraph"
 * @return the result of removing the stored dependency when the candidate is
 *         preferred; false otherwise, including when v1 or v2 is a boundary
 *         vertex or when required data are missing
 */
bool DisambiguateWith::operator()(const AnalysisGraph& graph,
                            const LinguisticGraphVertex& v1,
                            const LinguisticGraphVertex& v2,
                            AnalysisContent& analysis) const
{
/*
  Critical function : comment logging messages
*/
  SAPLOGINIT;
  LDEBUG << "DisambiguateWith " << v1 << ", " << v2 << LENDL;
  SyntacticData* syntacticData=static_cast<SyntacticData*>(analysis.getData("SyntacticData"));
  if (syntacticData == 0)
  {
    LERROR << "no 'SyntacticData' available !" << LENDL;
    return false;
  }
  SyntacticData::Relation oldRelation = syntacticData->relationStoredForSelectionalConstraint();
  FsaStringsPool& sp=Common::MediaticData::MediaticData::changeable().stringsPool(m_language);

  if (v1 == graph.firstVertex() || v1 == graph.lastVertex() ||
    v2 == graph.firstVertex() || v2 == graph.lastVertex() )
  {
    return false;
  }

  AnalysisGraph* posgraph=static_cast<AnalysisGraph*>(analysis.getData("PosGraph"));
  if (posgraph==0)
  {
    LERROR << "no graph 'PosGraph' available !" << LENDL;
    // This function returns a bool: a status code such as MISSING_DATA would
    // convert to true if it is non-zero, i.e. report success.
    return false;
  }
  LinguisticGraph* lingGraph = const_cast<LinguisticGraph*>(posgraph->getGraph());
  VertexDataPropertyMap dataMap = get(vertex_data, *lingGraph);
  MorphoSyntacticData* v1Data = dataMap[v1];
  MorphoSyntacticData* v2Data = dataMap[v2];
  MorphoSyntacticData* ov1Data = dataMap[oldRelation.get<0>()];
  MorphoSyntacticData* ov2Data = dataMap[oldRelation.get<1>()];
  // All four are dereferenced below; without them no preference can be
  // computed, so keep the stored relation.
  if (v1Data == 0 || v2Data == 0 || ov1Data == 0 || ov2Data == 0)
  {
    LDEBUG << "DisambiguateWith: missing morphosyntactic data" << LENDL;
    return false;
  }

  const Common::MediaticData::LanguageData& languageData = static_cast<const Common::MediaticData::LanguageData&>(Common::MediaticData::MediaticData::single().mediaData(m_language));

  // compute the preferred attachment
  // NOTE(review): the first lemma / first MACRO value of each vertex are
  // used, which presumes these sets are never empty - confirm.
  const PropertyAccessor& macroAccessor = languageData.getPropertyCodeManager().getPropertyAccessor("MACRO");

  LDEBUG << "DisambiguateWith " << Common::Misc::limastring2utf8stdstring(sp[*(v2Data->allLemma().begin())])
      << ", " << Common::Misc::limastring2utf8stdstring(sp[*(ov2Data->allLemma().begin())])
      << ", " << Common::Misc::limastring2utf8stdstring(sp[*(v1Data->allLemma().begin())]) << LENDL;
  std::string oldRelationName = languageData.getSyntacticRelationName(oldRelation.get<2>());
  double oldProba = m_preferences->dependencyProbability(Common::Misc::limastring2utf8stdstring(sp[*(ov2Data->allLemma().begin())]),
                                                         *(ov2Data->allValues(macroAccessor).begin()),
                                                         oldRelationName,
                                                         Common::Misc::limastring2utf8stdstring(sp[*(ov1Data->allLemma().begin())]),
                                                         *(ov1Data->allValues(macroAccessor).begin()));
  double newProba = m_preferences->dependencyProbability(Common::Misc::limastring2utf8stdstring(sp[*(v2Data->allLemma().begin())]),
                                                         *(v2Data->allValues(macroAccessor).begin()),
                                                         Common::Misc::limastring2utf8stdstring(m_complement),
                                                         Common::Misc::limastring2utf8stdstring(sp[*(v1Data->allLemma().begin())]),
                                                         *(v1Data->allValues(macroAccessor).begin()));

  LDEBUG << "Old proba=" << oldProba << "; new proba=" << newProba << LENDL;
  double preference = newProba - oldProba;

  // if the old relation is preferred (or there is no preference): do nothing
  // else remove the old dependency and report the result of the removal
  if (preference > 0) /// @TODO implement the test
  {
    return syntacticData->removeDependency(oldRelation.get<0>(), oldRelation.get<1>(), oldRelation.get<2>());
  }

  return false;
}