Пример #1
0
//! populate this InputType with data from in stream
int
TreeInput::
Read(std::istream& in, const std::vector<FactorType>& factorOrder,
     AllOptions const& opts)
{
    const StaticData &staticData = StaticData::Instance();

    string line;
    if (getline(in, line, '\n').eof())
        return 0;
    // remove extra spaces
    //line = Trim(line);

    m_labelledSpans.clear();
    ProcessAndStripXMLTags(line, m_labelledSpans, m_xmlOptions);

    // do words 1st - hack
    stringstream strme;
    strme << line << endl;

    Sentence::Read(strme, factorOrder, opts);

    // size input chart
    size_t sourceSize = GetSize();
    m_sourceChart.resize(sourceSize);

    for (size_t pos = 0; pos < sourceSize; ++pos) {
        m_sourceChart[pos].resize(sourceSize - pos);
    }

    // do source labels
    vector<XMLParseOutput>::const_iterator iterLabel;
    for (iterLabel = m_labelledSpans.begin(); iterLabel != m_labelledSpans.end(); ++iterLabel) {
        const XMLParseOutput &labelItem = *iterLabel;
        const Range &range = labelItem.m_range;
        const string &label = labelItem.m_label;
        AddChartLabel(range.GetStartPos() + 1, range.GetEndPos() + 1, label, factorOrder);
    }

    // default label
    for (size_t startPos = 0; startPos < sourceSize; ++startPos) {
        for (size_t endPos = startPos; endPos < sourceSize; ++endPos) {
            NonTerminalSet &list = GetLabelSet(startPos, endPos);
            if (list.size() == 0 || !staticData.GetDefaultNonTermOnlyForEmptyRange()) {
                AddChartLabel(startPos, endPos, staticData.GetInputDefaultNonTerminal(), factorOrder);
            }
        }
    }

    return 1;
}
Пример #2
0
/**
 * Parse one line of XML-annotated text into a ParseTree.
 *
 * The line is copied into m_line, run through ProcessAndStripXMLTags() to
 * fill m_tree and the label sets, the tree nodes are connected, and the
 * text left in m_line is tokenized into m_words before the tree is converted.
 *
 * @param line  the annotated input line
 * @return the tree built by ConvertTree() from the top node and m_words
 * @throws Exception if tag processing reports failure or throws an
 *         XmlException (its message is forwarded), or if the processed
 *         tree has no top node
 */
std::auto_ptr<ParseTree> XmlTreeParser::Parse(const std::string &line)
{
  m_line = line;
  m_tree.Clear();
  try {
    if (!ProcessAndStripXMLTags(m_line, m_tree, m_labelSet, m_topLabelSet)) {
      // Give callers something to report instead of an empty message.
      throw Exception("XML tag processing failed");
    }
  } catch (const XmlException &e) {
    // Re-throw as the exception type callers of this parser handle.
    throw Exception(e.getMsg());
  }
  m_tree.ConnectNodes();
  SyntaxNode *root = m_tree.GetTop();
  if (!root) {
    // An assert would compile away under NDEBUG and the null pointer would
    // then be dereferenced below; report the problem as a parse failure.
    throw Exception("no root node found in parse tree");
  }
  m_words = tokenize(m_line.c_str());
  return ConvertTree(*root, m_words);
}
/**
 * Load the source side of a sentence pair.
 *
 * Without source syntax the call is delegated unchanged to
 * SentenceAlignment::processSourceSentence(). With source syntax the string
 * is run through ProcessAndStripXMLTags() to fill sourceTree and the source
 * label collections, and the remaining text is tokenized into `source`.
 *
 * @param sourceString   NUL-terminated source sentence (must not be null)
 * @param sentenceID     line number, used only in warning messages here
 * @param boundaryRules  forwarded to the base class; unused on the syntax path
 * @return false if tag processing fails (a warning is written to std::cerr),
 *         true otherwise; on the non-syntax path, whatever the base class returns
 */
bool SentenceAlignmentWithSyntax::processSourceSentence(const char * sourceString, int sentenceID, bool boundaryRules)
{
  if (!m_options.sourceSyntax) {
    return SentenceAlignment::processSourceSentence(sourceString, sentenceID, boundaryRules);
  }

  string sourceStringCPP(sourceString);
  try {
    // Treat a false return as failure as well, rather than silently
    // tokenizing a line whose markup could not be processed.
    if (!ProcessAndStripXMLTags(sourceStringCPP, sourceTree,
                                m_sourceLabelCollection ,
                                m_sourceTopLabelCollection)) {
      std::cerr << "WARNING: failed to process source sentence at line "
                << sentenceID << std::endl;
      return false;
    }
  } catch (const XmlException & e) {
    std::cerr << "WARNING: failed to process source sentence at line "
              << sentenceID << ": " << e.getMsg() << std::endl;
    return false;
  }
  source = tokenize(sourceStringCPP.c_str());
  return true;
}
/**
 * Load the target side of a sentence pair.
 *
 * Without target syntax the call is delegated unchanged to
 * SentenceAlignment::processTargetSentence(). With target syntax the string
 * is run through ProcessAndStripXMLTags() to fill targetTree and the target
 * label collections, and the remaining text is tokenized into `target`.
 *
 * @param targetString   NUL-terminated target sentence (must not be null)
 * @param sentenceID     line number, used only in warning messages here
 * @param boundaryRules  forwarded to the base class; unused on the syntax path
 * @return false if tag processing fails (a warning is written to std::cerr),
 *         true otherwise; on the non-syntax path, whatever the base class returns
 */
bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetString, int sentenceID, bool boundaryRules)
{
  if (!m_options.targetSyntax) {
    return SentenceAlignment::processTargetSentence(targetString, sentenceID, boundaryRules);
  }

  string targetStringCPP(targetString);
  try {
    // Treat a false return as failure as well, rather than silently
    // tokenizing a line whose markup could not be processed.
    if (!ProcessAndStripXMLTags(targetStringCPP, targetTree,
                                m_targetLabelCollection,
                                m_targetTopLabelCollection)) {
      std::cerr << "WARNING: failed to process target sentence at line "
                << sentenceID << std::endl;
      return false;
    }
  } catch (const XmlException & e) {
    std::cerr << "WARNING: failed to process target sentence at line "
              << sentenceID << ": " << e.getMsg() << std::endl;
    return false;
  }
  target = tokenize(targetStringCPP.c_str());
  return true;
}
Пример #5
0
/**
 * Read one line from `in` and populate this Sentence from it.
 *
 * Steps, in order:
 *  1. If continue-partial-translation is enabled and the line has the form
 *     "target ||| coverage ||| source", split it, store the initial target
 *     phrase and the per-position coverage flags, and keep only the source.
 *  2. Trim the line and consume its SGML metadata via ProcessAndStripSGML();
 *     the keys read here are "id", "docid", "topic" and "weight-setting".
 *  3. Unless the XML input type is XmlPassThrough, process XML markup via
 *     ProcessAndStripXMLTags().
 *  4. Create the words with Phrase::CreateFromString(), process placeholders
 *     and, for chart decoding, call InitStartEndWord().
 *  5. Fill m_xmlCoverageMap from the XML options and set up reordering walls.
 *
 * @param in           stream from which one '\n'-delimited line is read
 * @param factorOrder  factor order passed to Phrase::CreateFromString()
 * @return 0 when the stream reports end-of-file after the read, 1 otherwise
 * @throws runtime_error if ProcessAndStripXMLTags() returns false
 */
int Sentence::Read(std::istream& in,const std::vector<FactorType>& factorOrder)
{
  // const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
  std::string line;
  std::map<std::string, std::string> meta;

  // NOTE(review): eofbit is also set when the final line has no trailing
  // newline, so such a line is treated as end of input - confirm this is intended.
  if (getline(in, line, '\n').eof())
    return 0;

  //get covered words - if continual-partial-translation is switched on, parse input
  const StaticData &staticData = StaticData::Instance();
  m_frontSpanCoveredLength = 0;
  m_sourceCompleted.resize(0);
  if (staticData.ContinuePartialTranslation()) {
    string initialTargetPhrase;
    string sourceCompletedStr;
    // Offsets of the first two "|||" separators; the check below relies on
    // std::string::npos turning into -1 when stored in an int.
    int loc1 = line.find( "|||", 0 );
    int loc2 = line.find( "|||", loc1 + 3 );
    if (loc1 > -1 && loc2 > -1) {
      // Layout: <initial target phrase> ||| <coverage string> ||| <source line>
      initialTargetPhrase = line.substr(0, loc1);
      sourceCompletedStr = line.substr(loc1 + 3, loc2 - loc1 - 3);
      line = line.substr(loc2 + 3);
      sourceCompletedStr = Trim(sourceCompletedStr);
      initialTargetPhrase = Trim(initialTargetPhrase);
      m_initialTargetPhrase = initialTargetPhrase;
      int len = sourceCompletedStr.size();
      m_sourceCompleted.resize(len);
      // One flag per character of the coverage string: '1' means covered.
      // m_frontSpanCoveredLength counts only the unbroken run of '1's at the front.
      int contiguous = 1;
      for (int i = 0; i < len; ++i) {
        if (sourceCompletedStr.at(i) == '1') {
          m_sourceCompleted[i] = true;
          if (contiguous)
            m_frontSpanCoveredLength ++;
        } else {
          m_sourceCompleted[i] = false;
          contiguous = 0;
        }
      }
    }
  }

  // remove extra spaces
  line = Trim(line);

  // if the sentence is specified as "<seg id=1> ... </seg>", extract the id
  // (and any other metadata keys handled below)
  meta = ProcessAndStripSGML(line);
  if (meta.find("id") != meta.end()) {
    this->SetTranslationId(atol(meta["id"].c_str()));
  }
  if (meta.find("docid") != meta.end()) {
    this->SetDocumentId(atol(meta["docid"].c_str()));
    this->SetUseTopicId(false);
    this->SetUseTopicIdAndProb(false);
  }
  if (meta.find("topic") != meta.end()) {
    // A single token is taken as a topic id; anything else is handed over
    // as-is to SetTopicIdAndProb().
    vector<string> topic_params;
    boost::split(topic_params, meta["topic"], boost::is_any_of("\t "));
    if (topic_params.size() == 1) {
      this->SetTopicId(atol(topic_params[0].c_str()));
      this->SetUseTopicId(true);
      this->SetUseTopicIdAndProb(false);
    } else {
      this->SetTopicIdAndProb(topic_params);
      this->SetUseTopicId(false);
      this->SetUseTopicIdAndProb(true);
    }
  }
  if (meta.find("weight-setting") != meta.end()) {
    // The setting is recorded on this sentence and also pushed into StaticData.
    this->SetWeightSetting(meta["weight-setting"]);
    this->SetSpecifiesWeightSetting(true);
    staticData.SetWeightSetting(meta["weight-setting"]);
  } else {
    this->SetSpecifiesWeightSetting(false);
  }

  // parse XML markup in translation line
  //const StaticData &staticData = StaticData::Instance();
  std::vector< size_t > xmlWalls;
  std::vector< std::pair<size_t, std::string> > placeholders;

  if (staticData.GetXmlInputType() != XmlPassThrough) {
    // Chart decoding passes an offset of 1 - presumably to account for the
    // sentence-start word added by InitStartEndWord() below; confirm.
    int offset = 0;
    if (staticData.IsChart()) {
      offset = 1;
    }

    if (!ProcessAndStripXMLTags(line, m_xmlOptions, m_reorderingConstraint, xmlWalls, placeholders,
                                offset,
                                staticData.GetXmlBrackets().first,
                                staticData.GetXmlBrackets().second)) {
      const string msg("Unable to parse XML in line: " + line);
      TRACE_ERR(msg << endl);
      throw runtime_error(msg);
    }
  }

  // Phrase::CreateFromString(Input, factorOrder, line, factorDelimiter, NULL);
  Phrase::CreateFromString(Input, factorOrder, line, NULL);

  // placeholders
  ProcessPlaceholders(placeholders);

  if (staticData.IsChart()) {
    InitStartEndWord();
  }

  //now that we have final word positions in phrase (from CreateFromString),
  //we can make input phrase objects to go with our XmlOptions and create TranslationOptions

  //only fill the vector if we are parsing XML
  if (staticData.GetXmlInputType() != XmlPassThrough ) {
    // Append one "not covered" entry per word of the sentence.
    for (size_t i=0; i<GetSize(); i++) {
      m_xmlCoverageMap.push_back(false);
    }

    //iterXMLOpts will be empty for XmlIgnore
    //look at each column
    for(std::vector<XmlOption*>::const_iterator iterXmlOpts = m_xmlOptions.begin();
        iterXmlOpts != m_xmlOptions.end(); iterXmlOpts++) {

      const XmlOption *xmlOption = *iterXmlOpts;
      const WordsRange &range = xmlOption->range;

      // Mark every position spanned by this XML option (end position inclusive).
      for(size_t j=range.GetStartPos(); j<=range.GetEndPos(); j++) {
        m_xmlCoverageMap[j]=true;
      }
    }

  }

  // reordering walls and zones
  m_reorderingConstraint.InitializeWalls( GetSize() );

  // set reordering walls, if "-monotone-at-punctuation" is set
  if (staticData.UseReorderingConstraint() && GetSize()>0) {
    m_reorderingConstraint.SetMonotoneAtPunctuation( GetSubString( WordsRange(0,GetSize()-1 ) ) );
  }

  // set walls obtained from xml
  for(size_t i=0; i<xmlWalls.size(); i++)
    if( xmlWalls[i] < GetSize() ) // ignore walls positioned beyond the end of the sentence
      m_reorderingConstraint.SetWall( xmlWalls[i], true );
  m_reorderingConstraint.FinalizeWalls();

  return 1;
}
Пример #6
0
//! populate this InputType with data from in stream
int TreeInput::Read(std::istream& in,const std::vector<FactorType>& factorOrder)
{
  const StaticData &staticData = StaticData::Instance();

  string line;
  if (getline(in, line, '\n').eof())
    return 0;
  // remove extra spaces
  //line = Trim(line);

  std::vector<XMLParseOutput> sourceLabels;
  std::vector<XmlOption*> xmlOptionsList;
  ProcessAndStripXMLTags(line, sourceLabels, xmlOptionsList);

  // do words 1st - hack
  stringstream strme;
  strme << line << endl;

  Sentence::Read(strme, factorOrder);

  // size input chart
  size_t sourceSize = GetSize();
  m_sourceChart.resize(sourceSize);

  for (size_t pos = 0; pos < sourceSize; ++pos) {
    m_sourceChart[pos].resize(sourceSize - pos);
  }

  // do source labels
  vector<XMLParseOutput>::const_iterator iterLabel;
  for (iterLabel = sourceLabels.begin(); iterLabel != sourceLabels.end(); ++iterLabel) {
    const XMLParseOutput &labelItem = *iterLabel;
    const WordsRange &range = labelItem.m_range;
    const string &label = labelItem.m_label;
    AddChartLabel(range.GetStartPos() + 1, range.GetEndPos() + 1, label, factorOrder);
  }

  // default label
  for (size_t startPos = 0; startPos < sourceSize; ++startPos) {
    for (size_t endPos = startPos; endPos < sourceSize; ++endPos) {
      AddChartLabel(startPos, endPos, staticData.GetInputDefaultNonTerminal(), factorOrder);
    }
  }

  // XML Options

  //only fill the vector if we are parsing XML
  if (staticData.GetXmlInputType() != XmlPassThrough ) {
    //TODO: needed to handle exclusive
    //for (size_t i=0; i<GetSize(); i++) {
    //  m_xmlCoverageMap.push_back(false);
    //}

    //iterXMLOpts will be empty for XmlIgnore
    //look at each column
    for(std::vector<XmlOption*>::const_iterator iterXmlOpts = xmlOptionsList.begin();
        iterXmlOpts != xmlOptionsList.end(); iterXmlOpts++) {

      const XmlOption *xmlOption = *iterXmlOpts;
      TargetPhrase *targetPhrase = new TargetPhrase(xmlOption->targetPhrase);
      *targetPhrase = xmlOption->targetPhrase; // copy everything
      WordsRange *range = new WordsRange(xmlOption->range);
      const StackVec emptyStackVec; // hmmm... maybe dangerous, but it is never consulted

      TargetPhraseCollection *tpc = new TargetPhraseCollection;
      tpc->Add(targetPhrase);

      ChartTranslationOptions *transOpt = new ChartTranslationOptions(*tpc, emptyStackVec, *range, 0.0f);
      m_xmlChartOptionsList.push_back(transOpt);

      //TODO: needed to handle exclusive
      //for(size_t j=transOpt->GetSourceWordsRange().GetStartPos(); j<=transOpt->GetSourceWordsRange().GetEndPos(); j++) {
      //  m_xmlCoverageMap[j]=true;
      //}

      delete xmlOption;
    }

  }

  return 1;
}