/*
 * Given a previous state, compute Bleu score for the updated state with an additional target
 * phrase translated.
 */
FFState* BleuScoreFeature::Evaluate(const Hypothesis& cur_hypo,
                                    const FFState* prev_state,
                                    ScoreComponentCollection* accumulator) const
{
    if (!m_enabled) return new BleuScoreState();

    NGrams::const_iterator reference_ngrams_iter;
    const BleuScoreState& ps = dynamic_cast<const BleuScoreState&>(*prev_state);
    BleuScoreState* new_state = new BleuScoreState(ps);

    float old_bleu, new_bleu;
    size_t num_new_words, ctx_start_idx, ctx_end_idx;

    // Calculate old bleu.
    old_bleu = CalculateBleu(new_state);

    // Get context and append new words.
    num_new_words = cur_hypo.GetCurrTargetLength();
    if (num_new_words == 0) {
        return new_state;
    }

    Phrase new_words = ps.m_words;
    new_words.Append(cur_hypo.GetCurrTargetPhrase());
    //cerr << "NW: " << new_words << endl;

    // get ngram matches for new words
    GetNgramMatchCounts(new_words,
                        m_cur_ref_ngrams,
                        new_state->m_ngram_counts,
                        new_state->m_ngram_matches,
                        new_state->m_words.GetSize()); // number of words in previous state

    // Update state variables
    ctx_end_idx = new_words.GetSize()-1;
    size_t bleu_context_length = BleuScoreState::bleu_order -1;
    if (ctx_end_idx > bleu_context_length) {
        ctx_start_idx = ctx_end_idx - bleu_context_length;
    } else {
        ctx_start_idx = 0;
    }

    WordsBitmap coverageVector = cur_hypo.GetWordsBitmap();
    new_state->m_source_length = coverageVector.GetNumWordsCovered();

    new_state->m_words = new_words.GetSubString(WordsRange(ctx_start_idx,
                         ctx_end_idx));
    new_state->m_target_length += cur_hypo.GetCurrTargetLength();

    // we need a scaled reference length to compare the current target phrase to the corresponding reference phrase
    new_state->m_scaled_ref_length = m_cur_ref_length *
                                     ((float)coverageVector.GetNumWordsCovered()/coverageVector.GetSize());

    // Calculate new bleu.
    new_bleu = CalculateBleu(new_state);

    // Set score to new Bleu score
    accumulator->PlusEquals(this, new_bleu - old_bleu);
    return new_state;
}
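// The state carries everything a smoothed sentence-level BLEU needs: per-order
// n-gram counts and matches, the target length, and the scaled reference
// length. Below is a minimal, self-contained sketch of such a computation
// (SketchSmoothedBleu is a hypothetical helper, not the actual CalculateBleu;
// the smoothing constants here are illustrative and Moses' details differ):
#include <cmath>
#include <vector>

static float SketchSmoothedBleu(const std::vector<size_t>& counts,
                                const std::vector<size_t>& matches,
                                float target_length,
                                float scaled_ref_length)
{
    float log_bleu = 0;
    for (size_t order = 0; order < counts.size(); ++order) {
        // smooth so that zero matches do not zero out the whole score
        float precision = (matches[order] + 0.1f) / (counts[order] + 0.1f);
        log_bleu += std::log(precision);
    }
    log_bleu /= counts.size(); // geometric mean over n-gram orders
    // brevity penalty against the scaled reference length
    if (target_length < scaled_ref_length)
        log_bleu += 1.0f - scaled_ref_length / target_length;
    return std::exp(log_bleu);
}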
/** constructor; just initialize the base class */
TranslationOptionCollectionText::TranslationOptionCollectionText(Sentence const &input, size_t maxNoTransOptPerCoverage, float translationOptionThreshold)
  : TranslationOptionCollection(input, maxNoTransOptPerCoverage, translationOptionThreshold)
{
  size_t size = input.GetSize();
  m_inputPathMatrix.resize(size);
  for (size_t phraseSize = 1; phraseSize <= size; ++phraseSize) {
    for (size_t startPos = 0; startPos < size - phraseSize + 1; ++startPos) {
      size_t endPos = startPos + phraseSize - 1;
      vector<InputPath*> &vec = m_inputPathMatrix[startPos];

      WordsRange range(startPos, endPos);
      Phrase subphrase(input.GetSubString(WordsRange(startPos, endPos)));
      const NonTerminalSet &labels = input.GetLabelSet(startPos, endPos);

      InputPath *node;
      if (range.GetNumWordsCovered() == 1) {
        node = new InputPath(subphrase, labels, range, NULL, NULL);
        vec.push_back(node);
      } else {
        const InputPath &prevNode = GetInputPath(startPos, endPos - 1);
        node = new InputPath(subphrase, labels, range, &prevNode, NULL);
        vec.push_back(node);
      }

      m_phraseDictionaryQueue.push_back(node);
    }
  }
}
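// Layout sketch: after construction, m_inputPathMatrix[startPos] holds the
// paths starting at startPos ordered by increasing length, e.g. for a 3-word
// input m_inputPathMatrix[0] = { [0,0], [0,1], [0,2] } and
// m_inputPathMatrix[1] = { [1,1], [1,2] }; each multi-word path links back to
// the path that is one word shorter (GetInputPath(startPos, endPos - 1)).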
// score ngrams around the overlap of two previously scored phrases
void BleuScoreFeature::GetNgramMatchCounts_overlap(Phrase& phrase,
        const NGrams& ref_ngram_counts,
        std::vector< size_t >& ret_counts,
        std::vector< size_t >& ret_matches,
        size_t overlap_index) const
{
    NGrams::const_iterator ref_ngram_counts_iter;
    size_t ngram_start_idx, ngram_end_idx;

    // Chiang et al (2008) use unclipped counts of ngram matches
    for (size_t end_idx = overlap_index; end_idx < phrase.GetSize(); end_idx++) {
        if (end_idx >= (overlap_index+BleuScoreState::bleu_order-1)) break;
        for (size_t order = 0; order < BleuScoreState::bleu_order; order++) {
            if (order > end_idx) break;

            ngram_end_idx = end_idx;
            ngram_start_idx = end_idx - order;
            if (ngram_start_idx >= overlap_index) continue; // only score ngrams that span the overlap point

            Phrase ngram = phrase.GetSubString(WordsRange(ngram_start_idx, ngram_end_idx), 0);
            ret_counts[order]++;

            ref_ngram_counts_iter = ref_ngram_counts.find(ngram);
            if (ref_ngram_counts_iter != ref_ngram_counts.end())
                ret_matches[order]++;
        }
    }
}
/*
 * Given a phrase (current translation) calculate its ngram counts and
 * its ngram matches against the ngrams in the reference translation
 */
void BleuScoreFeature::GetNgramMatchCounts(Phrase& phrase,
        const NGrams& ref_ngram_counts,
        std::vector< size_t >& ret_counts,
        std::vector< size_t >& ret_matches,
        size_t skip_first) const
{
    NGrams::const_iterator ref_ngram_counts_iter;
    size_t ngram_start_idx, ngram_end_idx;

    // Chiang et al (2008) use unclipped counts of ngram matches
    for (size_t end_idx = skip_first; end_idx < phrase.GetSize(); end_idx++) {
        for (size_t order = 0; order < BleuScoreState::bleu_order; order++) {
            if (order > end_idx) break;

            ngram_end_idx = end_idx;
            ngram_start_idx = end_idx - order;

            Phrase ngram = phrase.GetSubString(WordsRange(ngram_start_idx, ngram_end_idx), 0);
            ret_counts[order]++;

            ref_ngram_counts_iter = ref_ngram_counts.find(ngram);
            if (ref_ngram_counts_iter != ref_ngram_counts.end())
                ret_matches[order]++;
        }
    }
}
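// Worked example: for a 3-word phrase "a b c" with bleu_order = 4 and
// skip_first = 0, the loops above enumerate, for each end position, every
// n-gram ending there: [a]; [b], [a b]; [c], [b c], [a b c]. That gives
// ret_counts = {3, 2, 1, 0}, and ret_matches[k] counts how many of the
// order-(k+1) n-grams also occur in ref_ngram_counts.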
// score ngrams of words that have been added before the previous word span
void BleuScoreFeature::GetNgramMatchCounts_prefix(Phrase& phrase,
        const NGrams& ref_ngram_counts,
        std::vector< size_t >& ret_counts,
        std::vector< size_t >& ret_matches,
        size_t new_start_indices,
        size_t last_end_index) const
{
    NGrams::const_iterator ref_ngram_counts_iter;
    size_t ngram_start_idx, ngram_end_idx;

    // Chiang et al (2008) use unclipped counts of ngram matches
    for (size_t start_idx = 0; start_idx < new_start_indices; start_idx++) {
        for (size_t order = 0; order < BleuScoreState::bleu_order; order++) {
            ngram_start_idx = start_idx;
            ngram_end_idx = start_idx + order;
            if (order > ngram_end_idx) break;
            if (ngram_end_idx > last_end_index) break;

            Phrase ngram = phrase.GetSubString(WordsRange(ngram_start_idx, ngram_end_idx), 0);
            ret_counts[order]++;

            ref_ngram_counts_iter = ref_ngram_counts.find(ngram);
            if (ref_ngram_counts_iter != ref_ngram_counts.end())
                ret_matches[order]++;
        }
    }
}
void ChartParser::CreateInputPaths(const InputType &input)
{
  size_t size = input.GetSize();
  m_inputPathMatrix.resize(size);

  UTIL_THROW_IF2(input.GetType() != SentenceInput && input.GetType() != TreeInputType,
		  "Input must be a sentence or a tree, not lattice or confusion networks");
  for (size_t phraseSize = 1; phraseSize <= size; ++phraseSize) {
    for (size_t startPos = 0; startPos < size - phraseSize + 1; ++startPos) {
      size_t endPos = startPos + phraseSize - 1;
      vector<InputPath*> &vec = m_inputPathMatrix[startPos];

      WordsRange range(startPos, endPos);
      Phrase subphrase(input.GetSubString(WordsRange(startPos, endPos)));
      const NonTerminalSet &labels = input.GetLabelSet(startPos, endPos);

      InputPath *node;
      if (range.GetNumWordsCovered() == 1) {
        node = new InputPath(subphrase, labels, range, NULL, NULL);
        vec.push_back(node);
      } else {
        const InputPath &prevNode = GetInputPath(startPos, endPos - 1);
        node = new InputPath(subphrase, labels, range, &prevNode, NULL);
        vec.push_back(node);
      }

      //m_inputPathQueue.push_back(node);
    }
  }
}
std::vector<float>  LexicalReorderingTableMemory::GetScore(const Phrase& f,
    const Phrase& e,
    const Phrase& c)
{
  //rather complicated: because this method is const we can't use operator[], as [] might insert new entries into the std::map
  //also be careful with the words range: if c is empty, c.GetSize()-1 underflows and becomes a huge value
  TableType::const_iterator r;
  std::string key;
  if(0 == c.GetSize()) {
    key = MakeKey(f,e,c);
    r = m_Table.find(key);
    if(m_Table.end() != r) {
      return r->second;
    }
  } else {
    //try contexts from largest to smallest
    for(size_t i = 0; i <= c.GetSize(); ++i) {
      Phrase sub_c(c.GetSubString(WordsRange(i,c.GetSize()-1)));
      key = MakeKey(f,e,sub_c);
      r = m_Table.find(key);
      if(m_Table.end() != r) {
        return r->second;
      }
    }
  }
  return Scores();
}
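// Note the backoff order: the loop above drops words from the left of the
// context c, so for c = "x y z" it looks up the keys for "x y z", then
// "y z", then "z", returning the scores of the first key found.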
std::vector<float> LexicalReorderingTableCompact::GetScore(const Phrase& f,
    const Phrase& e,
    const Phrase& c)
{
  // collect candidate keys: the (possibly empty) context as-is, or every
  // left-truncated sub-context from largest to smallest
  std::vector<std::string> keys;
  if(0 == c.GetSize())
    keys.push_back(MakeKey(f, e, c));
  else
    for(size_t i = 0; i <= c.GetSize(); ++i) {
      Phrase sub_c(c.GetSubString(WordsRange(i, c.GetSize()-1)));
      keys.push_back(MakeKey(f, e, sub_c));
    }

  // look each candidate up in turn and return the scores of the first hit
  for(size_t k = 0; k < keys.size(); ++k) {
    size_t index = m_hash[keys[k]];
    if(m_hash.GetSize() != index) {
      std::string scoresString;
      if(m_inMemory)
        scoresString = m_scoresMemory[index];
      else
        scoresString = m_scoresMapped[index];

      Scores scores;
      BitWrapper<> bitStream(scoresString);
      for(size_t i = 0; i < m_numScoreComponent; i++)
        scores.push_back(m_scoreTrees[m_multipleScoreTrees ? i : 0]->Read(bitStream));

      return scores;
    }
  }

  return Scores();
}
WordsRange TrellisPath::GetTargetWordsRange(const Hypothesis &hypo) const
{
  size_t startPos = 0;

  for (int indEdge = (int) m_path.size() - 1 ; indEdge >= 0 ; --indEdge) {
    const Hypothesis *currHypo = m_path[indEdge];
    size_t endPos = startPos + currHypo->GetCurrTargetLength() - 1;

    if (currHypo == &hypo) {
      return WordsRange(startPos, endPos);
    }
    startPos = endPos + 1;
  }

  // the given hypothesis must be in the trellis path; if we get here, it wasn't
  CHECK(false);
  return WordsRange(NOT_FOUND, NOT_FOUND);
}
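// Note: m_path stores its edges ending with the earliest hypothesis, so the
// loop above walks it back to front, accumulating target lengths to recover
// each hypothesis' absolute target-side word positions.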
void LexicalReorderingTableTree::Cache(const Sentence& input)
{
  //only works with sentences...
  size_t prev_cache_size = m_Cache.size();
  size_t max_phrase_length = input.GetSize();
  for(size_t len = 0; len < max_phrase_length; ++len) {
    for(size_t start = 0; start+len < input.GetSize(); ++start) {
      // WordsRange end position is inclusive, so this covers len+1 words
      Phrase f = input.GetSubString(WordsRange(start, start+len));
      auxCacheForSrcPhrase(f);
    }
  }
  std::cerr << "Cached " << m_Cache.size() - prev_cache_size << " new primary reordering table keys\n";
}
/** Score due to flip. Again, left and right refer to order on the <emph>target</emph> side. */
void LexicalReorderingFeatureFunction::doFlipUpdate(
  const TranslationOption* leftOption,
  const TranslationOption* rightOption, 
  const TargetGap& leftGap, const TargetGap& rightGap, FVector& scores) {
  if (leftGap.segment.GetEndPos() + 1 == rightGap.segment.GetStartPos()) {
    TargetGap gap(leftGap.leftHypo,rightGap.rightHypo,
      WordsRange(leftGap.segment.GetStartPos(),rightGap.segment.GetEndPos()));
    doContiguousPairedUpdate(leftOption,rightOption,gap,scores);
  } else {
    doDiscontiguousPairedUpdate(leftOption,rightOption,leftGap,rightGap,scores);
  }
}
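// Example: with leftGap.segment = [2,4] and rightGap.segment = [5,7] the gaps
// are adjacent, so a single contiguous update over the merged span [2,7] is
// performed; otherwise the two gaps are scored independently.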
const FFState* DistortionScoreProducer::EmptyHypothesisState(const InputType &input) const
{
  // fake previous translated phrase start and end
  size_t start = NOT_FOUND;
  size_t end = NOT_FOUND;
  if (input.m_frontSpanCoveredLength > 0) {
    // can happen with --continue-partial-translation
    start = 0;
    end = input.m_frontSpanCoveredLength -1;
  }
  return new DistortionState_traditional(
           WordsRange(start, end),
           NOT_FOUND);
}
/** Score due to flip. Again, left and right refer to order on the <emph>target</emph> side. */
void DiscriminativeLMBigramFeatureFunction::doFlipUpdate(const TranslationOption* leftOption,const TranslationOption* rightOption, 
        const TargetGap& leftGap, const TargetGap& rightGap, FVector& scores) 
{
    if (leftGap.segment.GetEndPos()+1 == rightGap.segment.GetStartPos()) {
        //contiguous
        Phrase gapPhrase(leftOption->GetTargetPhrase());
        gapPhrase.Append(rightOption->GetTargetPhrase());
        TargetGap gap(leftGap.leftHypo, rightGap.rightHypo, 
                      WordsRange(leftGap.segment.GetStartPos(), rightGap.segment.GetEndPos()));
        doUpdate(gapPhrase,gap,scores);
    } else {
        //discontiguous
        doUpdate(leftOption->GetTargetPhrase(), leftGap,scores);
        doUpdate(rightOption->GetTargetPhrase(), rightGap,scores);
    }
}
void BleuScoreFeature::GetClippedNgramMatchesAndCounts(Phrase& phrase,
        const NGrams& ref_ngram_counts,
        std::vector< size_t >& ret_counts,
        std::vector< size_t >& ret_matches,
        size_t skip_first) const
{
    NGrams::const_iterator ref_ngram_counts_iter;
    size_t ngram_start_idx, ngram_end_idx;

    Matches ngram_matches;
    for (size_t end_idx = skip_first; end_idx < phrase.GetSize(); end_idx++) {
        for (size_t order = 0; order < BleuScoreState::bleu_order; order++) {
            if (order > end_idx) break;

            ngram_end_idx = end_idx;
            ngram_start_idx = end_idx - order;

            Phrase ngram = phrase.GetSubString(WordsRange(ngram_start_idx, ngram_end_idx), 0);
            ret_counts[order]++;

            ref_ngram_counts_iter = ref_ngram_counts.find(ngram);
            if (ref_ngram_counts_iter != ref_ngram_counts.end()) {
                ngram_matches[order][ngram]++;
            }
        }
    }

    // clip ngram matches
    for (size_t order = 0; order < BleuScoreState::bleu_order; order++) {
        NGrams::const_iterator iter;

        // iterate over ngram counts for every ngram order
        for (iter=ngram_matches[order].begin(); iter != ngram_matches[order].end(); ++iter) {
            ref_ngram_counts_iter = ref_ngram_counts.find(iter->first);
            if (iter->second > ref_ngram_counts_iter->second) {
                ret_matches[order] += ref_ngram_counts_iter->second;
            } else {
                ret_matches[order] += iter->second;
            }
        }
    }
}
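// Clipping example: if the hypothesis contains the unigram "the" three times
// but the reference contains it only twice, the second loop above adds
// min(3, 2) = 2 to ret_matches[0], preventing repeated words from inflating
// the precision.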
void ChartRuleLookupManagerOnDisk::GetChartRuleCollection(
  const WordsRange &range,
  bool adhereTableLimit,
  ChartTranslationOptionList &outColl)
{
  const StaticData &staticData = StaticData::Instance();
  size_t rulesLimit = staticData.GetRuleLimit();

  size_t relEndPos = range.GetEndPos() - range.GetStartPos();
  size_t absEndPos = range.GetEndPos();

  // MAIN LOOP. create list of nodes of target phrases
  DottedRuleStackOnDisk &expandableDottedRuleList = *m_expandableDottedRuleListVec[range.GetStartPos()];

  // sort saved nodes so we only process the nodes with the most counts
  expandableDottedRuleList.SortSavedNodes();

  const DottedRuleStackOnDisk::SavedNodeColl &savedNodeColl = expandableDottedRuleList.GetSavedNodeColl();
  //cerr << "savedNodeColl=" << savedNodeColl.size() << " ";

  const ChartCellLabel &sourceWordLabel = GetCellCollection().Get(WordsRange(absEndPos, absEndPos)).GetSourceWordLabel();

  for (size_t ind = 0; ind < (savedNodeColl.size()) ; ++ind) {
    const SavedNodeOnDisk &savedNode = *savedNodeColl[ind];

    const DottedRuleOnDisk &prevDottedRule = savedNode.GetDottedRule();
    const OnDiskPt::PhraseNode &prevNode = prevDottedRule.GetLastNode();
    size_t startPos = prevDottedRule.IsRoot() ? range.GetStartPos() : prevDottedRule.GetWordsRange().GetEndPos() + 1;

    // search for terminal symbol
    if (startPos == absEndPos) {
      OnDiskPt::Word *sourceWordBerkeleyDb = m_dbWrapper.ConvertFromMoses(Input, m_inputFactorsVec, sourceWordLabel.GetLabel());

      if (sourceWordBerkeleyDb != NULL) {
        const OnDiskPt::PhraseNode *node = prevNode.GetChild(*sourceWordBerkeleyDb, m_dbWrapper);
        if (node != NULL) {
          // TODO figure out why the source word is needed from the node, not from the sentence;
          // probably to do with factors or non-terminals
          //const Word &sourceWord = node->GetSourceWord();
          DottedRuleOnDisk *dottedRule = new DottedRuleOnDisk(*node, sourceWordLabel, prevDottedRule);
          expandableDottedRuleList.Add(relEndPos+1, dottedRule);

          // cache for cleanup
          m_sourcePhraseNode.push_back(node);
        }

        delete sourceWordBerkeleyDb;
      }
    }

    // search for non-terminals
    size_t endPos, stackInd;
    if (startPos > absEndPos)
      continue;
    else if (startPos == range.GetStartPos() && range.GetEndPos() > range.GetStartPos()) {
      // rule starts at the beginning of the span: cover all but the last word
      // here and extend over the final word in a later iteration
      endPos = absEndPos - 1;
      stackInd = relEndPos;
    } else {
      endPos = absEndPos;
      stackInd = relEndPos + 1;
    }

    // size_t nonTermNumWordsCovered = endPos - startPos + 1;

    // get target nonterminals in this span from chart
    const ChartCellLabelSet &chartNonTermSet =
      GetCellCollection().Get(WordsRange(startPos, endPos)).GetTargetLabelSet();

    //const Word &defaultSourceNonTerm = staticData.GetInputDefaultNonTerminal()
    //                                   ,&defaultTargetNonTerm = staticData.GetOutputDefaultNonTerminal();

    // go through each SOURCE lhs
    const NonTerminalSet &sourceLHSSet = GetSentence().GetLabelSet(startPos, endPos);

    NonTerminalSet::const_iterator iterSourceLHS;
    for (iterSourceLHS = sourceLHSSet.begin(); iterSourceLHS != sourceLHSSet.end(); ++iterSourceLHS) {
      const Word &sourceLHS = *iterSourceLHS;

      OnDiskPt::Word *sourceLHSBerkeleyDb = m_dbWrapper.ConvertFromMoses(Input, m_inputFactorsVec, sourceLHS);

      if (sourceLHSBerkeleyDb == NULL) {
        continue; // vocab not in the phrase table; the node definitely won't be in there
      }

      const OnDiskPt::PhraseNode *sourceNode = prevNode.GetChild(*sourceLHSBerkeleyDb, m_dbWrapper);
      delete sourceLHSBerkeleyDb;

      if (sourceNode == NULL)
        continue; // didn't find source node

      // go through each TARGET lhs
      ChartCellLabelSet::const_iterator iterChartNonTerm;
      for (iterChartNonTerm = chartNonTermSet.begin(); iterChartNonTerm != chartNonTermSet.end(); ++iterChartNonTerm) {
        const ChartCellLabel &cellLabel = *iterChartNonTerm;

        //cerr << sourceLHS << " " << defaultSourceNonTerm << " " << chartNonTerm << " " << defaultTargetNonTerm << endl;

        //bool isSyntaxNonTerm = (sourceLHS != defaultSourceNonTerm) || (chartNonTerm != defaultTargetNonTerm);
        bool doSearch = true; //isSyntaxNonTerm ? nonTermNumWordsCovered <=  maxSyntaxSpan :
        //						nonTermNumWordsCovered <= maxDefaultSpan;

        if (doSearch) {

          OnDiskPt::Word *chartNonTermBerkeleyDb = m_dbWrapper.ConvertFromMoses(Output, m_outputFactorsVec, cellLabel.GetLabel());

          if (chartNonTermBerkeleyDb == NULL)
            continue;

          const OnDiskPt::PhraseNode *node = sourceNode->GetChild(*chartNonTermBerkeleyDb, m_dbWrapper);
          delete chartNonTermBerkeleyDb;

          if (node == NULL)
            continue;

          // found matching entry
          //const Word &sourceWord = node->GetSourceWord();
          DottedRuleOnDisk *dottedRule = new DottedRuleOnDisk(*node, cellLabel, prevDottedRule);
          expandableDottedRuleList.Add(stackInd, dottedRule);

          m_sourcePhraseNode.push_back(node);
        }
      } // for (iterChartNonTerm

      delete sourceNode;

    } // for (iterSourceLHS

    // return list of target phrases
    DottedRuleCollOnDisk &nodes = expandableDottedRuleList.Get(relEndPos + 1);

    // source LHS
    DottedRuleCollOnDisk::const_iterator iterDottedRuleColl;
    for (iterDottedRuleColl = nodes.begin(); iterDottedRuleColl != nodes.end(); ++iterDottedRuleColl) {
      // node of last source word
      const DottedRuleOnDisk &prevDottedRule = **iterDottedRuleColl;
      if (prevDottedRule.Done())
        continue;
      prevDottedRule.Done(true);

      const OnDiskPt::PhraseNode &prevNode = prevDottedRule.GetLastNode();

      //get node for each source LHS
      const NonTerminalSet &lhsSet = GetSentence().GetLabelSet(range.GetStartPos(), range.GetEndPos());
      NonTerminalSet::const_iterator iterLabelSet;
      for (iterLabelSet = lhsSet.begin(); iterLabelSet != lhsSet.end(); ++iterLabelSet) {
        const Word &sourceLHS = *iterLabelSet;

        OnDiskPt::Word *sourceLHSBerkeleyDb = m_dbWrapper.ConvertFromMoses(Input, m_inputFactorsVec, sourceLHS);
        if (sourceLHSBerkeleyDb == NULL)
          continue;

        const TargetPhraseCollection *targetPhraseCollection = NULL;
        const OnDiskPt::PhraseNode *node = prevNode.GetChild(*sourceLHSBerkeleyDb, m_dbWrapper);
        if (node) {
          UINT64 tpCollFilePos = node->GetValue();
          std::map<UINT64, const TargetPhraseCollection*>::const_iterator iterCache = m_cache.find(tpCollFilePos);
          if (iterCache == m_cache.end()) {

            const OnDiskPt::TargetPhraseCollection *tpcollBerkeleyDb = node->GetTargetPhraseCollection(m_dictionary.GetTableLimit(), m_dbWrapper);

            targetPhraseCollection
            = tpcollBerkeleyDb->ConvertToMoses(m_inputFactorsVec
                                               ,m_outputFactorsVec
                                               ,m_dictionary
                                               ,m_weight
                                               ,m_wpProducer
                                               ,*m_languageModels
                                               ,m_filePath
                                               , m_dbWrapper.GetVocab());

            delete tpcollBerkeleyDb;
            m_cache[tpCollFilePos] = targetPhraseCollection;
          } else {
            // just get out of cache
            targetPhraseCollection = iterCache->second;
          }

          assert(targetPhraseCollection);
          if (!targetPhraseCollection->IsEmpty()) {
            outColl.Add(*targetPhraseCollection, prevDottedRule,
                        GetCellCollection(), adhereTableLimit, rulesLimit);
          }

        } // if (node)

        delete node;
        delete sourceLHSBerkeleyDb;
      }
    }
  } // for (size_t ind = 0; ind < savedNodeColl.size(); ++ind)

  outColl.CreateChartRules(rulesLimit);

  //cerr << numDerivations << " ";
}
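// Note: target phrase collections loaded from disk are cached in m_cache,
// keyed by the node's UINT64 file position, so repeated rule applications
// reaching the same on-disk node deserialize (ConvertToMoses) only once.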
int Sentence::Read(std::istream& in,const std::vector<FactorType>& factorOrder)
{
  // const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
  std::string line;
  std::map<std::string, std::string> meta;

  if (getline(in, line, '\n').eof())
    return 0;

  //get covered words - if --continue-partial-translation is switched on, parse input
  const StaticData &staticData = StaticData::Instance();
  m_frontSpanCoveredLength = 0;
  m_sourceCompleted.resize(0);
  if (staticData.ContinuePartialTranslation()) {
    string initialTargetPhrase;
    string sourceCompletedStr;
    string::size_type loc1 = line.find( "|||", 0 );
    string::size_type loc2 = (loc1 == string::npos) ? string::npos : line.find( "|||", loc1 + 3 );
    if (loc1 != string::npos && loc2 != string::npos) {
      initialTargetPhrase = line.substr(0, loc1);
      sourceCompletedStr = line.substr(loc1 + 3, loc2 - loc1 - 3);
      line = line.substr(loc2 + 3);
      sourceCompletedStr = Trim(sourceCompletedStr);
      initialTargetPhrase = Trim(initialTargetPhrase);
      m_initialTargetPhrase = initialTargetPhrase;
      int len = sourceCompletedStr.size();
      m_sourceCompleted.resize(len);
      int contiguous = 1;
      for (int i = 0; i < len; ++i) {
        if (sourceCompletedStr.at(i) == '1') {
          m_sourceCompleted[i] = true;
          if (contiguous)
            m_frontSpanCoveredLength ++;
        } else {
          m_sourceCompleted[i] = false;
          contiguous = 0;
        }
      }
    }
  }

  // remove extra spaces
  line = Trim(line);

  // if the sentence is specified as "<seg id=1> ... </seg>", extract the id
  meta = ProcessAndStripSGML(line);
  if (meta.find("id") != meta.end()) {
    this->SetTranslationId(atol(meta["id"].c_str()));
  }
  if (meta.find("docid") != meta.end()) {
    this->SetDocumentId(atol(meta["docid"].c_str()));
    this->SetUseTopicId(false);
    this->SetUseTopicIdAndProb(false);
  }
  if (meta.find("topic") != meta.end()) {
    vector<string> topic_params;
    boost::split(topic_params, meta["topic"], boost::is_any_of("\t "));
    if (topic_params.size() == 1) {
      this->SetTopicId(atol(topic_params[0].c_str()));
      this->SetUseTopicId(true);
      this->SetUseTopicIdAndProb(false);
    } else {
      this->SetTopicIdAndProb(topic_params);
      this->SetUseTopicId(false);
      this->SetUseTopicIdAndProb(true);
    }
  }
  if (meta.find("weight-setting") != meta.end()) {
    this->SetWeightSetting(meta["weight-setting"]);
    this->SetSpecifiesWeightSetting(true);
    staticData.SetWeightSetting(meta["weight-setting"]);
  } else {
    this->SetSpecifiesWeightSetting(false);
  }

  // parse XML markup in translation line
  //const StaticData &staticData = StaticData::Instance();
  std::vector< size_t > xmlWalls;
  std::vector< std::pair<size_t, std::string> > placeholders;

  if (staticData.GetXmlInputType() != XmlPassThrough) {
    int offset = 0;
    if (staticData.IsChart()) {
      offset = 1;
    }

    if (!ProcessAndStripXMLTags(line, m_xmlOptions, m_reorderingConstraint, xmlWalls, placeholders,
                                offset,
                                staticData.GetXmlBrackets().first,
                                staticData.GetXmlBrackets().second)) {
      const string msg("Unable to parse XML in line: " + line);
      TRACE_ERR(msg << endl);
      throw runtime_error(msg);
    }
  }

  // Phrase::CreateFromString(Input, factorOrder, line, factorDelimiter, NULL);
  Phrase::CreateFromString(Input, factorOrder, line, NULL);

  // placeholders
  ProcessPlaceholders(placeholders);

  if (staticData.IsChart()) {
    InitStartEndWord();
  }

  //now that we have final word positions in phrase (from CreateFromString),
  //we can make input phrase objects to go with our XmlOptions and create TranslationOptions

  //only fill the vector if we are parsing XML
  if (staticData.GetXmlInputType() != XmlPassThrough ) {
    for (size_t i=0; i<GetSize(); i++) {
      m_xmlCoverageMap.push_back(false);
    }

    //iterXMLOpts will be empty for XmlIgnore
    //look at each column
    for(std::vector<XmlOption*>::const_iterator iterXmlOpts = m_xmlOptions.begin();
        iterXmlOpts != m_xmlOptions.end(); iterXmlOpts++) {

      const XmlOption *xmlOption = *iterXmlOpts;
      const WordsRange &range = xmlOption->range;

      for(size_t j=range.GetStartPos(); j<=range.GetEndPos(); j++) {
        m_xmlCoverageMap[j]=true;
      }
    }

  }

  // reordering walls and zones
  m_reorderingConstraint.InitializeWalls( GetSize() );

  // set reordering walls, if "-monotone-at-punctuation" is set
  if (staticData.UseReorderingConstraint() && GetSize()>0) {
    m_reorderingConstraint.SetMonotoneAtPunctuation( GetSubString( WordsRange(0,GetSize()-1 ) ) );
  }

  // set walls obtained from xml
  for(size_t i=0; i<xmlWalls.size(); i++)
    if( xmlWalls[i] < GetSize() ) // no buggy walls, please
      m_reorderingConstraint.SetWall( xmlWalls[i], true );
  m_reorderingConstraint.FinalizeWalls();

  return 1;
}
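// Sketch of a --continue-partial-translation input line (hypothetical data):
//   the house ||| 1100 ||| das haus ist klein
// Read() above sets m_initialTargetPhrase = "the house", m_sourceCompleted =
// {1,1,0,0} and m_frontSpanCoveredLength = 2, then parses the remainder
// "das haus ist klein" as the actual source sentence.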
void ChartRuleLookupManagerMemory::GetChartRuleCollection(
  const WordsRange &range,
  ChartTranslationOptionList &outColl)
{
  size_t relEndPos = range.GetEndPos() - range.GetStartPos();
  size_t absEndPos = range.GetEndPos();

  // MAIN LOOP. create list of nodes of target phrases

  // get list of all rules that apply to spans at same starting position
  DottedRuleColl &dottedRuleCol = *m_dottedRuleColls[range.GetStartPos()];
  const DottedRuleList &expandableDottedRuleList = dottedRuleCol.GetExpandableDottedRuleList();
  
  const ChartCellLabel &sourceWordLabel = GetCellCollection().Get(WordsRange(absEndPos, absEndPos)).GetSourceWordLabel();

  // loop through the rules
  // (note that expandableDottedRuleList can be expanded as the loop runs 
  //  through calls to ExtendPartialRuleApplication())
  for (size_t ind = 0; ind < expandableDottedRuleList.size(); ++ind) {
    // rule we are about to extend
    const DottedRuleInMemory &prevDottedRule = *expandableDottedRuleList[ind];
    // we will now try to extend it, starting after where it ended
    size_t startPos = prevDottedRule.IsRoot()
                    ? range.GetStartPos()
                    : prevDottedRule.GetWordsRange().GetEndPos() + 1;

    // search for terminal symbol
    // (if only one more word position needs to be covered)
    if (startPos == absEndPos) {

      // look up in rule dictionary, if the current rule can be extended
      // with the source word in the last position
      const Word &sourceWord = sourceWordLabel.GetLabel();
      const PhraseDictionaryNodeSCFG *node = prevDottedRule.GetLastNode().GetChild(sourceWord);

      // if we found a new rule -> create it and add it to the list
      if (node != NULL) {
        // create the rule
#ifdef USE_BOOST_POOL
        DottedRuleInMemory *dottedRule = m_dottedRulePool.malloc();
        new (dottedRule) DottedRuleInMemory(*node, sourceWordLabel,
                                            prevDottedRule);
#else
        DottedRuleInMemory *dottedRule = new DottedRuleInMemory(*node,
                                                                sourceWordLabel,
                                                                prevDottedRule);
#endif
        dottedRuleCol.Add(relEndPos+1, dottedRule);
      }
    }

    // search for non-terminals
    size_t endPos, stackInd;

    // span is already completely covered? nothing can be done
    if (startPos > absEndPos)
      continue;

    else if (startPos == range.GetStartPos() && range.GetEndPos() > range.GetStartPos()) {
      // We're at the root of the prefix tree so won't try to cover the full
      // span (i.e. we don't allow non-lexical unary rules).  However, we need
      // to match non-unary rules that begin with a non-terminal child, so we
      // do that in two steps: during this iteration we search for non-terminals
      // that cover all but the last source word in the span (there won't
      // already be running nodes for these because that would have required a
      // non-lexical unary rule match for an earlier span).  Any matches will
      // result in running nodes being appended to the list and on subsequent
      // iterations (for this same span), we'll extend them to cover the final
      // word.
      endPos = absEndPos - 1;
      stackInd = relEndPos;
    }
    else 
    {
      endPos = absEndPos;
      stackInd = relEndPos + 1;
    }


    ExtendPartialRuleApplication(prevDottedRule, startPos, endPos, stackInd,
                                 dottedRuleCol);
  }

  // list of rules that cover the entire span
  DottedRuleList &rules = dottedRuleCol.Get(relEndPos + 1);

  // look up target sides for the rules
  DottedRuleList::const_iterator iterRule;
  for (iterRule = rules.begin(); iterRule != rules.end(); ++iterRule) {
    const DottedRuleInMemory &dottedRule = **iterRule;
    const PhraseDictionaryNodeSCFG &node = dottedRule.GetLastNode();

    // look up target sides
    const TargetPhraseCollection *tpc = node.GetTargetPhraseCollection();

    // add the fully expanded rule (with lexical target side)
    if (tpc != NULL) {
      AddCompletedRule(dottedRule, *tpc, range, outColl);
    }
  }

  dottedRuleCol.Clear(relEndPos+1);

  outColl.ShrinkToLimit();
}
// Given a partial rule application ending at startPos-1 and given the sets of
// source and target non-terminals covering the span [startPos, endPos],
// determines the full or partial rule applications that can be produced through
// extending the current rule application by a single non-terminal.
void ChartRuleLookupManagerMemory::ExtendPartialRuleApplication(
  const DottedRuleInMemory &prevDottedRule,
  size_t startPos,
  size_t endPos,
  size_t stackInd,
  DottedRuleColl & dottedRuleColl)
{
  // source non-terminal labels for the remainder
  const NonTerminalSet &sourceNonTerms =
    GetSentence().GetLabelSet(startPos, endPos);

  // target non-terminal labels for the remainder
  const ChartCellLabelSet &targetNonTerms =
    GetCellCollection().Get(WordsRange(startPos, endPos)).GetTargetLabelSet();

  // note where it was found in the prefix tree of the rule dictionary
  const PhraseDictionaryNodeSCFG &node = prevDottedRule.GetLastNode();

  const PhraseDictionaryNodeSCFG::NonTerminalMap & nonTermMap =
    node.GetNonTerminalMap();

  const size_t numChildren = nonTermMap.size();
  if (numChildren == 0) {
    return;
  }
  const size_t numSourceNonTerms = sourceNonTerms.size();
  const size_t numTargetNonTerms = targetNonTerms.GetSize();
  const size_t numCombinations = numSourceNonTerms * numTargetNonTerms;

  // We can search by either:
  //   1. Enumerating all possible source-target NT pairs that are valid for
  //      the span and then searching for matching children in the node,
  // or
  //   2. Iterating over all the NT children in the node, searching
  //      for each source and target NT in the span's sets.
  // We'll do whichever minimises the number of lookups:
  if (numCombinations <= numChildren*2) {

    // loop over possible source non-terminal labels (as found in input tree)
    NonTerminalSet::const_iterator p = sourceNonTerms.begin();
    NonTerminalSet::const_iterator sEnd = sourceNonTerms.end();
    for (; p != sEnd; ++p) {
      const Word & sourceNonTerm = *p;

      // loop over possible target non-terminal labels (as found in chart)
      ChartCellLabelSet::const_iterator q = targetNonTerms.begin();
      ChartCellLabelSet::const_iterator tEnd = targetNonTerms.end();
      for (; q != tEnd; ++q) {
        const ChartCellLabel &cellLabel = q->second;

        // try to match both source and target non-terminal
        const PhraseDictionaryNodeSCFG * child =
          node.GetChild(sourceNonTerm, cellLabel.GetLabel());

        // nothing found? then we are done
        if (child == NULL) {
          continue;
        }

        // create new rule
#ifdef USE_BOOST_POOL
        DottedRuleInMemory *rule = m_dottedRulePool.malloc();
        new (rule) DottedRuleInMemory(*child, cellLabel, prevDottedRule);
#else
        DottedRuleInMemory *rule = new DottedRuleInMemory(*child, cellLabel,
                                                          prevDottedRule);
#endif
        dottedRuleColl.Add(stackInd, rule);
      }
    }
  } 
  else 
  {
    // loop over possible expansions of the rule
    PhraseDictionaryNodeSCFG::NonTerminalMap::const_iterator p;
    PhraseDictionaryNodeSCFG::NonTerminalMap::const_iterator end =
      nonTermMap.end();
    for (p = nonTermMap.begin(); p != end; ++p) {
      // does it match possible source and target non-terminals?
      const PhraseDictionaryNodeSCFG::NonTerminalMapKey &key = p->first;
      const Word &sourceNonTerm = key.first;
      if (sourceNonTerms.find(sourceNonTerm) == sourceNonTerms.end()) {
        continue;
      }
      const Word &targetNonTerm = key.second;
      const ChartCellLabel *cellLabel = targetNonTerms.Find(targetNonTerm);
      if (!cellLabel) {
        continue;
      }

      // create new rule
      const PhraseDictionaryNodeSCFG &child = p->second;
#ifdef USE_BOOST_POOL
      DottedRuleInMemory *rule = m_dottedRulePool.malloc();
      new (rule) DottedRuleInMemory(child, *cellLabel, prevDottedRule);
#else
      DottedRuleInMemory *rule = new DottedRuleInMemory(child, *cellLabel,
                                                        prevDottedRule);
#endif
      dottedRuleColl.Add(stackInd, rule);
    }
  }
}
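// Cost intuition for the branch above: matching a (source NT, target NT) pair
// against the node is one lookup, while scanning one map child needs roughly
// two lookups (one per NT set). So with, say, 5 source and 10 target labels
// (50 pairs) and 30 children (~60 lookups), enumerating the pairs wins; with
// only 10 children (~20 lookups) iterating the children wins.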
void SparseHieroReorderingFeature::EvaluateChart(
  const ChartHypothesis&  cur_hypo ,
  ScoreComponentCollection* accumulator) const
{
  // get index map for underlying hypotheses
  //const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
  //  cur_hypo.GetCurrTargetPhrase().GetAlignNonTerm().GetNonTermIndexMap();
  
  //The Huck features. For a rule with source side:
  //   abXcdXef
  //We first have to split into blocks:
  // ab X cd X ef
  //Then we extract features based in the boundary words of the neighbouring blocks
  //For the block pair, we use the right word of the left block, and the left 
  //word of the right block.

  //Need to get blocks, and their alignment. Each block has a word range (on the 
  // on the source), a non-terminal flag, and  a set of alignment points in the target phrase

  //We need to be able to map source word position to target word position, as
  //much as possible (don't need interior of non-terminals). The alignment info
  //objects just give us the mappings between *rule* positions. So if we can 
  //map source word position to source rule position, and target rule position
  //to target word position, then we can map right through.
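  //Worked example (hypothetical rule): for source side "ab X cd" the blocks
  //are (ab)(X)(cd); the pair (ab, X) contributes the right word of "ab" and
  //the left word of the X span, and the emitted feature looks like
  //"h_<leftWord>_<rightWord>_M" (monotone) or "..._S" (swapped), depending
  //on the relative target-side order.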

  size_t sourceStart = cur_hypo.GetCurrSourceRange().GetStartPos();
  size_t sourceSize = cur_hypo.GetCurrSourceRange().GetNumWordsCovered();

  vector<WordsRange> sourceNTSpans;
  for (size_t prevHypoId = 0; prevHypoId < cur_hypo.GetPrevHypos().size(); ++prevHypoId) {
    sourceNTSpans.push_back(cur_hypo.GetPrevHypo(prevHypoId)->GetCurrSourceRange());
  }
  //put in source order. Is this necessary?
  sort(sourceNTSpans.begin(), sourceNTSpans.end()); 
  //cerr << "Source NTs: ";
  //for (size_t i = 0; i < sourceNTSpans.size(); ++i) cerr << sourceNTSpans[i] << " ";
  //cerr << endl;

  typedef pair<WordsRange,bool> Block;//flag indicates NT
  vector<Block> sourceBlocks; 
  sourceBlocks.push_back(Block(cur_hypo.GetCurrSourceRange(),false));
  for (vector<WordsRange>::const_iterator i = sourceNTSpans.begin(); 
      i != sourceNTSpans.end(); ++i) {
    const WordsRange& prevHypoRange = *i;
    Block lastBlock = sourceBlocks.back();
    sourceBlocks.pop_back();
    //split this range into before NT, NT and after NT
    if (prevHypoRange.GetStartPos() > lastBlock.first.GetStartPos()) {
      sourceBlocks.push_back(Block(WordsRange(lastBlock.first.GetStartPos(),prevHypoRange.GetStartPos()-1),false));
    }
    sourceBlocks.push_back(Block(prevHypoRange,true));
    if (prevHypoRange.GetEndPos() < lastBlock.first.GetEndPos()) {
      sourceBlocks.push_back(Block(WordsRange(prevHypoRange.GetEndPos()+1,lastBlock.first.GetEndPos()), false));
    }
  }
  /*
  cerr << "Source Blocks: ";
  for (size_t i = 0; i < sourceBlocks.size(); ++i) cerr << sourceBlocks[i].first << " "
      << (sourceBlocks[i].second ? "NT" : "T") << " ";
  cerr << endl;
  */

  //Mapping from source word to target rule position
  vector<size_t> sourceWordToTargetRulePos(sourceSize);
  map<size_t,size_t> alignMap;
  alignMap.insert(
    cur_hypo.GetCurrTargetPhrase().GetAlignTerm().begin(),
    cur_hypo.GetCurrTargetPhrase().GetAlignTerm().end());
  alignMap.insert(
    cur_hypo.GetCurrTargetPhrase().GetAlignNonTerm().begin(),
    cur_hypo.GetCurrTargetPhrase().GetAlignNonTerm().end());
  //vector<size_t> alignMapTerm = cur_hypo.GetCurrTargetPhrase().GetAlignNonTerm()
  size_t sourceRulePos = 0;
  //cerr << "SW->RP ";
  for (vector<Block>::const_iterator sourceBlockIt = sourceBlocks.begin(); 
    sourceBlockIt != sourceBlocks.end(); ++sourceBlockIt) {
    for (size_t sourceWordPos = sourceBlockIt->first.GetStartPos();
      sourceWordPos <= sourceBlockIt->first.GetEndPos(); ++sourceWordPos) {
      sourceWordToTargetRulePos[sourceWordPos - sourceStart] = alignMap[sourceRulePos];
   //   cerr << sourceWordPos - sourceStart << "-" << alignMap[sourceRulePos] << " ";
      if (! sourceBlockIt->second) {
        //T
        ++sourceRulePos;
      }
    }
    if ( sourceBlockIt->second) {
      //NT
      ++sourceRulePos;
    }
  }
  //cerr << endl;

  //Iterate through block pairs
  const Sentence& sentence = 
    dynamic_cast<const Sentence&>(cur_hypo.GetManager().GetSource());
  //const TargetPhrase& targetPhrase = cur_hypo.GetCurrTargetPhrase();
  for (size_t i = 0; i < sourceBlocks.size()-1; ++i) {
    Block& leftSourceBlock = sourceBlocks[i];
    Block& rightSourceBlock = sourceBlocks[i+1];
    size_t sourceLeftBoundaryPos = leftSourceBlock.first.GetEndPos();
    size_t sourceRightBoundaryPos = rightSourceBlock.first.GetStartPos();
    const Word& sourceLeftBoundaryWord = sentence.GetWord(sourceLeftBoundaryPos);
    const Word& sourceRightBoundaryWord = sentence.GetWord(sourceRightBoundaryPos);
    sourceLeftBoundaryPos -= sourceStart;
    sourceRightBoundaryPos -= sourceStart;
    
    // Need to figure out where these map to on the target.
    size_t targetLeftRulePos = 
      sourceWordToTargetRulePos[sourceLeftBoundaryPos];
    size_t targetRightRulePos = 
      sourceWordToTargetRulePos[sourceRightBoundaryPos];

    bool isMonotone = true;
    if ((sourceLeftBoundaryPos < sourceRightBoundaryPos  &&
          targetLeftRulePos > targetRightRulePos) ||
      ((sourceLeftBoundaryPos > sourceRightBoundaryPos  &&
          targetLeftRulePos < targetRightRulePos)))
    {
      isMonotone = false;
    }
    stringstream buf;
    buf << "h_"; //sparse reordering, Huck
    if (m_type == SourceLeft || m_type == SourceCombined) {
      buf << GetFactor(sourceLeftBoundaryWord,m_sourceVocab,m_sourceFactor)->GetString();
      buf << "_";
    }
    if (m_type == SourceRight || m_type == SourceCombined) {
    buf << GetFactor(sourceRightBoundaryWord,m_sourceVocab,m_sourceFactor)->GetString();
      buf << "_";
    }
    buf << (isMonotone ? "M" : "S");
    accumulator->PlusEquals(this,buf.str(), 1);
  }
//  cerr << endl;
}
void ChartRuleLookupManagerMemory::GetChartRuleCollection(
    const WordsRange &range,
    bool adhereTableLimit,
    ChartTranslationOptionList &outColl)
{
  size_t relEndPos = range.GetEndPos() - range.GetStartPos();
  size_t absEndPos = range.GetEndPos();

  // MAIN LOOP. create list of nodes of target phrases

  ProcessedRuleColl &processedRuleCol = *m_processedRuleColls[range.GetStartPos()];
  const ProcessedRuleList &runningNodes = processedRuleCol.GetRunningNodes();
  // Note that runningNodes can be expanded as the loop runs (through calls to
  // ExtendPartialRuleApplication()).
  for (size_t ind = 0; ind < runningNodes.size(); ++ind) {
    const ProcessedRule &prevProcessedRule = *runningNodes[ind];
    const PhraseDictionaryNodeSCFG &prevNode = prevProcessedRule.GetLastNode();
    const WordConsumed *prevWordConsumed = prevProcessedRule.GetLastWordConsumed();
    size_t startPos = (prevWordConsumed == NULL) ? range.GetStartPos() : prevWordConsumed->GetWordsRange().GetEndPos() + 1;

    // search for terminal symbol
    if (startPos == absEndPos) {
      const Word &sourceWord = GetSentence().GetWord(absEndPos);
      const PhraseDictionaryNodeSCFG *node = prevNode.GetChild(sourceWord);
      if (node != NULL) {
        WordConsumed *newWordConsumed = new WordConsumed(absEndPos, absEndPos,
                                                         sourceWord, prevWordConsumed);
        ProcessedRule *processedRule = new ProcessedRule(*node, newWordConsumed);
        processedRuleCol.Add(relEndPos+1, processedRule);
      }
    }

    // search for non-terminals
    size_t endPos, stackInd;
    if (startPos > absEndPos)
      continue;
    else if (startPos == range.GetStartPos() && range.GetEndPos() > range.GetStartPos()) {
      // rule starts at the beginning of the span: cover all but the last word
      endPos = absEndPos - 1;
      stackInd = relEndPos;
    } else {
      endPos = absEndPos;
      stackInd = relEndPos + 1;
    }

    const NonTerminalSet &sourceNonTerms =
      GetSentence().GetLabelSet(startPos, endPos);

    const NonTerminalSet &targetNonTerms =
      GetCellCollection().GetHeadwords(WordsRange(startPos, endPos));

    ExtendPartialRuleApplication(prevNode, prevWordConsumed, startPos,
                                 endPos, stackInd, sourceNonTerms,
                                 targetNonTerms, processedRuleCol);
  }

  // return list of target phrases
  ProcessedRuleList &nodes = processedRuleCol.Get(relEndPos + 1);

  size_t rulesLimit = StaticData::Instance().GetRuleLimit();
  ProcessedRuleList::const_iterator iterNode;
  for (iterNode = nodes.begin(); iterNode != nodes.end(); ++iterNode) {
    const ProcessedRule &processedRule = **iterNode;
    const PhraseDictionaryNodeSCFG &node = processedRule.GetLastNode();
    const WordConsumed *wordConsumed = processedRule.GetLastWordConsumed();
    assert(wordConsumed);

    const TargetPhraseCollection *targetPhraseCollection = node.GetTargetPhraseCollection();

    if (targetPhraseCollection != NULL) {
      outColl.Add(*targetPhraseCollection, *wordConsumed, adhereTableLimit, rulesLimit);
    }
  }
  outColl.CreateChartRules(rulesLimit);
}
/**
 * Process a sentence with xml annotation
 * Xml tags may specify additional/replacing translation options
 * and reordering constraints
 *
 * \param line in: sentence, out: sentence without the xml
 * \param res vector with translation options specified by xml
 * \param reorderingConstraint reordering constraint zones specified by xml
 * \param walls reordering constraint walls specified by xml
 */
bool TreeInput::ProcessAndStripXMLTags(string &line, std::vector<XMLParseOutput> &sourceLabels, std::vector<XmlOption*> &xmlOptions)
{
  //parse XML markup in translation line

  // no xml tag? we're done.
  if (line.find_first_of('<') == string::npos) {
    return true;
  }

  // break up input into a vector of xml tags and text
  // example: (this), (<b>), (is a), (</b>), (test .)
  vector<string> xmlTokens = TokenizeXml(line);

  // we need to store opened tags, until they are closed
  // tags are stored as triples (tagname, startpos, contents)
  typedef pair< string, pair< size_t, string > > OpenedTag;
  vector< OpenedTag > tagStack; // stack that contains active opened tags

  string cleanLine; // return string (text without xml)
  size_t wordPos = 0; // position in sentence (in terms of number of words)

  // keep this handy for later
  const vector<FactorType> &outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
  const string &factorDelimiter = StaticData::Instance().GetFactorDelimiter();

  // loop through the tokens
  for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) {
    // not a xml tag, but regular text (may contain many words)
    if(!isXmlTag(xmlTokens[xmlTokenPos])) {
      // add a space at boundary, if necessary
      if (cleanLine.size()>0 &&
          cleanLine[cleanLine.size() - 1] != ' ' &&
          xmlTokens[xmlTokenPos][0] != ' ') {
        cleanLine += " ";
      }
      cleanLine += xmlTokens[xmlTokenPos]; // add to output
      wordPos = Tokenize(cleanLine).size(); // count all the words
    }

    // process xml tag
    else {
      // *** get essential information about tag ***

      // strip extra boundary spaces and "<" and ">"
      string tag =  Trim(TrimXml(xmlTokens[xmlTokenPos]));
      VERBOSE(3,"XML TAG IS: " << tag << std::endl);

      if (tag.size() == 0) {
        TRACE_ERR("ERROR: empty tag name: " << line << endl);
        return false;
      }

      // check if unary (e.g., "<wall/>")
      bool isUnary = ( tag[tag.size() - 1] == '/' );

      // check if opening tag (e.g. "<a>", not "</a>")
      bool isClosed = ( tag[0] == '/' );
      bool isOpen = !isClosed;

      if (isClosed && isUnary) {
        TRACE_ERR("ERROR: can't have both closed and unary tag <" << tag << ">: " << line << endl);
        return false;
      }

      if (isClosed)
        tag = tag.substr(1); // remove "/" at the beginning
      if (isUnary)
        tag = tag.substr(0,tag.size()-1); // remove "/" at the end

      // find the tag name and contents
      string::size_type endOfName = tag.find_first_of(' ');
      string tagName = tag;
      string tagContent = "";
      if (endOfName != string::npos) {
        tagName = tag.substr(0,endOfName);
        tagContent = tag.substr(endOfName+1);
      }

      // *** process new tag ***

      if (isOpen || isUnary) {
        // put the tag on the tag stack
        OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) );
        tagStack.push_back( openedTag );
        VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl);
      }

      // *** process completed tag ***

      if (isClosed || isUnary) {
        // pop last opened tag from stack
        if (tagStack.size() == 0) {
          TRACE_ERR("ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl);
          return false;
        }
        OpenedTag openedTag = tagStack.back();
        tagStack.pop_back();

        // tag names have to match
        if (openedTag.first != tagName) {
          TRACE_ERR("ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl );
          return false;
        }

        // assemble remaining information about tag
        size_t startPos = openedTag.second.first;
        string tagContent = openedTag.second.second;
        size_t endPos = wordPos;

        // span attribute overwrites position
        string span = ParseXmlTagAttribute(tagContent,"span");
        if (! span.empty()) {
          vector<string> ij = Tokenize(span, "-");
          if (ij.size() != 1 && ij.size() != 2) {
            TRACE_ERR("ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl);
            return false;
          }
          startPos = atoi(ij[0].c_str());
          if (ij.size() == 1) endPos = startPos + 1;
          else endPos = atoi(ij[1].c_str()) + 1;
        }

        VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl);

        if (startPos >= endPos) {
          TRACE_ERR("ERROR: tag " << tagName << " must span at least one word: " << line << endl);
          return false;
        }

        // may be either a input span label ("label"), or a specified output translation "translation"
        string label = ParseXmlTagAttribute(tagContent,"label");
        string translation = ParseXmlTagAttribute(tagContent,"translation");

        // specified label
        if (translation.length() == 0 && label.length() > 0) {
          WordsRange range(startPos,endPos-1); // really?
          XMLParseOutput item(label, range);
          sourceLabels.push_back(item);
        }

        // specified translations -> vector of phrases, separated by "||"
        if (translation.length() > 0 && StaticData::Instance().GetXmlInputType() != XmlIgnore) {
          vector<string> altTexts = TokenizeMultiCharSeparator(translation, "||");
          vector<string> altLabel = TokenizeMultiCharSeparator(label, "||");
          vector<string> altProbs = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"prob"), "||");
          //TRACE_ERR("number of translations: " << altTexts.size() << endl);
          for (size_t i=0; i<altTexts.size(); ++i) {
            // set target phrase
            TargetPhrase targetPhrase;
            targetPhrase.CreateFromString(Output, outputFactorOrder,altTexts[i],factorDelimiter, NULL);

            // set constituent label
            string targetLHSstr;
            if (altLabel.size() > i && altLabel[i].size() > 0) {
              targetLHSstr = altLabel[i];
            } else {
              const UnknownLHSList &lhsList = StaticData::Instance().GetUnknownLHS();
              UnknownLHSList::const_iterator iterLHS = lhsList.begin();
              targetLHSstr = iterLHS->first;
            }
            Word *targetLHS = new Word(true);
            targetLHS->CreateFromString(Output, outputFactorOrder, targetLHSstr, true);
            CHECK(targetLHS->GetFactor(0) != NULL);
            targetPhrase.SetTargetLHS(targetLHS);

            // not tested
            Phrase sourcePhrase = this->GetSubString(WordsRange(startPos,endPos-1));

            // get probability
            float probValue = 1;
            if (altProbs.size() > i && altProbs[i].size() > 0) {
              probValue = Scan<float>(altProbs[i]);
            }
            // convert from prob to log-prob
            float scoreValue = FloorScore(TransformScore(probValue));
            targetPhrase.SetXMLScore(scoreValue);
            targetPhrase.Evaluate(sourcePhrase);

            // set span and create XmlOption
            WordsRange range(startPos+1,endPos);
            XmlOption *option = new XmlOption(range,targetPhrase);
            CHECK(option);
            xmlOptions.push_back(option);

            VERBOSE(2,"xml translation = [" << range << "] " << targetLHSstr << " -> " << altTexts[i] << " prob: " << probValue << endl);
          }
          altTexts.clear();
          altProbs.clear();
        }
      }
    }
  }
  // we are done. check if there are tags that are still open
  if (tagStack.size() > 0) {
    TRACE_ERR("ERROR: some opened tags were never closed: " << line << endl);
    return false;
  }

  // return de-xml'ed sentence in line
  line = cleanLine;
  return true;
}
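// Example (hypothetical) annotated input lines:
//   <tree label="NP" span="0-1"/> das haus ist klein
// adds an XMLParseOutput labelling source span [0,1] as NP, while
//   das <x translation="house||home" prob="0.8||0.2"> haus </x> ist klein
// produces two XmlOptions for that span, with XML scores derived from the
// log of each supplied probability.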
TargetPhraseVectorPtr PhraseDecoder::DecodeCollection(
  TargetPhraseVectorPtr tpv, BitWrapper<> &encodedBitStream,
  const Phrase &sourcePhrase, bool topLevel)
{
  
  bool extending = tpv->size();
  size_t bitsLeft = encodedBitStream.TellFromEnd();
    
  typedef std::pair<size_t, size_t> AlignPointSizeT;
  
  std::vector<int> sourceWords;
  if(m_coding == REnc)
  {
    for(size_t i = 0; i < sourcePhrase.GetSize(); i++)
    {
      std::string sourceWord
        = sourcePhrase.GetWord(i).GetString(*m_input, false);
      unsigned idx = GetSourceSymbolId(sourceWord);
      sourceWords.push_back(idx);
    }
  }
  
  unsigned phraseStopSymbol = 0;
  AlignPoint alignStopSymbol(-1, -1);
  
  std::vector<float> scores;
  std::set<AlignPointSizeT> alignment;
  
  enum DecodeState { New, Symbol, Score, Alignment, Add } state = New;
  
  size_t srcSize = sourcePhrase.GetSize();
  
  TargetPhrase* targetPhrase = NULL;
  while(encodedBitStream.TellFromEnd())
  {
     
    if(state == New)
    {
      // Creating new TargetPhrase on the heap
      tpv->push_back(TargetPhrase(Output));
      targetPhrase = &tpv->back();
      
      targetPhrase->SetSourcePhrase(sourcePhrase);
      alignment.clear();
      scores.clear();
        
      state = Symbol;
    }
    
    if(state == Symbol)
    {
      unsigned symbol = m_symbolTree->Read(encodedBitStream);      
      if(symbol == phraseStopSymbol)
      {
        state = Score;
      }
      else
      {
        if(m_coding == REnc)
        {
          std::string wordString;
          size_t type = GetREncType(symbol);
          
          if(type == 1)
          {
            unsigned decodedSymbol = DecodeREncSymbol1(symbol);
            wordString = GetTargetSymbol(decodedSymbol);
          }
          else if (type == 2)
          {
            size_t rank = DecodeREncSymbol2Rank(symbol);
            size_t srcPos = DecodeREncSymbol2Position(symbol);
            
            if(srcPos >= sourceWords.size())
              return TargetPhraseVectorPtr();  
            
            wordString = GetTargetSymbol(GetTranslation(sourceWords[srcPos], rank));
            if(m_phraseDictionary.m_useAlignmentInfo)
            {
              size_t trgPos = targetPhrase->GetSize();
              alignment.insert(AlignPoint(srcPos, trgPos));
            }
          }
          else if(type == 3)
          {
            size_t rank = DecodeREncSymbol3(symbol);
            size_t srcPos = targetPhrase->GetSize();
            
            if(srcPos >= sourceWords.size())
              return TargetPhraseVectorPtr();  
                            
            wordString = GetTargetSymbol(GetTranslation(sourceWords[srcPos], rank));   
            if(m_phraseDictionary.m_useAlignmentInfo)
            {
              size_t trgPos = srcPos;
              alignment.insert(AlignPoint(srcPos, trgPos));
            }
          }
          
          Word word;
          word.CreateFromString(Output, *m_output, wordString, false);
          targetPhrase->AddWord(word);
        }
        else if(m_coding == PREnc)
        {
          // if the symbol is just a word
          if(GetPREncType(symbol) == 1)
          {
            unsigned decodedSymbol = DecodePREncSymbol1(symbol);
     
            Word word;
            word.CreateFromString(Output, *m_output,
                                  GetTargetSymbol(decodedSymbol), false);
            targetPhrase->AddWord(word);
          }
          // if the symbol is a subphrase pointer
          else
          {
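            // a sub-phrase pointer encodes its source span relative to the
            // decode so far: 'left' is an offset from the current target
            // length, 'right' counts back from the end of the source phrase,
            // and 'rank' selects among the span's translations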
            int left = DecodePREncSymbol2Left(symbol);
            int right = DecodePREncSymbol2Right(symbol);
            unsigned rank = DecodePREncSymbol2Rank(symbol);
            
            int srcStart = left + targetPhrase->GetSize();
            int srcEnd   = srcSize - right - 1;
            
            // consistency check: an invalid source span signals a false-positive phrase match
            if(0 > srcStart || srcStart > srcEnd || unsigned(srcEnd) >= srcSize)
              return TargetPhraseVectorPtr();
            
            // consistency check: an out-of-range rank signals a false-positive phrase match
            if(m_maxRank && rank > m_maxRank)
                return TargetPhraseVectorPtr();
            
            // by default the sub-phrase collection is the current collection itself
            TargetPhraseVectorPtr subTpv = tpv;
            
            // if the span is smaller than the full source phrase, retrieve its collection
            if(unsigned(srcEnd - srcStart + 1) != srcSize)
            {
              Phrase subPhrase = sourcePhrase.GetSubString(WordsRange(srcStart, srcEnd));
              subTpv = CreateTargetPhraseCollection(subPhrase, false);
            }
            
            // consistency check: a missing sub-phrase or rank signals a false positive
            if(subTpv != NULL && rank < subTpv->size())
            {
              // insert the subphrase into the main target phrase
              TargetPhrase& subTp = subTpv->at(rank);
              if(m_phraseDictionary.m_useAlignmentInfo)
              {
                // reconstruct the alignment data based on the alignment of the subphrase
                for(AlignmentInfo::const_iterator it = subTp.GetAlignmentInfo().begin();
                    it != subTp.GetAlignmentInfo().end(); it++)
                {
                  alignment.insert(AlignPointSizeT(srcStart + it->first,
                                                   targetPhrase->GetSize() + it->second));
                }
              }
              targetPhrase->Append(subTp);
            }
            else 
              return TargetPhraseVectorPtr();
          }
        }
        else
        {
          Word word;
          word.CreateFromString(Output, *m_output,
                                GetTargetSymbol(symbol), false);
          targetPhrase->AddWord(word);
        }
      }
    }
    else if(state == Score)
    {
      size_t idx = m_multipleScoreTrees ? scores.size() : 0;
      float score = m_scoreTrees[idx]->Read(encodedBitStream);
      scores.push_back(score);
      
      if(scores.size() == m_numScoreComponent)
      {
        targetPhrase->SetScore(m_feature, scores, ScoreComponentCollection() /*sparse*/,*m_weight, m_weightWP, *m_languageModels);
        
        if(m_containsAlignmentInfo)
          state = Alignment;
        else
          state = Add;
      }
    }
    else if(state == Alignment)
    {
      AlignPoint alignPoint = m_alignTree->Read(encodedBitStream);
      if(alignPoint == alignStopSymbol)
      {
        state = Add;
      }
      else
      {
        if(m_phraseDictionary.m_useAlignmentInfo)  
          alignment.insert(AlignPointSizeT(alignPoint));
      }
    }
    
    if(state == Add)
    {
      if(m_phraseDictionary.m_useAlignmentInfo)
        targetPhrase->SetAlignmentInfo(alignment);
      
      if(m_coding == PREnc)
      {
        if(!m_maxRank || tpv->size() <= m_maxRank)
          bitsLeft = encodedBitStream.TellFromEnd();
        
        if(!topLevel && m_maxRank && tpv->size() >= m_maxRank)
          break;
      }
      
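      // less than one byte left in the stream: only padding bits remain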
      if(encodedBitStream.TellFromEnd() <= 8)
        break;
      
      state = New;
    }    
  }
  
  if(m_coding == PREnc && !extending)
  {
    bitsLeft = bitsLeft > 8 ? bitsLeft : 0;
    m_decodingCache.Cache(sourcePhrase, tpv, bitsLeft, m_maxRank);
  }
  
  return tpv;
}
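/*
 * The decode loop above is driven as a four-state machine over the
 * compressed bit stream. A minimal sketch of that control flow is shown
 * below, replayed over a plain token stream instead of a BitWrapper and
 * Huffman-coded symbols; all names here are illustrative, not part of
 * the Moses API.
 */
#include <cstddef>
#include <cstdio>
#include <queue>

enum Token { PAYLOAD, STOP, ALIGN_STOP };

static void DecodeSketch(std::queue<Token> stream, std::size_t numScores)
{
  enum State { New, Symbol, Score, Alignment, Add } state = New;
  std::size_t words = 0, scores = 0, aligns = 0, phrases = 0;

  while (!stream.empty()) {
    // New initialises per-phrase bookkeeping and falls through to Symbol.
    if (state == New) {
      words = scores = aligns = 0;
      state = Symbol;
    }

    Token t = stream.front();
    stream.pop();

    if (state == Symbol) {
      if (t == STOP) state = Score;        // phrase stop symbol
      else ++words;                        // target word
    } else if (state == Score) {
      ++scores;                            // one score per component
      if (scores == numScores) state = Alignment;
    } else if (state == Alignment) {
      if (t == ALIGN_STOP) state = Add;    // alignment stop symbol
      else ++aligns;                       // alignment point
    }

    // Add finalises the current phrase and restarts the machine.
    if (state == Add) {
      std::printf("phrase %zu: %zu words, %zu scores, %zu align points\n",
                  ++phrases, words, scores, aligns);
      state = New;
    }
  }
}

int main()
{
  // one phrase: two words, stop, two scores, one align point, align stop
  Token seq[] = { PAYLOAD, PAYLOAD, STOP, PAYLOAD, PAYLOAD,
                  PAYLOAD, ALIGN_STOP };
  std::queue<Token> stream;
  for (std::size_t i = 0; i < sizeof(seq) / sizeof(seq[0]); ++i)
    stream.push(seq[i]);
  DecodeSketch(stream, 2);
  return 0;
}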
FFState* BleuScoreFeature::EvaluateChart(const ChartHypothesis& cur_hypo, int featureID,
        ScoreComponentCollection* accumulator ) const
{
    if (!m_enabled) return new BleuScoreState();

    NGrams::const_iterator reference_ngrams_iter;

    const Phrase& curr_target_phrase = static_cast<const Phrase&>(cur_hypo.GetCurrTargetPhrase());
//  cerr << "\nCur target phrase: " << cur_hypo.GetTargetLHS() << " --> " << curr_target_phrase << endl;

    // Calculate old bleu of previous states
    float old_bleu = 0, new_bleu = 0;
    size_t num_old_words = 0, num_words_first_prev = 0;
    size_t num_words_added_left = 0, num_words_added_right = 0;

    // a chart hypothesis combines at most two previous hypotheses
    assert(cur_hypo.GetPrevHypos().size() <= 2);
    BleuScoreState* new_state;
    if (cur_hypo.GetPrevHypos().size() == 0)
        new_state = new BleuScoreState();
    else {
        const FFState* prev_state_zero = cur_hypo.GetPrevHypo(0)->GetFFState(featureID);
        const BleuScoreState& ps_zero = dynamic_cast<const BleuScoreState&>(*prev_state_zero);
        new_state = new BleuScoreState(ps_zero);
        num_words_first_prev = ps_zero.m_target_length;

        for (size_t i = 0; i < cur_hypo.GetPrevHypos().size(); ++i) {
            const FFState* prev_state = cur_hypo.GetPrevHypo(i)->GetFFState(featureID);
            const BleuScoreState* ps = dynamic_cast<const BleuScoreState*>(prev_state);
            BleuScoreState* ps_nonConst = const_cast<BleuScoreState*>(ps);
//  		cerr << "prev phrase: " << cur_hypo.GetPrevHypo(i)->GetOutputPhrase()
//  				<< " ( " << cur_hypo.GetPrevHypo(i)->GetTargetLHS() << ")" << endl;

            old_bleu += CalculateBleu(ps_nonConst);
            num_old_words += ps->m_target_length;

            if (i > 0)
                // add ngram matches from other previous states
                new_state->AddNgramCountAndMatches(ps_nonConst->m_ngram_counts, ps_nonConst->m_ngram_matches);
        }
    }

    // check if we are already done (don't add <s> and </s>)
    size_t numWordsCovered = cur_hypo.GetCurrSourceRange().GetNumWordsCovered();
    if (numWordsCovered == m_cur_source_length) {
        // the BLEU score stays the same; nothing to add to the accumulator
        //accumulator->PlusEquals(this, 0);
        return new_state;
    }

    // set new context
    Phrase new_words = cur_hypo.GetOutputPhrase();
    new_state->m_words = new_words;
    size_t num_curr_words = new_words.GetSize();

    // get ngram matches for new words
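    // three cases follow: (1) a fresh hypothesis scores all of its words;
    // (2) a glue rule joins two hypotheses, so only ngrams spanning the
    // join point are new; (3) a rule with one non-terminal adds words to
    // the left and/or right of a previous hypothesis, so only those edges
    // need rescoring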
    if (num_old_words == 0) {
//  	cerr << "compute right ngram context" << endl;
        GetNgramMatchCounts(new_words,
                            m_cur_ref_ngrams,
                            new_state->m_ngram_counts,
                            new_state->m_ngram_matches,
                            0);
    } else if (new_words.GetSize() == num_old_words) {
        // two hypotheses were glued together; compute new ngrams around the join point after the first hypothesis
        num_words_added_right = num_curr_words - num_words_first_prev;
        // score around overlap point
//  	cerr << "compute overlap ngram context (" << (num_words_first_prev) << ")" << endl;
        GetNgramMatchCounts_overlap(new_words,
                                    m_cur_ref_ngrams,
                                    new_state->m_ngram_counts,
                                    new_state->m_ngram_matches,
                                    num_words_first_prev);
    } else if (num_old_words + curr_target_phrase.GetNumTerminals() == num_curr_words) {
        assert(curr_target_phrase.GetSize() == curr_target_phrase.GetNumTerminals()+1);
        // a previous hypothesis was combined with a rule containing one non-terminal (the NT is substituted by the hypothesis' terminals)
        for (size_t i = 0; i < curr_target_phrase.GetSize(); ++i)
            if (curr_target_phrase.GetWord(i).IsNonTerminal()) {
                num_words_added_left = i;
                num_words_added_right = curr_target_phrase.GetSize() - (i+1);
                break;
            }

        // left context
//  	cerr << "compute left ngram context" << endl;
        if (num_words_added_left > 0)
            GetNgramMatchCounts_prefix(new_words,
                                       m_cur_ref_ngrams,
                                       new_state->m_ngram_counts,
                                       new_state->m_ngram_matches,
                                       num_words_added_left,
                                       num_curr_words - num_words_added_right - 1);

        // right context
//  	cerr << "compute right ngram context" << endl;
        if (num_words_added_right > 0)
            GetNgramMatchCounts(new_words,
                                m_cur_ref_ngrams,
                                new_state->m_ngram_counts,
                                new_state->m_ngram_matches,
                                num_words_added_left + num_old_words);
    } else {
        cerr << "undefined state.. " << endl;
        exit(1);
    }

    // Update state variables
    size_t ctx_start_idx = 0;
    size_t ctx_end_idx = new_words.GetSize()-1;
    size_t bleu_context_length = BleuScoreState::bleu_order -1;
    if (ctx_end_idx > bleu_context_length) {
        ctx_start_idx = ctx_end_idx - bleu_context_length;
    }

    new_state->m_source_length = cur_hypo.GetCurrSourceRange().GetNumWordsCovered();
    new_state->m_words = new_words.GetSubString(WordsRange(ctx_start_idx, ctx_end_idx));
    new_state->m_target_length = cur_hypo.GetOutputPhrase().GetSize();

    // we need a scaled reference length to compare the current target phrase to the corresponding
    // reference phrase
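    // e.g. covering 12 of 24 source words against a reference of length 20
    // yields a scaled reference length of 20 * 12/24 = 10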
    new_state->m_scaled_ref_length = m_cur_ref_length *
                                     (float(new_state->m_source_length) / m_cur_source_length);

    // Calculate new bleu.
    new_bleu = CalculateBleu(new_state);

    // Set score to new Bleu score
    accumulator->PlusEquals(this, new_bleu - old_bleu);
    return new_state;
}
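/*
 * CalculateBleu itself is not shown in this excerpt. The sketch below
 * gives the usual smoothed sentence-level BLEU computed from a state's
 * per-order ngram counts and matches: add-one smoothed precisions are
 * combined as a geometric mean and multiplied by a brevity penalty
 * derived from the scaled reference length. This illustrates the formula
 * only; it is not the exact Moses implementation.
 */
#include <cmath>
#include <cstddef>
#include <vector>

static float SmoothedBleu(const std::vector<float> &matches,
                          const std::vector<float> &counts,
                          float targetLength, float scaledRefLength)
{
  if (counts.empty() || targetLength <= 0.0f) return 0.0f;

  float logPrecision = 0.0f;
  for (std::size_t n = 0; n < counts.size(); ++n) {
    // add-one smoothing keeps zero-match orders from zeroing the score
    logPrecision += std::log((matches[n] + 1.0f) / (counts[n] + 1.0f));
  }
  logPrecision /= counts.size();   // geometric mean over ngram orders

  // brevity penalty: penalise hypotheses shorter than the reference
  float bp = targetLength < scaledRefLength
             ? std::exp(1.0f - scaledRefLength / targetLength)
             : 1.0f;
  return bp * std::exp(logPrecision);
}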
Example #24
// Create the InputTree::Node objects but do not connect them.
void InputTreeBuilder::CreateNodes(const TreeInput &in,
                                   const std::string &topLevelLabel,
                                   InputTree &out)
{
  // Get the input sentence word count.  This includes the <s> and </s> symbols.
  const std::size_t numWords = in.GetSize();

  // Get the parse tree non-terminal nodes.  The parse tree covers the original
  // sentence only, not the <s> and </s> symbols, so at this point there is
  // no top-level node.
  std::vector<XMLParseOutput> xmlNodes = in.GetLabelledSpans();

  // Sort the XML nodes into post-order.  Prior to sorting they will be in the
  // order that TreeInput created them.  Usually that will be post-order, but
  // if, for example, the tree was binarized by relax-parse then it won't be.
  // In all cases, we assume that if two nodes cover the same span then the
  // first one is the lowest.
  SortXmlNodesIntoPostOrder(xmlNodes);

  // Copy the parse tree non-terminal nodes, but offset the ranges by 1 (to
  // allow for the <s> symbol at position 0).
  std::vector<XMLParseOutput> nonTerms;
  nonTerms.reserve(xmlNodes.size()+1);
  for (std::vector<XMLParseOutput>::const_iterator p = xmlNodes.begin();
       p != xmlNodes.end(); ++p) {
    std::size_t start = p->m_range.GetStartPos();
    std::size_t end = p->m_range.GetEndPos();
    nonTerms.push_back(XMLParseOutput(p->m_label, WordsRange(start+1, end+1)));
  }
  // Add a top-level node that also covers <s> and </s>.
  nonTerms.push_back(XMLParseOutput(topLevelLabel, WordsRange(0, numWords-1)));

  // Allocate space for the InputTree nodes.  In the case of out.nodes, this
  // step is essential because once created the PVertex objects must not be
  // moved around (through vector resizing) because InputTree keeps pointers
  // to them.
  out.nodes.reserve(numWords + nonTerms.size());
  out.nodesAtPos.resize(numWords);

  // Create the InputTree::Node objects.
  int prevStart = -1;
  int prevEnd = -1;
  for (std::vector<XMLParseOutput>::const_iterator p = nonTerms.begin();
       p != nonTerms.end(); ++p) {
    int start = static_cast<int>(p->m_range.GetStartPos());
    int end = static_cast<int>(p->m_range.GetEndPos());

    // Check if we've started ascending a new subtree.
    if (start != prevStart && end != prevEnd) {
      // Add a node for each terminal to the left of or below the first
      // nonTerm child of the subtree.
      for (int i = prevEnd+1; i <= end; ++i) {
        PVertex v(WordsRange(i, i), in.GetWord(i));
        out.nodes.push_back(InputTree::Node(v));
        out.nodesAtPos[i].push_back(&out.nodes.back());
      }
    }
    // Add a node for the non-terminal.
    Word w(true);
    w.CreateFromString(Moses::Output, m_outputFactorOrder, p->m_label, true);
    PVertex v(WordsRange(start, end), w);
    out.nodes.push_back(InputTree::Node(v));
    out.nodesAtPos[start].push_back(&out.nodes.back());

    prevStart = start;
    prevEnd = end;
  }
}
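/*
 * SortXmlNodesIntoPostOrder is not shown in this excerpt. A plausible
 * sketch of such a comparator: in post-order a span must precede any span
 * that properly contains it, so sort by end position ascending and, on
 * equal ends, by start position descending; std::stable_sort keeps
 * identical spans in their original order, matching the "first one is the
 * lowest" assumption above. Illustrative only, not the Moses code.
 */
#include <algorithm>
#include <cstddef>
#include <vector>

struct Span {
  std::size_t start, end;
};

static bool PostOrderBefore(const Span &a, const Span &b)
{
  if (a.end != b.end) return a.end < b.end;  // earlier-ending spans first
  return a.start > b.start;                  // inner spans before outer
}

static void SortSpansIntoPostOrder(std::vector<Span> &spans)
{
  std::stable_sort(spans.begin(), spans.end(), PostOrderBefore);
}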