Example #1
double PhraseDictionaryMultiModelCounts::ComputeWeightedLexicalTranslation( const Phrase &phraseS, const Phrase &phraseT, AlignVector &alignment, const vector<lexicalTable*> &tables, vector<float> &multimodelweights, bool is_input) const
{
  // lexical translation probability

  double lexScore = 1.0;
  Word null;
  if (is_input) {
    null.CreateFromString(Input, m_input, "NULL", false);
  } else {
    null.CreateFromString(Output, m_output, "NULL", false);
  }

  // all target words have to be explained
  for(size_t ti=0; ti<alignment.size(); ti++) {
    const set< size_t > & srcIndices = alignment[ ti ];
    Word t_word = phraseT.GetWord(ti);

    if (srcIndices.empty()) {
      // explain unaligned word by NULL
      lexScore *= GetLexicalProbability( null, t_word, tables, multimodelweights );
    } else {
      // go through all the aligned words to compute average
      double thisWordScore = 0;
      for (set< size_t >::const_iterator si(srcIndices.begin()); si != srcIndices.end(); ++si) {
        Word s_word = phraseS.GetWord(*si);
        thisWordScore += GetLexicalProbability( s_word, t_word, tables, multimodelweights );
      }
      lexScore *= thisWordScore / srcIndices.size();
    }
  }
  return lexScore;
}
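This is the standard lexical weighting computation: each target word is explained either by NULL (when unaligned) or by the average of the lexical probabilities over its aligned source words, and the per-word values are multiplied into a single phrase score. Below is a minimal standalone sketch of the same averaging scheme, assuming a toy dense probability table in place of the Moses lexicalTable lookups; WeightedLexScore and its arguments are hypothetical names, not Moses API.

#include <cstddef>
#include <set>
#include <vector>

// Sketch only: w[s][t] plays the role of GetLexicalProbability(s, t, ...),
// with row 0 reserved for the NULL word. Toy ids, not Moses Word objects.
double WeightedLexScore(const std::vector<std::set<size_t> > &alignment,
                        const std::vector<std::vector<double> > &w,
                        const std::vector<size_t> &srcIds,
                        const std::vector<size_t> &tgtIds)
{
  double lexScore = 1.0;
  for (size_t ti = 0; ti < alignment.size(); ++ti) {
    const std::set<size_t> &srcIndices = alignment[ti];
    if (srcIndices.empty()) {
      lexScore *= w[0][tgtIds[ti]];  // unaligned target: explain by NULL
    } else {
      double sum = 0.0;
      for (std::set<size_t>::const_iterator si = srcIndices.begin();
           si != srcIndices.end(); ++si)
        sum += w[srcIds[*si]][tgtIds[ti]];
      lexScore *= sum / srcIndices.size();  // average over aligned sources
    }
  }
  return lexScore;
}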
IPhrase LexicalReorderingTableTree::MakeTableKey(const Phrase& f,
    const Phrase& e) const
{
  IPhrase key;
  std::vector<std::string> keyPart;
  if(!m_FactorsF.empty()) {
    for(size_t i = 0; i < f.GetSize(); ++i) {
      /* old code
        std::string s = f.GetWord(i).ToString(m_FactorsF);
        keyPart.push_back(s.substr(0,s.size()-1));
        */
      keyPart.push_back(f.GetWord(i).GetString(m_FactorsF, false));
    }
    auxAppend(key, m_Table->ConvertPhrase(keyPart, SourceVocId));
    keyPart.clear();
  }
  if(!m_FactorsE.empty()) {
    if(!key.empty()) {
      key.push_back(PrefixTreeMap::MagicWord);
    }
    for(size_t i = 0; i < e.GetSize(); ++i) {
      /* old code
        std::string s = e.GetWord(i).ToString(m_FactorsE);
        keyPart.push_back(s.substr(0,s.size()-1));
        */
      keyPart.push_back(e.GetWord(i).GetString(m_FactorsE, false));
    }
    auxAppend(key, m_Table->ConvertPhrase(keyPart,TargetVocId));
    //keyPart.clear();
  }
  return key;
}
Example #3
/**
 * Pre-calculate the n-gram probabilities for the words in the specified phrase.
 *
 * Note that when this method is called, we do not have access to the context
 * in which this phrase will eventually be applied.
 *
 * In other words, we know what words are in this phrase,
 * but we do not know what words will come before or after this phrase.
 *
 * The parameters fullScore, ngramScore, and oovCount are all output parameters.
 *
 * The value stored in oovCount is the number of words in the phrase
 * that are not in the language model's vocabulary.
 *
 * The sum of the ngram scores for all words in this phrase is stored in fullScore.
 *
 * The value stored in ngramScore is similar, but only full-order ngram scores are included.
 *
 * This is best shown by example:
 *
 * Assume a trigram backward language model and a phrase "a b c d e f g"
 *
 * fullScore would represent the sum of the logprob scores for the following values:
 *
 * p(g)
 * p(f | g)
 * p(e | g f)
 * p(d | f e)
 * p(c | e d)
 * p(b | d c)
 * p(a | c b)
 *
 * ngramScore would represent the sum of the logprob scores for the following values:
 *
 * p(e | g f)
 * p(d | f e)
 * p(c | e d)
 * p(b | d c)
 * p(a | c b)
 */
template <class Model> void BackwardLanguageModel<Model>::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const
{
  fullScore = 0;
  ngramScore = 0;
  oovCount = 0;

  if (!phrase.GetSize()) return;

  lm::ngram::ChartState discarded_sadly;
  lm::ngram::RuleScore<Model> scorer(*m_ngram, discarded_sadly);

  UTIL_THROW_IF(
    (m_beginSentenceFactor == phrase.GetWord(0).GetFactor(m_factorType)),
    util::Exception,
    "BackwardLanguageModel does not currently support rules that include <s>"
  );

  float before_boundary = 0.0f;

  int lastWord = phrase.GetSize() - 1;
  int ngramBoundary = m_ngram->Order() - 1;
  int boundary = ( lastWord < ngramBoundary ) ? 0 : ngramBoundary;

  int position;
  for (position = lastWord; position >= 0; position-=1) {
    const Word &word = phrase.GetWord(position);
    UTIL_THROW_IF(
      (word.IsNonTerminal()),
      util::Exception,
      "BackwardLanguageModel does not currently support rules that include non-terminals "
    );

    lm::WordIndex index = TranslateID(word);
    scorer.Terminal(index);
    if (!index) ++oovCount;

    if (position==boundary) {
      before_boundary = scorer.Finish();
    }

  }

  fullScore = scorer.Finish();

  ngramScore = TransformLMScore(fullScore - before_boundary);
  fullScore = TransformLMScore(fullScore);

}
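A note on the last two lines: the subtraction happens in the raw KenLM score domain first, and TransformLMScore then rescales into the natural-log domain Moses uses internally (to my understanding a fixed ln(10) factor, since KenLM reports log10 probabilities). A toy illustration with made-up raw scores; ToNaturalLog is a hypothetical stand-in for the Moses helper.

#include <iostream>

// Stand-in for Moses' TransformLMScore; the ln(10) factor is an assumption
// based on KenLM reporting log10 probabilities.
inline float ToNaturalLog(float log10Score) { return log10Score * 2.30258509f; }

int main()
{
  // Hypothetical values for the two scorer.Finish() calls above.
  float full = -4.2f;             // all words of the phrase
  float before_boundary = -3.1f;  // words scored down to the boundary position
  float ngramScore = ToNaturalLog(full - before_boundary);
  float fullScore = ToNaturalLog(full);
  std::cout << ngramScore << " " << fullScore << std::endl;  // ~ -2.533 -9.671
  return 0;
}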
Example #4
size_t Phrase::Find(const Phrase &sought, int maxUnknown) const
{
  if (GetSize() < sought.GetSize()) {
    // sought phrase too big
    return NOT_FOUND;
  }

  size_t maxStartPos = GetSize() - sought.GetSize();
  for (size_t startThisPos = 0; startThisPos <= maxStartPos; ++startThisPos) {
    size_t thisPos = startThisPos;
    int currUnknowns = 0;
    size_t soughtPos;
    for (soughtPos = 0; soughtPos < sought.GetSize(); ++soughtPos) {
      const Word &soughtWord = sought.GetWord(soughtPos);
      const Word &thisWord = GetWord(thisPos);

      if (soughtWord == thisWord) {
        ++thisPos;
      } else if (soughtWord.IsOOV() && (maxUnknown < 0 || currUnknowns < maxUnknown)) {
        // the output has an OOV word. Allow a certain number of OOVs
        ++currUnknowns;
        ++thisPos;
      } else {
        break;
      }
    }

    if (soughtPos == sought.GetSize()) {
      return startThisPos;
    }
  }

  return NOT_FOUND;
}
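Find is a windowed subsequence match that tolerates a bounded number of mismatches at OOV positions in the sought phrase (a negative maxUnknown means unlimited). The same policy can be sketched over plain strings; here "<unk>" is a hypothetical OOV marker standing in for Word::IsOOV().

#include <cstddef>
#include <string>
#include <vector>

const size_t NOT_FOUND = static_cast<size_t>(-1);

// Sketch of the Phrase::Find policy over plain strings.
size_t FindWithOOV(const std::vector<std::string> &haystack,
                   const std::vector<std::string> &sought, int maxUnknown)
{
  if (haystack.size() < sought.size()) return NOT_FOUND;
  for (size_t start = 0; start + sought.size() <= haystack.size(); ++start) {
    int unknowns = 0;
    size_t i = 0;
    for (; i < sought.size(); ++i) {
      if (sought[i] == haystack[start + i]) continue;  // exact match
      if (sought[i] == "<unk>" && (maxUnknown < 0 || unknowns < maxUnknown)) {
        ++unknowns;                                    // tolerated OOV slot
        continue;
      }
      break;                                           // hard mismatch
    }
    if (i == sought.size()) return start;
  }
  return NOT_FOUND;
}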
void SourceWordDeletionFeature::ComputeFeatures(const Phrase &source,
    const TargetPhrase& targetPhrase,
    ScoreComponentCollection* accumulator,
    const AlignmentInfo &alignmentInfo) const
{
  // handle special case: unknown words (they have no word alignment)
  size_t targetLength = targetPhrase.GetSize();
  size_t sourceLength = source.GetSize();
  if (targetLength == 1 && sourceLength == 1 && !alignmentInfo.GetSize()) return;

  // flag aligned words
  bool aligned[16];
  CHECK(sourceLength < 16);
  for(size_t i=0; i<sourceLength; i++)
    aligned[i] = false;
  for (AlignmentInfo::const_iterator alignmentPoint = alignmentInfo.begin(); alignmentPoint != alignmentInfo.end(); alignmentPoint++)
    aligned[ alignmentPoint->first ] = true;

  // process unaligned source words
  for(size_t i=0; i<sourceLength; i++) {
    if (!aligned[i]) {
      const Word &w = source.GetWord(i);
      if (!w.IsNonTerminal()) {
        const StringPiece word = w.GetFactor(m_factorType)->GetString();
        if (word != "<s>" && word != "</s>") {
          if (!m_unrestricted && FindStringPiece(m_vocab, word ) == m_vocab.end()) {
            accumulator->PlusEquals(this, StringPiece("OTHER"),1);
          } else {
            accumulator->PlusEquals(this,word,1);
          }
        }
      }
    }
  }
}
int Phrase::Compare(const Phrase &compare) const
{
  int ret = 0;
  for (size_t pos = 0; pos < GetSize(); ++pos) {
    if (pos >= compare.GetSize()) {
      // this phrase is longer than the other; sort it first
      ret = -1;
      break;
    }

    const Word &thisWord = GetWord(pos)
                           ,&compareWord = compare.GetWord(pos);
    int wordRet = thisWord.Compare(compareWord);
    if (wordRet != 0) {
      ret = wordRet;
      break;
    }
  }

  if (ret == 0) {
    CHECK(compare.GetSize() >= GetSize());
    ret = (compare.GetSize() > GetSize()) ? 1 : 0;
  }
  return ret;
}
Scores
LexicalReorderingTableTree::
auxFindScoreForContext(const Candidates& cands, const Phrase& context)
{
  if(m_FactorsC.empty()) {
    UTIL_THROW_IF2(cands.size() > 1, "Error");
    return (cands.size() == 1) ? cands[0].GetScore(0) : Scores();
  } else {
    std::vector<std::string> cvec;
    for(size_t i = 0; i < context.GetSize(); ++i)
      cvec.push_back(context.GetWord(i).GetString(m_FactorsC, false));

    IPhrase c = m_Table->ConvertPhrase(cvec,TargetVocId);
    IPhrase sub_c;
    IPhrase::iterator start = c.begin();
    for(size_t j = 0; j <= context.GetSize(); ++j, ++start) {
      sub_c.assign(start, c.end());
      for(size_t cand = 0; cand < cands.size(); ++cand) {
        IPhrase p = cands[cand].GetPhrase(0);
        if(p == sub_c)
          return cands[cand].GetScore(0);
      }
    }
    return Scores();
  }
}
Example #8
void RuleScope::EvaluateInIsolation(const Phrase &source
    , const TargetPhrase &targetPhrase
    , ScoreComponentCollection &scoreBreakdown
    , ScoreComponentCollection &estimatedFutureScore) const
{
  // adjacent non-terms count as 1 ambiguity, rather than 2 as in rule scope
  // source can't be empty, right?
  float score = 0;

  int count = 0;
  for (size_t i = 0; i < source.GetSize(); ++i) {
    const Word &word = source.GetWord(i);
    bool ambiguous = IsAmbiguous(word, m_sourceSyntax);
    if (ambiguous) {
      ++count;
    } else {
      if (count > 0) {
        score += count;
      }
      count = -1;
    }
  }

  // 1st & last words are always adjacent to ambiguity
  ++count;
  if (count > 0) {
    score += count;
  }

  scoreBreakdown.PlusEquals(this, score);
}
Scores LexicalReorderingTableTree::auxFindScoreForContext(const Candidates& cands, const Phrase& context)
{
  if(m_FactorsC.empty()) {
    CHECK(cands.size() <= 1);
    return (1 == cands.size())?(cands[0].GetScore(0)):(Scores());
  } else {
    std::vector<std::string> cvec;
    for(size_t i = 0; i < context.GetSize(); ++i) {
      /* old code
        std::string s = context.GetWord(i).ToString(m_FactorsC);
      cvec.push_back(s.substr(0,s.size()-1));
        */
      cvec.push_back(context.GetWord(i).GetString(m_FactorsC, false));
    }
    IPhrase c = m_Table->ConvertPhrase(cvec,TargetVocId);
    IPhrase sub_c;
    IPhrase::iterator start = c.begin();
    for(size_t j = 0; j <= context.GetSize(); ++j, ++start) {
      sub_c.assign(start, c.end());
      for(size_t cand = 0; cand < cands.size(); ++cand) {
        IPhrase p = cands[cand].GetPhrase(0);
        if(p == sub_c) {
          return cands[cand].GetScore(0);
        }
      }
    }
    return Scores();
  }
}
Example #10
void Phrase::Append(const Phrase &endPhrase)
{

  for (size_t i = 0; i < endPhrase.GetSize(); i++) {
    AddWord(endPhrase.GetWord(i));
  }
}
Example #11
lexicalCache PhraseDictionaryMultiModelCounts::CacheLexicalStatistics( const Phrase &phraseS, const Phrase &phraseT, AlignVector &alignment, const vector<lexicalTable*> &tables, bool is_input )
{
//do all the necessary lexical table lookups and get counts, but don't apply weights yet

  Word null;
  if (is_input) {
    null.CreateFromString(Input, m_input, "NULL", false);
  } else {
    null.CreateFromString(Output, m_output, "NULL", false);
  }

  lexicalCache ret;

  // all target words have to be explained
  for(size_t ti=0; ti<alignment.size(); ti++) {
    const set< size_t > & srcIndices = alignment[ ti ];
    Word t_word = phraseT.GetWord(ti);

    vector<lexicalPair> ti_vector;
    if (srcIndices.empty()) {
      // explain unaligned word by NULL
      vector<float> joint_count (m_numModels);
      vector<float> marginals (m_numModels);

      FillLexicalCountsJoint(null, t_word, joint_count, tables);
      FillLexicalCountsMarginal(null, marginals, tables);

      ti_vector.push_back(make_pair(joint_count, marginals));

    } else {
      for (set< size_t >::const_iterator si(srcIndices.begin()); si != srcIndices.end(); ++si) {
        Word s_word = phraseS.GetWord(*si);
        vector<float> joint_count (m_numModels);
        vector<float> marginals (m_numModels);

        FillLexicalCountsJoint(s_word, t_word, joint_count, tables);
        FillLexicalCountsMarginal(s_word, marginals, tables);

        ti_vector.push_back(make_pair(joint_count, marginals));
      }
    }
    ret.push_back(ti_vector);
  }
  return ret;
}
Example #12
Phrase::Phrase(const Phrase &copy)
  :m_words(copy.GetSize())
{
  for (size_t pos = 0; pos < copy.GetSize(); ++pos) {
    const Word &oldWord = copy.GetWord(pos);
    Word *newWord = new Word(oldWord);
    m_words[pos] = newWord;
  }
}
Example #13
void LanguageModelIRST::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const
{
  fullScore = 0;
  ngramScore = 0;
  oovCount = 0;

  if ( !phrase.GetSize() ) return;

  int _min = min(m_lmtb_size - 1, (int) phrase.GetSize());

  int codes[m_lmtb_size];
  int idx = 0;
  codes[idx] = m_lmtb_sentenceStart;
  ++idx;
  int position = 0;

  char* msp = NULL;
  float before_boundary = 0.0;
  for (; position < _min; ++position) {
    codes[idx] = GetLmID(phrase.GetWord(position));
    if (codes[idx] == m_unknownId) ++oovCount;
    before_boundary += m_lmtb->clprob(codes,idx+1,NULL,NULL,&msp);
    ++idx;
  }

  ngramScore = 0.0;
  int end_loop = (int) phrase.GetSize();

  for (; position < end_loop; ++position) {
    for (idx = 1; idx < m_lmtb_size; ++idx) {
      codes[idx-1] = codes[idx];
    }
    codes[idx-1] = GetLmID(phrase.GetWord(position));
    if (codes[idx-1] == m_unknownId) ++oovCount;
    ngramScore += m_lmtb->clprob(codes,idx,NULL,NULL,&msp);
  }
  before_boundary = TransformLMScore(before_boundary);
  ngramScore = TransformLMScore(ngramScore);
  fullScore = ngramScore + before_boundary;
}
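The second loop above keeps a fixed-size window of LM ids and shifts it left by one slot before appending each new word, so clprob always sees the most recent ids. A stripped-down sketch of just that window handling; ScoreNgram is a hypothetical stand-in for m_lmtb->clprob, and the real code additionally seeds the window with the sentence-start id.

#include <cstddef>
#include <vector>

// Hypothetical scorer: returns the log-prob of the n-gram in the window.
float ScoreNgram(const std::vector<int> &window)
{
  (void)window;   // stub
  return -1.0f;
}

float SlidingWindowScore(const std::vector<int> &ids, size_t order)
{
  std::vector<int> window(order, 0);
  float total = 0.0f;
  for (size_t pos = 0; pos < ids.size(); ++pos) {
    for (size_t i = 1; i < order; ++i)
      window[i - 1] = window[i];    // shift left, dropping the oldest id
    window[order - 1] = ids[pos];   // append the newest id
    total += ScoreNgram(window);
  }
  return total;
}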
Example #14
bool LanguageModelMultiFactor::Useable(const Phrase &phrase) const
{
  if (phrase.GetSize() == 0)
    return false;

  // whether phrase contains all factors in this LM
  const Word &word = phrase.GetWord(0);
  for (size_t currFactor = 0; currFactor < MAX_NUM_FACTORS; ++currFactor) {
    if (m_factorTypes[currFactor] && word[currFactor] == NULL)
      return false;
  }
  return true;
}
void LexicalReorderingTableTree::auxCacheForSrcPhrase(const Phrase& f)
{
  if(m_FactorsE.empty()) {
    //f is all of key...
    Candidates cands;
    m_Table->GetCandidates(MakeTableKey(f,Phrase(ARRAY_SIZE_INCR)),&cands);
    m_Cache[MakeCacheKey(f,Phrase(ARRAY_SIZE_INCR))] = cands;
  } else {
    ObjectPool<PPimp>     pool;
    PPimp* pPos  = m_Table->GetRoot();
    //1) goto subtree for f
    for(size_t i = 0; i < f.GetSize() && 0 != pPos && pPos->isValid(); ++i) {
      /* old code
      pPos = m_Table.Extend(pPos, auxClearString(f.GetWord(i).ToString(m_FactorsF)), SourceVocId);
      */
      pPos = m_Table->Extend(pPos, f.GetWord(i).GetString(m_FactorsF, false), SourceVocId);
    }
    if(0 != pPos && pPos->isValid()) {
      pPos = m_Table->Extend(pPos, PrefixTreeMap::MagicWord);
    }
    if(0 == pPos || !pPos->isValid()) {
      return;
    }
    //2) explore whole subtree depth first & cache
    std::string cache_key = auxClearString(f.GetStringRep(m_FactorsF)) + "|||";

    std::vector<State> stack;
    stack.push_back(State(pool.get(PPimp(pPos->ptr()->getPtr(pPos->idx),0,0)),""));
    Candidates cands;
    while(!stack.empty()) {
      if(stack.back().pos->isValid()) {
        LabelId w = stack.back().pos->ptr()->getKey(stack.back().pos->idx);
        std::string next_path = stack.back().path + " " + m_Table->ConvertWord(w,TargetVocId);
        //cache this
        m_Table->GetCandidates(*stack.back().pos,&cands);
        if(!cands.empty()) {
          m_Cache[cache_key + auxClearString(next_path)] = cands;
        }
        cands.clear();
        PPimp* next_pos = pool.get(PPimp(stack.back().pos->ptr()->getPtr(stack.back().pos->idx),0,0));
        ++stack.back().pos->idx;
        stack.push_back(State(next_pos,next_path));
      } else {
        stack.pop_back();
      }
    }
  }
}
void RulePairUnlexicalizedSource::EvaluateInIsolation(const Phrase &source
        , const TargetPhrase &targetPhrase
        , ScoreComponentCollection &scoreBreakdown
        , ScoreComponentCollection &estimatedFutureScore) const
{
    const Factor* targetPhraseLHS = targetPhrase.GetTargetLHS()[0];
    if ( !m_glueRules && (targetPhraseLHS == m_glueTargetLHS) ) {
        return;
    }
    if ( !m_nonGlueRules && (targetPhraseLHS != m_glueTargetLHS) ) {
        return;
    }

    for (size_t posS=0; posS<source.GetSize(); ++posS) {
        const Word &wordS = source.GetWord(posS);
        if ( !wordS.IsNonTerminal() ) {
            return;
        }
    }

    ostringstream namestr;

    for (size_t posT=0; posT<targetPhrase.GetSize(); ++posT) {
        const Word &wordT = targetPhrase.GetWord(posT);
        const Factor* factorT = wordT[0];
        if ( wordT.IsNonTerminal() ) {
            namestr << "[";
        }
        namestr << factorT->GetString();
        if ( wordT.IsNonTerminal() ) {
            namestr << "]";
        }
        namestr << "|";
    }

    namestr << targetPhraseLHS->GetString() << "|";

    for (AlignmentInfo::const_iterator it=targetPhrase.GetAlignNonTerm().begin();
            it!=targetPhrase.GetAlignNonTerm().end(); ++it) {
        namestr << "|" << it->first << "-" << it->second;
    }

    scoreBreakdown.PlusEquals(this, namestr.str(), 1);
    if ( targetPhraseLHS != m_glueTargetLHS ) {
        scoreBreakdown.PlusEquals(this, 1);
    }
}
//! set walls based on "-monotone-at-punctuation" flag
void ReorderingConstraint::SetMonotoneAtPunctuation( const Phrase &sentence )
{
  for( size_t i=0; i<sentence.GetSize(); i++ ) {
    const Word& word = sentence.GetWord(i);
    if (word[0]->GetString() == "," ||
        word[0]->GetString() == "." ||
        word[0]->GetString() == "!" ||
        word[0]->GetString() == "?" ||
        word[0]->GetString() == ":" ||
        word[0]->GetString() == ";" ||
        word[0]->GetString() == "\"") {
      // set wall before and after punc, but not at sentence start, end
      if (i>0 && i<m_size-1) SetWall( i, true );
      if (i>1)               SetWall( i-1, true );
    }
  }
}
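The rule itself is simple: every punctuation token that is not at a sentence edge gets a wall on itself and on the token before it, forcing monotone decoding across the punctuation. A toy restatement over plain strings; the walls vector is an assumption about what the Moses SetWall call records.

#include <cstddef>
#include <string>
#include <vector>

// Toy version: walls[i] == true marks a reordering wall at position i.
void MarkMonotoneAtPunctuation(const std::vector<std::string> &sentence,
                               std::vector<bool> &walls)
{
  walls.assign(sentence.size(), false);
  const std::string punct = ",.!?:;\"";
  for (size_t i = 0; i < sentence.size(); ++i) {
    if (sentence[i].size() == 1 && punct.find(sentence[i]) != std::string::npos) {
      if (i > 0 && i + 1 < sentence.size()) walls[i] = true;  // not at the edges
      if (i > 1) walls[i - 1] = true;                         // word before the punctuation
    }
  }
}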
bool
BilingualDynSuffixArray::
GetLocalVocabIDs(const Phrase& src, SAPhrase &output) const
{
  // looks up the SA vocab ids for the current src phrase
  size_t phraseSize = src.GetSize();
  for (size_t pos = 0; pos < phraseSize; ++pos) {
    const Word &word = src.GetWord(pos);
    wordID_t arrayId = m_srcVocab->GetWordID(word);
    if (arrayId == m_srcVocab->GetkOOVWordID()) {
      // oov
      return false;
    } else {
      output.SetId(pos, arrayId);
    }
  }
  return true;
}
Example #19
void OpSequenceModel::Evaluate(const Phrase &source
                                , const TargetPhrase &targetPhrase
                                , ScoreComponentCollection &scoreBreakdown
                                , ScoreComponentCollection &estimatedFutureScore) const
{

  osmHypothesis obj;
  obj.setState(OSM->NullContextState());
  WordsBitmap myBitmap(source.GetSize());
  vector <string> mySourcePhrase;
  vector <string> myTargetPhrase;
  vector<float> scores(5);
  vector <int> alignments;
  int startIndex = 0;
  int endIndex = source.GetSize();

  const AlignmentInfo &align = targetPhrase.GetAlignTerm();
  AlignmentInfo::const_iterator iter;


  for (iter = align.begin(); iter != align.end(); ++iter) {
    alignments.push_back(iter->first);
    alignments.push_back(iter->second);
  }

  for (int i = 0; i < targetPhrase.GetSize(); i++) {
    if (targetPhrase.GetWord(i).IsOOV())
      myTargetPhrase.push_back("_TRANS_SLF_");
    else
      myTargetPhrase.push_back(targetPhrase.GetWord(i).GetFactor(0)->GetString().as_string());
  }

  for (int i = 0; i < source.GetSize(); i++) {
    mySourcePhrase.push_back(source.GetWord(i).GetFactor(0)->GetString().as_string());
  }

  obj.setPhrases(mySourcePhrase , myTargetPhrase);
  obj.constructCepts(alignments,startIndex,endIndex-1,targetPhrase.GetSize());
  obj.computeOSMFeature(startIndex,myBitmap);
  obj.calculateOSMProb(*OSM);
  obj.populateScores(scores);
  estimatedFutureScore.PlusEquals(this, scores);

}
Example #20
bool Phrase::operator== (const Phrase &other) const
{
  size_t thisSize = GetSize()
                    ,compareSize = other.GetSize();
  if (thisSize != compareSize) {
    return false;
  }

  for (size_t pos = 0 ; pos < thisSize ; pos++) {
    const Word &thisWord	= GetWord(pos)
                            ,&otherWord	= other.GetWord(pos);
    bool ret = thisWord == otherWord;
    if (!ret) {
      return false;
    }
  }

  return true;
}
Example #21
void CountNonTerms::Evaluate(const Phrase &sourcePhrase
              , const TargetPhrase &targetPhrase
              , ScoreComponentCollection &scoreBreakdown
              , ScoreComponentCollection &estimatedFutureScore) const
{
  const StaticData &staticData = StaticData::Instance();

  vector<float> scores(m_numScoreComponents, 0);
  size_t indScore = 0;

  if (m_all) {
    for (size_t i = 0; i < targetPhrase.GetSize(); ++i) {
      const Word &word = targetPhrase.GetWord(i);
      if (word.IsNonTerminal()) {
        ++scores[indScore];
      }
    }
    ++indScore;
  }

  if (m_targetSyntax) {
    for (size_t i = 0; i < targetPhrase.GetSize(); ++i) {
      const Word &word = targetPhrase.GetWord(i);
      if (word.IsNonTerminal() && word != staticData.GetOutputDefaultNonTerminal()) {
        ++scores[indScore];
      }
    }
    ++indScore;
  }

  if (m_sourceSyntax) {
    for (size_t i = 0; i < sourcePhrase.GetSize(); ++i) {
      const Word &word = sourcePhrase.GetWord(i);
      if (word.IsNonTerminal() && word != staticData.GetInputDefaultNonTerminal()) {
        ++scores[indScore];
      }
    }
    ++indScore;
  }

  scoreBreakdown.PlusEquals(this, scores);
}
Example #22
void LanguageModelDALM::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const
{
  fullScore  = 0;
  ngramScore = 0;
  oovCount = 0;

  size_t phraseSize = phrase.GetSize();
  if (!phraseSize) return;

  DALMState *dalm_state = new DALMState(m_nGramOrder);
  
  size_t currPos = 0;
  size_t hist_count = 0;
  
  while (currPos < phraseSize) {
    const Word &word = phrase.GetWord(currPos);
    hist_count++;

    if (word.IsNonTerminal()) {
      // do nothing. reset ngram. needed to score target phrases during pt loading in chart decoding
      dalm_state->refresh();
      hist_count = 0;
    } else {
      if (word.GetFactor(m_factorType) == m_beginSentenceFactor) {
        // do nothing, don't include prob for <s> unigram
        if (currPos != 0) {
          std::cerr << "Either your data contains <s> in a position other than the first word or your language model is missing <s>.  Did you build your ARPA using IRSTLM and forget to run add-start-end.sh?" << std::endl;
          abort();
        }
        m_lm->init_state(*dalm_state->get_state());
      } else {
        LMResult result = GetValue(word, dalm_state->get_state());
        fullScore += result.score;
        if (hist_count >= m_nGramOrder) ngramScore += result.score;
        if (result.unknown) ++oovCount;
      }
    }

    currPos++;
  }
  delete dalm_state;
}
Example #23
int Phrase::Compare(const Phrase &other) const
{
#ifdef min
#undef min
#endif
  size_t thisSize			= GetSize()
                        ,compareSize	= other.GetSize();
  if (thisSize != compareSize) {
    return (thisSize < compareSize) ? -1 : 1;
  }

  for (size_t pos = 0 ; pos < thisSize ; pos++) {
    const Word &thisWord	= GetWord(pos)
                            ,&otherWord	= other.GetWord(pos);
    int ret = Word::Compare(thisWord, otherWord);

    if (ret != 0)
      return ret;
  }

  return 0;
}
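Worth noting: this overload orders phrases by length before content, while the Phrase::Compare shown earlier walks the words first and only falls back to length when one phrase is a prefix of the other, so the two induce different orderings. A tiny demonstration of the two policies on plain string vectors (hypothetical helper names):

#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

typedef std::vector<std::string> Toks;

// Length-first, as in the overload directly above.
int CompareLengthFirst(const Toks &a, const Toks &b)
{
  if (a.size() != b.size()) return (a.size() < b.size()) ? -1 : 1;
  for (size_t i = 0; i < a.size(); ++i)
    if (a[i] != b[i]) return (a[i] < b[i]) ? -1 : 1;
  return 0;
}

// Word-first, as in the earlier Phrase::Compare example.
int CompareWordFirst(const Toks &a, const Toks &b)
{
  for (size_t i = 0; i < a.size(); ++i) {
    if (i >= b.size()) return -1;  // a is longer: sort it first
    if (a[i] != b[i]) return (a[i] < b[i]) ? -1 : 1;
  }
  return (b.size() > a.size()) ? 1 : 0;
}

int main()
{
  Toks x(1, "b");  // {"b"}
  Toks y(2, "a");  // {"a", "a"}
  std::cout << CompareLengthFirst(x, y) << " "       // -1: x is shorter
            << CompareWordFirst(x, y) << std::endl;  //  1: "b" sorts after "a"
  return 0;
}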
Example #24
void DiscriminativeLMBigramFeatureFunction::doUpdate(const Phrase& gapPhrase, const TargetGap& gap, FVector& scores)
{
    if (gap.leftHypo->GetPrevHypo()) {
        //left edge
        const TargetPhrase& leftPhrase = gap.leftHypo->GetTargetPhrase();
        scoreBigram(leftPhrase.GetWord(leftPhrase.GetSize()-1), gapPhrase.GetWord(0),scores);
    } else {
      scoreBigram(m_parent.bos(), gapPhrase.GetWord(0),scores);
    }
    //gap phrase
    size_t i = 0;
    for (; i < gapPhrase.GetSize()-1; ++i) {
        scoreBigram(gapPhrase.GetWord(i), gapPhrase.GetWord(i+1),scores);
    }
    
    //right edge
    if (gap.rightHypo) {
        scoreBigram(gapPhrase.GetWord(i),gap.rightHypo->GetTargetPhrase().GetWord(0), scores);
    } else {
      scoreBigram(gapPhrase.GetWord(i),m_parent.eos(),scores);
    }

    
}
Example #25
void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range, ChartParserCallback &to)
{
  // unknown word, add as trans opt
  const StaticData &staticData = StaticData::Instance();
  const UnknownWordPenaltyProducer &unknownWordPenaltyProducer = UnknownWordPenaltyProducer::Instance();

  size_t isDigit = 0;
  if (staticData.GetDropUnknown()) {
    const Factor *f = sourceWord[0]; // TODO hack. shouldn't know which factor is surface
    const StringPiece s = f->GetString();
    isDigit = s.find_first_of("0123456789");
    if (isDigit == string::npos)
      isDigit = 0;
    else
      isDigit = 1;
    // modify the starting bitmap
  }

  Phrase* unksrc = new Phrase(1);
  unksrc->AddWord() = sourceWord;
  Word &newWord = unksrc->GetWord(0);
  newWord.SetIsOOV(true);

  m_unksrcs.push_back(unksrc);

  //TranslationOption *transOpt;
  if (! staticData.GetDropUnknown() || isDigit) {
    // loop
    const UnknownLHSList &lhsList = staticData.GetUnknownLHS();
    UnknownLHSList::const_iterator iterLHS;
    for (iterLHS = lhsList.begin(); iterLHS != lhsList.end(); ++iterLHS) {
      const string &targetLHSStr = iterLHS->first;
      float prob = iterLHS->second;

      // lhs
      //const Word &sourceLHS = staticData.GetInputDefaultNonTerminal();
      Word *targetLHS = new Word(true);

      targetLHS->CreateFromString(Output, staticData.GetOutputFactorOrder(), targetLHSStr, true);
      UTIL_THROW_IF2(targetLHS->GetFactor(0) == NULL, "Null factor for target LHS");

      // add to dictionary
      TargetPhrase *targetPhrase = new TargetPhrase();
      Word &targetWord = targetPhrase->AddWord();
      targetWord.CreateUnknownWord(sourceWord);

      // scores
      float unknownScore = FloorScore(TransformScore(prob));

      targetPhrase->GetScoreBreakdown().Assign(&unknownWordPenaltyProducer, unknownScore);
      targetPhrase->Evaluate(*unksrc);

      targetPhrase->SetTargetLHS(targetLHS);
      targetPhrase->SetAlignmentInfo("0-0");
      if (staticData.IsDetailedTreeFragmentsTranslationReportingEnabled()) {
        targetPhrase->SetProperty("Tree","[ " + (*targetLHS)[0]->GetString().as_string() + " "+sourceWord[0]->GetString().as_string()+" ]");
      }

      // chart rule
      to.AddPhraseOOV(*targetPhrase, m_cacheTargetPhraseCollection, range);
    } // for (iterLHS
  } else {
    // drop source word. create blank trans opt
    float unknownScore = FloorScore(-numeric_limits<float>::infinity());

    TargetPhrase *targetPhrase = new TargetPhrase();
    // loop
    const UnknownLHSList &lhsList = staticData.GetUnknownLHS();
    UnknownLHSList::const_iterator iterLHS;
    for (iterLHS = lhsList.begin(); iterLHS != lhsList.end(); ++iterLHS) {
      const string &targetLHSStr = iterLHS->first;
      //float prob = iterLHS->second;

      Word *targetLHS = new Word(true);
      targetLHS->CreateFromString(Output, staticData.GetOutputFactorOrder(), targetLHSStr, true);
      UTIL_THROW_IF2(targetLHS->GetFactor(0) == NULL, "Null factor for target LHS");

      targetPhrase->GetScoreBreakdown().Assign(&unknownWordPenaltyProducer, unknownScore);
      targetPhrase->Evaluate(*unksrc);

      targetPhrase->SetTargetLHS(targetLHS);

      // chart rule
      to.AddPhraseOOV(*targetPhrase, m_cacheTargetPhraseCollection, range);
    }
  }
}
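The unknown-word penalty above is a floored log score: TransformScore takes the natural log of the probability, and FloorScore clamps the result so that log(0) for dropped words becomes a large negative constant rather than -inf. A sketch of those two helpers, assuming they match the Moses defaults (LOWEST_SCORE = -100 is an assumption here):

#include <algorithm>
#include <cmath>
#include <iostream>
#include <limits>

const float LOWEST_SCORE = -100.0f;  // assumed Moses default
inline float TransformScore(float prob) { return std::log(prob); }
inline float FloorScore(float logScore) { return std::max(logScore, LOWEST_SCORE); }

int main()
{
  std::cout << FloorScore(TransformScore(0.5f)) << std::endl;  // ~ -0.693
  // A dropped unknown word gets probability 0: -inf floored to -100.
  std::cout << FloorScore(-std::numeric_limits<float>::infinity()) << std::endl;
  return 0;
}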
Example #26
TargetPhraseVectorPtr PhraseDecoder::DecodeCollection(
  TargetPhraseVectorPtr tpv, BitWrapper<> &encodedBitStream,
  const Phrase &sourcePhrase, bool topLevel)
{
  
  bool extending = tpv->size();
  size_t bitsLeft = encodedBitStream.TellFromEnd();
    
  typedef std::pair<size_t, size_t> AlignPointSizeT;
  
  std::vector<int> sourceWords;
  if(m_coding == REnc)
  {
    for(size_t i = 0; i < sourcePhrase.GetSize(); i++)
    {
      std::string sourceWord
        = sourcePhrase.GetWord(i).GetString(*m_input, false);
      unsigned idx = GetSourceSymbolId(sourceWord);
      sourceWords.push_back(idx);
    }
  }
  
  unsigned phraseStopSymbol = 0;
  AlignPoint alignStopSymbol(-1, -1);
  
  std::vector<float> scores;
  std::set<AlignPointSizeT> alignment;
  
  enum DecodeState { New, Symbol, Score, Alignment, Add } state = New;
  
  size_t srcSize = sourcePhrase.GetSize();
  
  TargetPhrase* targetPhrase = NULL;
  while(encodedBitStream.TellFromEnd())
  {
     
    if(state == New)
    {
      // Creating new TargetPhrase on the heap
      tpv->push_back(TargetPhrase(Output));
      targetPhrase = &tpv->back();
      
      targetPhrase->SetSourcePhrase(sourcePhrase);
      alignment.clear();
      scores.clear();
        
      state = Symbol;
    }
    
    if(state == Symbol)
    {
      unsigned symbol = m_symbolTree->Read(encodedBitStream);      
      if(symbol == phraseStopSymbol)
      {
        state = Score;
      }
      else
      {
        if(m_coding == REnc)
        {
          std::string wordString;
          size_t type = GetREncType(symbol);
          
          if(type == 1)
          {
            unsigned decodedSymbol = DecodeREncSymbol1(symbol);
            wordString = GetTargetSymbol(decodedSymbol);
          }
          else if (type == 2)
          {
            size_t rank = DecodeREncSymbol2Rank(symbol);
            size_t srcPos = DecodeREncSymbol2Position(symbol);
            
            if(srcPos >= sourceWords.size())
              return TargetPhraseVectorPtr();  
            
            wordString = GetTargetSymbol(GetTranslation(sourceWords[srcPos], rank));
            if(m_phraseDictionary.m_useAlignmentInfo)
            {
              size_t trgPos = targetPhrase->GetSize();
              alignment.insert(AlignPoint(srcPos, trgPos));
            }
          }
          else if(type == 3)
          {
            size_t rank = DecodeREncSymbol3(symbol);
            size_t srcPos = targetPhrase->GetSize();
            
            if(srcPos >= sourceWords.size())
              return TargetPhraseVectorPtr();  
                            
            wordString = GetTargetSymbol(GetTranslation(sourceWords[srcPos], rank));   
            if(m_phraseDictionary.m_useAlignmentInfo)
            {
              size_t trgPos = srcPos;
              alignment.insert(AlignPoint(srcPos, trgPos));
            }
          }
          
          Word word;
          word.CreateFromString(Output, *m_output, wordString, false);
          targetPhrase->AddWord(word);
        }
        else if(m_coding == PREnc)
        {
          // if the symbol is just a word
          if(GetPREncType(symbol) == 1)
          {
            unsigned decodedSymbol = DecodePREncSymbol1(symbol);
     
            Word word;
            word.CreateFromString(Output, *m_output,
                                  GetTargetSymbol(decodedSymbol), false);
            targetPhrase->AddWord(word);
          }
          // if the symbol is a subphrase pointer
          else
          {
            int left = DecodePREncSymbol2Left(symbol);
            int right = DecodePREncSymbol2Right(symbol);
            unsigned rank = DecodePREncSymbol2Rank(symbol);
            
            int srcStart = left + targetPhrase->GetSize();
            int srcEnd   = srcSize - right - 1;
            
            // false positive consistency check
            if(0 > srcStart || srcStart > srcEnd || unsigned(srcEnd) >= srcSize)
              return TargetPhraseVectorPtr();
            
            // false positive consistency check
            if(m_maxRank && rank > m_maxRank)
                return TargetPhraseVectorPtr();
            
            // set subphrase by default to itself
            TargetPhraseVectorPtr subTpv = tpv;
            
            // if range smaller than source phrase retrieve subphrase
            if(unsigned(srcEnd - srcStart + 1) != srcSize)
            {
              Phrase subPhrase = sourcePhrase.GetSubString(WordsRange(srcStart, srcEnd));
              subTpv = CreateTargetPhraseCollection(subPhrase, false);
            }
            
            // false positive consistency check
            if(subTpv != NULL && rank < subTpv->size())
            {
              // insert the subphrase into the main target phrase
              TargetPhrase& subTp = subTpv->at(rank);
              if(m_phraseDictionary.m_useAlignmentInfo)
              {
                // reconstruct the alignment data based on the alignment of the subphrase
                for(AlignmentInfo::const_iterator it = subTp.GetAlignmentInfo().begin();
                    it != subTp.GetAlignmentInfo().end(); it++)
                {
                  alignment.insert(AlignPointSizeT(srcStart + it->first,
                                                   targetPhrase->GetSize() + it->second));
                }
              }
              targetPhrase->Append(subTp);
            }
            else 
              return TargetPhraseVectorPtr();
          }
        }
        else
        {
            Word word;
            word.CreateFromString(Output, *m_output,
                                  GetTargetSymbol(symbol), false);
            targetPhrase->AddWord(word);
        }
      }
    }
    else if(state == Score)
    {
      size_t idx = m_multipleScoreTrees ? scores.size() : 0;
      float score = m_scoreTrees[idx]->Read(encodedBitStream);
      scores.push_back(score);
      
      if(scores.size() == m_numScoreComponent)
      {
        targetPhrase->SetScore(m_feature, scores, ScoreComponentCollection() /*sparse*/,*m_weight, m_weightWP, *m_languageModels);
        
        if(m_containsAlignmentInfo)
          state = Alignment;
        else
          state = Add;
      }
    }
    else if(state == Alignment)
    {
      AlignPoint alignPoint = m_alignTree->Read(encodedBitStream);
      if(alignPoint == alignStopSymbol)
      {
        state = Add;
      }
      else
      {
        if(m_phraseDictionary.m_useAlignmentInfo)  
          alignment.insert(AlignPointSizeT(alignPoint));
      }
    }
    
    if(state == Add)
    {
      if(m_phraseDictionary.m_useAlignmentInfo)
        targetPhrase->SetAlignmentInfo(alignment);
      
      if(m_coding == PREnc)
      {
        if(!m_maxRank || tpv->size() <= m_maxRank)
          bitsLeft = encodedBitStream.TellFromEnd();
        
        if(!topLevel && m_maxRank && tpv->size() >= m_maxRank)
          break;
      }
      
      if(encodedBitStream.TellFromEnd() <= 8)
        break;
      
      state = New;
    }    
  }
  
  if(m_coding == PREnc && !extending)
  {
    bitsLeft = bitsLeft > 8 ? bitsLeft : 0;
    m_decodingCache.Cache(sourcePhrase, tpv, bitsLeft, m_maxRank);
  }
  
  return tpv;
}
vector< vector<const Word*> > MosesDecoder::runChartDecoder(const std::string& source,
    size_t sentenceid,
    size_t nBestSize,
    float bleuObjectiveWeight,
    float bleuScoreWeight,
    vector< ScoreComponentCollection>& featureValues,
    vector< float>& bleuScores,
    vector< float>& modelScores,
    size_t numReturnedTranslations,
    bool realBleu,
    bool distinct,
    size_t rank,
    size_t epoch,
    const TranslationSystem& system)
{
  // run the decoder
  m_chartManager = new ChartManager(*m_sentence, &system);
  m_chartManager->ProcessSentence();
  ChartTrellisPathList nBestList;
  m_chartManager->CalcNBest(nBestSize, nBestList, distinct);

  // read off the feature values and bleu scores for each sentence in the nbest list
  ChartTrellisPathList::const_iterator iter;
  for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
    const Moses::ChartTrellisPath &path = **iter;
    featureValues.push_back(path.GetScoreBreakdown());
    float bleuScore, dynBleuScore, realBleuScore;
    dynBleuScore = getBleuScore(featureValues.back());
    realBleuScore = m_bleuScoreFeature->CalculateBleu(path.GetOutputPhrase());
    bleuScore = realBleu ? realBleuScore : dynBleuScore;
    bleuScores.push_back(bleuScore);

    //std::cout << "Score breakdown: " << path.GetScoreBreakdown() << endl;
    float scoreWithoutBleu = path.GetTotalScore() - (bleuObjectiveWeight * bleuScoreWeight * bleuScore);
    modelScores.push_back(scoreWithoutBleu);

    if (iter != nBestList.begin())
      cerr << endl;
    cerr << "Rank " << rank << ", epoch " << epoch << ", \"" << path.GetOutputPhrase() << "\", score: "
         << scoreWithoutBleu << ", Bleu: " << bleuScore << ", total: " << path.GetTotalScore();
    if (m_bleuScoreFeature->Enabled() && realBleu)
      cerr << " (d-bleu: " << dynBleuScore << ", r-bleu: " << realBleuScore << ") ";

    // set bleu score to zero in the feature vector since we do not want to optimise its weight
    setBleuScore(featureValues.back(), 0);
  }

  // prepare translations to return
  vector< vector<const Word*> > translations;
  for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
    const ChartTrellisPath &path = **iter;
    Phrase phrase = path.GetOutputPhrase();

    vector<const Word*> translation;
    for (size_t pos = 0; pos < phrase.GetSize(); ++pos) {
      const Word &word = phrase.GetWord(pos);
      Word *newWord = new Word(word);
      translation.push_back(newWord);
    }
    translations.push_back(translation);
  }

  return translations;
}
Example #28
vector< vector<const Word*> > MosesDecoder::runChartDecoder(const std::string& source,
    size_t sentenceid,
    size_t nBestSize,
    float bleuObjectiveWeight,
    float bleuScoreWeight,
    vector< ScoreComponentCollection>& featureValues,
    vector< float>& bleuScores,
    vector< float>& modelScores,
    size_t numReturnedTranslations,
    bool realBleu,
    bool distinct,
    size_t rank,
    size_t epoch)
{
  // run the decoder
  m_chartManager = new ChartManager(*m_sentence);
  m_chartManager->Decode();
  ChartKBestExtractor::KBestVec nBestList;
  m_chartManager->CalcNBest(nBestSize, nBestList, distinct);

  // read off the feature values and bleu scores for each sentence in the nbest list
  for (ChartKBestExtractor::KBestVec::const_iterator p = nBestList.begin();
       p != nBestList.end(); ++p) {
    const ChartKBestExtractor::Derivation &derivation = **p;
    featureValues.push_back(*ChartKBestExtractor::GetOutputScoreBreakdown(derivation));
    float bleuScore, dynBleuScore, realBleuScore;
    dynBleuScore = getBleuScore(featureValues.back());
    Phrase outputPhrase = ChartKBestExtractor::GetOutputPhrase(derivation);
    realBleuScore = m_bleuScoreFeature->CalculateBleu(outputPhrase);
    bleuScore = realBleu ? realBleuScore : dynBleuScore;
    bleuScores.push_back(bleuScore);

    float scoreWithoutBleu = derivation.score - (bleuObjectiveWeight * bleuScoreWeight * bleuScore);
    modelScores.push_back(scoreWithoutBleu);

    if (p != nBestList.begin())
      cerr << endl;
    cerr << "Rank " << rank << ", epoch " << epoch << ", \"" << outputPhrase << "\", score: "
         << scoreWithoutBleu << ", Bleu: " << bleuScore << ", total: " << derivation.score;
    if (m_bleuScoreFeature->Enabled() && realBleu)
      cerr << " (d-bleu: " << dynBleuScore << ", r-bleu: " << realBleuScore << ") ";

    // set bleu score to zero in the feature vector since we do not want to optimise its weight
    setBleuScore(featureValues.back(), 0);
  }

  // prepare translations to return
  vector< vector<const Word*> > translations;
  for (ChartKBestExtractor::KBestVec::const_iterator p = nBestList.begin();
       p != nBestList.end(); ++p) {
    const ChartKBestExtractor::Derivation &derivation = **p;
    Phrase phrase = ChartKBestExtractor::GetOutputPhrase(derivation);

    vector<const Word*> translation;
    for (size_t pos = 0; pos < phrase.GetSize(); ++pos) {
      const Word &word = phrase.GetWord(pos);
      Word *newWord = new Word(word);
      translation.push_back(newWord);
    }
    translations.push_back(translation);
  }

  return translations;
}
Example #29
vector< vector<const Word*> > MosesDecoder::runDecoder(const std::string& source,
    size_t sentenceid,
    size_t nBestSize,
    float bleuObjectiveWeight,
    float bleuScoreWeight,
    vector< ScoreComponentCollection>& featureValues,
    vector< float>& bleuScores,
    vector< float>& modelScores,
    size_t numReturnedTranslations,
    bool realBleu,
    bool distinct,
    size_t rank,
    size_t epoch,
    SearchAlgorithm& search,
    string filename)
{
  // run the decoder
  m_manager = new Moses::Manager(*m_sentence);
  m_manager->Decode();
  TrellisPathList nBestList;
  m_manager->CalcNBest(nBestSize, nBestList, distinct);

  // optionally print nbest to file (to extract scores and features.. currently just for sentence bleu scoring)
  /*if (filename != "") {
    ofstream out(filename.c_str());
    if (!out) {
      ostringstream msg;
      msg << "Unable to open " << filename;
      throw runtime_error(msg.str());
    }
    // TODO: handle sentence id (for now always 0)
    //OutputNBest(out, nBestList, StaticData::Instance().GetOutputFactorOrder(), 0, false);
    out.close();
  }*/

  // read off the feature values and bleu scores for each sentence in the nbest list
  Moses::TrellisPathList::const_iterator iter;
  for (iter = nBestList.begin() ; iter != nBestList.end() ; ++iter) {
    const Moses::TrellisPath &path = **iter;
    featureValues.push_back(path.GetScoreBreakdown());
    float bleuScore, dynBleuScore, realBleuScore;
    if (realBleu) realBleuScore = m_bleuScoreFeature->CalculateBleu(path.GetTargetPhrase());
    else dynBleuScore = getBleuScore(featureValues.back());
    bleuScore = realBleu ? realBleuScore : dynBleuScore;
    bleuScores.push_back(bleuScore);

    //std::cout << "Score breakdown: " << path.GetScoreBreakdown() << endl;
    float scoreWithoutBleu = path.GetTotalScore() - (bleuObjectiveWeight * bleuScoreWeight * bleuScore);
    modelScores.push_back(scoreWithoutBleu);

    if (iter != nBestList.begin())
      cerr << endl;
    cerr << "Rank " << rank << ", epoch " << epoch << ", \"" << path.GetTargetPhrase() << "\", score: "
         << scoreWithoutBleu << ", Bleu: " << bleuScore << ", total: " << path.GetTotalScore();
    if (m_bleuScoreFeature->Enabled() && realBleu)
      cerr << " (d-bleu: " << dynBleuScore << ", r-bleu: " << realBleuScore << ") ";

    // set bleu score to zero in the feature vector since we do not want to optimise its weight
    setBleuScore(featureValues.back(), 0);
  }

  // prepare translations to return
  vector< vector<const Word*> > translations;
  for (size_t i=0; i < numReturnedTranslations && i < nBestList.GetSize(); ++i) {
    const TrellisPath &path = nBestList.at(i);
    Phrase phrase = path.GetTargetPhrase();

    vector<const Word*> translation;
    for (size_t pos = 0; pos < phrase.GetSize(); ++pos) {
      const Word &word = phrase.GetWord(pos);
      Word *newWord = new Word(word);
      translation.push_back(newWord);
    }
    translations.push_back(translation);
  }

  return translations;
}