std::vector<float> LexicalReorderingTableCompact::GetScore(const Phrase& f,
    const Phrase& e,
    const Phrase& c)
{
  std::string key;
  Scores scores;

  if(0 == c.GetSize())
    key = MakeKey(f, e, c);
  else
    // Note: key is overwritten on every iteration, so only the key built in
    // the last iteration reaches the hash lookup below.
    for(size_t i = 0; i <= c.GetSize(); ++i) {
      Phrase sub_c(c.GetSubString(WordsRange(i, c.GetSize()-1)));
      key = MakeKey(f, e, sub_c);
    }

  // a miss is signalled by an index equal to m_hash.GetSize()
  size_t index = m_hash[key];
  if(m_hash.GetSize() != index) {
    std::string scoresString;
    if(m_inMemory)
      scoresString = m_scoresMemory[index];
    else
      scoresString = m_scoresMapped[index];

    // decode the compressed score components from the bit stream
    BitWrapper<> bitStream(scoresString);
    for(size_t i = 0; i < m_numScoreComponent; i++)
      scores.push_back(m_scoreTrees[m_multipleScoreTrees ? i : 0]->Read(bitStream));

    return scores;
  }

  return Scores();
}
std::vector<float> LexicalReorderingTableMemory::GetScore(const Phrase& f,
    const Phrase& e,
    const Phrase& c)
{
  // Somewhat involved because of const: operator[] cannot be used, as it
  // might insert new entries into the std::map. Also be careful with
  // WordsRange: if c is empty, c.GetSize()-1 underflows and becomes huge.
  TableType::const_iterator r;
  std::string key;
  if(0 == c.GetSize()) {
    key = MakeKey(f, e, c);
    r = m_Table.find(key);
    if(m_Table.end() != r)
      return r->second;
  } else {
    // back off from the largest to smaller contexts
    for(size_t i = 0; i <= c.GetSize(); ++i) {
      Phrase sub_c(c.GetSubString(WordsRange(i, c.GetSize()-1)));
      key = MakeKey(f, e, sub_c);
      r = m_Table.find(key);
      if(m_Table.end() != r)
        return r->second;
    }
  }
  return Scores();
}
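// Illustrative back-off behaviour of the lookup above (variable names are
// hypothetical): for a three-word context c = "a b c", the loop tries the
// keys built from the sub-contexts "a b c", "b c", "c" and finally, on the
// last iteration, an empty context, returning the scores of the first key
// present in m_Table:
//
//   Scores s = table.GetScore(f, e, c);  // `table` is a loaded
//   if(!s.empty())                       // LexicalReorderingTableMemory
//     ; // use the reordering score components
//
// An empty vector signals that no entry matched at any context length.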
/**
 * Calculate real sentence Bleu score of complete translation
 */
float BleuScoreFeature::CalculateBleu(Phrase translation) const
{
  if (translation.GetSize() == 0)
    return 0.0;

  Phrase normTranslation = translation;
  // remove start and end symbol for chart decoding
  if (m_cur_source_length != m_cur_norm_source_length) {
    WordsRange range(1, translation.GetSize()-2);
    normTranslation = translation.GetSubString(range);
  }

  // get ngram matches for translation
  BleuScoreState state;
  GetClippedNgramMatchesAndCounts(normTranslation,
                                  m_cur_ref_ngrams,
                                  state.m_ngram_counts,
                                  state.m_ngram_matches,
                                  0); // number of words in previous states

  // set state variables
  state.m_words = normTranslation;
  state.m_source_length = m_cur_norm_source_length;
  state.m_target_length = normTranslation.GetSize();
  state.m_scaled_ref_length = m_cur_ref_length;

  // Calculate bleu.
  return CalculateBleu(&state);
}
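// Example of the normalisation above (assuming the boundary markers are the
// usual <s>/</s> sentence symbols of chart decoding): for a hypothesis
// "<s> the house </s>" of size 4, WordsRange(1, 2) extracts "the house",
// so the boundary markers never enter the ngram statistics. The lengths
// m_cur_source_length and m_cur_norm_source_length differ exactly when such
// markers are present.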
// score ngrams around the overlap of two previously scored phrases
void BleuScoreFeature::GetNgramMatchCounts_overlap(Phrase& phrase,
    const NGrams& ref_ngram_counts,
    std::vector< size_t >& ret_counts,
    std::vector< size_t >& ret_matches,
    size_t overlap_index) const
{
  NGrams::const_iterator ref_ngram_counts_iter;
  size_t ngram_start_idx, ngram_end_idx;

  // Chiang et al (2008) use unclipped counts of ngram matches
  for (size_t end_idx = overlap_index; end_idx < phrase.GetSize(); end_idx++) {
    if (end_idx >= (overlap_index+BleuScoreState::bleu_order-1)) break;
    for (size_t order = 0; order < BleuScoreState::bleu_order; order++) {
      if (order > end_idx) break;

      ngram_end_idx = end_idx;
      ngram_start_idx = end_idx - order;
      if (ngram_start_idx >= overlap_index) continue; // only score ngrams that span the overlap point

      Phrase ngram = phrase.GetSubString(WordsRange(ngram_start_idx, ngram_end_idx), 0);
      ret_counts[order]++;

      ref_ngram_counts_iter = ref_ngram_counts.find(ngram);
      if (ref_ngram_counts_iter != ref_ngram_counts.end())
        ret_matches[order]++;
    }
  }
}
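// Worked example for the overlap scoring above (with bleu_order = 4): if
// overlap_index = 3, only ngrams with ngram_start_idx < 3 and
// ngram_end_idx >= 3 are counted, e.g. the bigram [2,3] or the 4-gram
// [1,4], while [3,4] is skipped by the `continue` because it lies entirely
// to the right of the overlap point. Ngrams strictly left of the overlap
// were already counted when the previous phrase was scored, so this avoids
// double counting.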
// score ngrams of words that have been added before the previous word span
void BleuScoreFeature::GetNgramMatchCounts_prefix(Phrase& phrase,
    const NGrams& ref_ngram_counts,
    std::vector< size_t >& ret_counts,
    std::vector< size_t >& ret_matches,
    size_t new_start_indices,
    size_t last_end_index) const
{
  NGrams::const_iterator ref_ngram_counts_iter;
  size_t ngram_start_idx, ngram_end_idx;

  // Chiang et al (2008) use unclipped counts of ngram matches
  for (size_t start_idx = 0; start_idx < new_start_indices; start_idx++) {
    for (size_t order = 0; order < BleuScoreState::bleu_order; order++) {
      ngram_start_idx = start_idx;
      ngram_end_idx = start_idx + order;
      if (order > ngram_end_idx) break;
      if (ngram_end_idx > last_end_index) break;

      Phrase ngram = phrase.GetSubString(WordsRange(ngram_start_idx, ngram_end_idx), 0);
      ret_counts[order]++;

      ref_ngram_counts_iter = ref_ngram_counts.find(ngram);
      if (ref_ngram_counts_iter != ref_ngram_counts.end())
        ret_matches[order]++;
    }
  }
}
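// Worked example for the prefix scoring above (with bleu_order = 4):
// new_start_indices = 2 and last_end_index = 4 counts every ngram that
// starts at position 0 or 1 and ends no later than position 4, i.e.
// [0,0]..[0,3] and [1,1]..[1,4]; anything longer is cut off by the
// `ngram_end_idx > last_end_index` check.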
/*
 * Given a phrase (current translation) calculate its ngram counts and
 * its ngram matches against the ngrams in the reference translation
 */
void BleuScoreFeature::GetNgramMatchCounts(Phrase& phrase,
    const NGrams& ref_ngram_counts,
    std::vector< size_t >& ret_counts,
    std::vector< size_t >& ret_matches,
    size_t skip_first) const
{
  NGrams::const_iterator ref_ngram_counts_iter;
  size_t ngram_start_idx, ngram_end_idx;

  // Chiang et al (2008) use unclipped counts of ngram matches
  for (size_t end_idx = skip_first; end_idx < phrase.GetSize(); end_idx++) {
    for (size_t order = 0; order < BleuScoreState::bleu_order; order++) {
      if (order > end_idx) break;

      ngram_end_idx = end_idx;
      ngram_start_idx = end_idx - order;

      Phrase ngram = phrase.GetSubString(WordsRange(ngram_start_idx, ngram_end_idx), 0);
      ret_counts[order]++;

      ref_ngram_counts_iter = ref_ngram_counts.find(ngram);
      if (ref_ngram_counts_iter != ref_ngram_counts.end())
        ret_matches[order]++;
    }
  }
}
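// Note on "unclipped": every occurrence of an ngram that exists in the
// reference counts as a match here, even beyond the reference frequency.
// E.g. if the hypothesis contains "the" three times and the reference only
// twice, this function reports 3 unigram matches, whereas the clipped
// variant below (GetClippedNgramMatchesAndCounts) caps the count at 2.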
// as GetNgramMatchCounts, but with matches clipped to the reference frequency
void BleuScoreFeature::GetClippedNgramMatchesAndCounts(Phrase& phrase,
    const NGrams& ref_ngram_counts,
    std::vector< size_t >& ret_counts,
    std::vector< size_t >& ret_matches,
    size_t skip_first) const
{
  NGrams::const_iterator ref_ngram_counts_iter;
  size_t ngram_start_idx, ngram_end_idx;

  Matches ngram_matches;
  for (size_t end_idx = skip_first; end_idx < phrase.GetSize(); end_idx++) {
    for (size_t order = 0; order < BleuScoreState::bleu_order; order++) {
      if (order > end_idx) break;

      ngram_end_idx = end_idx;
      ngram_start_idx = end_idx - order;

      Phrase ngram = phrase.GetSubString(WordsRange(ngram_start_idx, ngram_end_idx), 0);
      ret_counts[order]++;

      ref_ngram_counts_iter = ref_ngram_counts.find(ngram);
      if (ref_ngram_counts_iter != ref_ngram_counts.end())
        ngram_matches[order][ngram]++;
    }
  }

  // clip ngram matches
  for (size_t order = 0; order < BleuScoreState::bleu_order; order++) {
    NGrams::const_iterator iter;

    // iterate over ngram counts for every ngram order
    for (iter = ngram_matches[order].begin(); iter != ngram_matches[order].end(); ++iter) {
      ref_ngram_counts_iter = ref_ngram_counts.find(iter->first);
      if (iter->second > ref_ngram_counts_iter->second)
        ret_matches[order] += ref_ngram_counts_iter->second;
      else
        ret_matches[order] += iter->second;
    }
  }
}
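// The clipping loop above is equivalent to accumulating
//
//   ret_matches[order] += std::min(iter->second, ref_ngram_counts_iter->second);
//
// i.e. each distinct ngram contributes at most its reference frequency,
// matching the modified ngram precision of BLEU (Papineni et al, 2002).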
TargetPhraseVectorPtr PhraseDecoder::DecodeCollection(
  TargetPhraseVectorPtr tpv, BitWrapper<> &encodedBitStream,
  const Phrase &sourcePhrase, bool topLevel)
{
  bool extending = tpv->size();
  size_t bitsLeft = encodedBitStream.TellFromEnd();

  typedef std::pair<size_t, size_t> AlignPointSizeT;

  std::vector<int> sourceWords;
  if(m_coding == REnc) {
    for(size_t i = 0; i < sourcePhrase.GetSize(); i++) {
      std::string sourceWord = sourcePhrase.GetWord(i).GetString(*m_input, false);
      unsigned idx = GetSourceSymbolId(sourceWord);
      sourceWords.push_back(idx);
    }
  }

  unsigned phraseStopSymbol = 0;
  AlignPoint alignStopSymbol(-1, -1);

  std::vector<float> scores;
  std::set<AlignPointSizeT> alignment;

  enum DecodeState { New, Symbol, Score, Alignment, Add } state = New;

  size_t srcSize = sourcePhrase.GetSize();

  TargetPhrase* targetPhrase = NULL;
  while(encodedBitStream.TellFromEnd()) {
    if(state == New) {
      // Creating new TargetPhrase on the heap
      tpv->push_back(TargetPhrase(Output));
      targetPhrase = &tpv->back();

      targetPhrase->SetSourcePhrase(sourcePhrase);
      alignment.clear();
      scores.clear();

      state = Symbol;
    }

    if(state == Symbol) {
      unsigned symbol = m_symbolTree->Read(encodedBitStream);
      if(symbol == phraseStopSymbol) {
        state = Score;
      } else {
        if(m_coding == REnc) {
          std::string wordString;
          size_t type = GetREncType(symbol);

          if(type == 1) {
            // symbol encodes a plain target word
            unsigned decodedSymbol = DecodeREncSymbol1(symbol);
            wordString = GetTargetSymbol(decodedSymbol);
          } else if (type == 2) {
            // symbol encodes the rank-th translation of an explicit source position
            size_t rank = DecodeREncSymbol2Rank(symbol);
            size_t srcPos = DecodeREncSymbol2Position(symbol);

            if(srcPos >= sourceWords.size())
              return TargetPhraseVectorPtr();

            wordString = GetTargetSymbol(GetTranslation(sourceWords[srcPos], rank));
            if(m_phraseDictionary.m_useAlignmentInfo) {
              size_t trgPos = targetPhrase->GetSize();
              alignment.insert(AlignPoint(srcPos, trgPos));
            }
          } else if(type == 3) {
            // symbol encodes the rank-th translation of the source word at the current target position
            size_t rank = DecodeREncSymbol3(symbol);
            size_t srcPos = targetPhrase->GetSize();

            if(srcPos >= sourceWords.size())
              return TargetPhraseVectorPtr();

            wordString = GetTargetSymbol(GetTranslation(sourceWords[srcPos], rank));
            if(m_phraseDictionary.m_useAlignmentInfo) {
              size_t trgPos = srcPos;
              alignment.insert(AlignPoint(srcPos, trgPos));
            }
          }

          Word word;
          word.CreateFromString(Output, *m_output, wordString, false);
          targetPhrase->AddWord(word);
        } else if(m_coding == PREnc) {
          // if the symbol is just a word
          if(GetPREncType(symbol) == 1) {
            unsigned decodedSymbol = DecodePREncSymbol1(symbol);

            Word word;
            word.CreateFromString(Output, *m_output,
                                  GetTargetSymbol(decodedSymbol), false);
            targetPhrase->AddWord(word);
          }
          // if the symbol is a subphrase pointer
          else {
            int left = DecodePREncSymbol2Left(symbol);
            int right = DecodePREncSymbol2Right(symbol);
            unsigned rank = DecodePREncSymbol2Rank(symbol);

            int srcStart = left + targetPhrase->GetSize();
            int srcEnd   = srcSize - right - 1;

            // false positive consistency check
            if(0 > srcStart || srcStart > srcEnd || unsigned(srcEnd) >= srcSize)
              return TargetPhraseVectorPtr();

            // false positive consistency check
            if(m_maxRank && rank > m_maxRank)
              return TargetPhraseVectorPtr();

            // set subphrase by default to itself
            TargetPhraseVectorPtr subTpv = tpv;

            // if range smaller than source phrase retrieve subphrase
            if(unsigned(srcEnd - srcStart + 1) != srcSize) {
              Phrase subPhrase = sourcePhrase.GetSubString(WordsRange(srcStart, srcEnd));
              subTpv = CreateTargetPhraseCollection(subPhrase, false);
            }

            // false positive consistency check
            if(subTpv != NULL && rank < subTpv->size()) {
              // insert the subphrase into the main target phrase
              TargetPhrase& subTp = subTpv->at(rank);
              if(m_phraseDictionary.m_useAlignmentInfo) {
                // reconstruct the alignment data based on the alignment of the subphrase
                for(AlignmentInfo::const_iterator it = subTp.GetAlignmentInfo().begin();
                    it != subTp.GetAlignmentInfo().end(); it++) {
                  alignment.insert(AlignPointSizeT(srcStart + it->first,
                                                   targetPhrase->GetSize() + it->second));
                }
              }

              targetPhrase->Append(subTp);
            } else
              return TargetPhraseVectorPtr();
          }
        } else {
          Word word;
          word.CreateFromString(Output, *m_output, GetTargetSymbol(symbol), false);
          targetPhrase->AddWord(word);
        }
      }
    } else if(state == Score) {
      size_t idx = m_multipleScoreTrees ? scores.size() : 0;
      float score = m_scoreTrees[idx]->Read(encodedBitStream);
      scores.push_back(score);

      if(scores.size() == m_numScoreComponent) {
        targetPhrase->SetScore(m_feature, scores, ScoreComponentCollection() /*sparse*/,
                               *m_weight, m_weightWP, *m_languageModels);

        if(m_containsAlignmentInfo)
          state = Alignment;
        else
          state = Add;
      }
    } else if(state == Alignment) {
      AlignPoint alignPoint = m_alignTree->Read(encodedBitStream);
      if(alignPoint == alignStopSymbol) {
        state = Add;
      } else {
        if(m_phraseDictionary.m_useAlignmentInfo)
          alignment.insert(AlignPointSizeT(alignPoint));
      }
    }

    if(state == Add) {
      if(m_phraseDictionary.m_useAlignmentInfo)
        targetPhrase->SetAlignmentInfo(alignment);

      if(m_coding == PREnc) {
        if(!m_maxRank || tpv->size() <= m_maxRank)
          bitsLeft = encodedBitStream.TellFromEnd();

        if(!topLevel && m_maxRank && tpv->size() >= m_maxRank)
          break;
      }

      if(encodedBitStream.TellFromEnd() <= 8)
        break;

      state = New;
    }
  }

  if(m_coding == PREnc && !extending) {
    bitsLeft = bitsLeft > 8 ? bitsLeft : 0;
    m_decodingCache.Cache(sourcePhrase, tpv, bitsLeft, m_maxRank);
  }

  return tpv;
}
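// Worked example for the PREnc subphrase pointers decoded above: a type-2
// symbol stores (left, right, rank) as offsets relative to the current
// decoding position. With srcSize = 5, targetPhrase->GetSize() = 1,
// left = 1 and right = 1, the pointed-to source span is
// [srcStart, srcEnd] = [1 + 1, 5 - 1 - 1] = [2, 3]; the rank-th target
// phrase decoded for that sub-span is then appended, and its alignment
// points are shifted by srcStart on the source side and by the current
// target length on the target side.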