/*
 * Extend a previous Bleu state with the target phrase of cur_hypo and add the
 * resulting change in Bleu score (new score minus old score) to accumulator.
 *
 * Returns a newly allocated state:
 *  - a default BleuScoreState when the feature is disabled,
 *  - an unmodified copy of prev_state when the hypothesis adds no target words,
 *  - otherwise the copy updated with the new ngram statistics, lengths and
 *    the trailing word context.
 */
FFState* BleuScoreFeature::Evaluate(const Hypothesis& cur_hypo,
                                    const FFState* prev_state,
                                    ScoreComponentCollection* accumulator) const
{
  if (!m_enabled) {
    return new BleuScoreState();
  }

  const BleuScoreState& prev = dynamic_cast<const BleuScoreState&>(*prev_state);
  BleuScoreState* next = new BleuScoreState(prev);

  // Score of the state before any new words are taken into account.
  const float bleu_before = CalculateBleu(next);

  // Nothing appended: hand back the plain copy without touching the accumulator.
  const size_t added_words = cur_hypo.GetCurrTargetLength();
  if (added_words == 0) {
    return next;
  }

  // Stored context followed by the newly translated words.
  Phrase extended = prev.m_words;
  extended.Append(cur_hypo.GetCurrTargetPhrase());
  //cerr << "NW: " << extended << endl;

  // Count ngrams ending in the new words; the words carried over from the
  // previous state (next->m_words still holds them here) are skipped.
  GetNgramMatchCounts(extended,
                      m_cur_ref_ngrams,
                      next->m_ngram_counts,
                      next->m_ngram_matches,
                      next->m_words.GetSize());

  // Keep at most (bleu_order - 1) + 1 trailing positions as the new context.
  const size_t last_idx = extended.GetSize() - 1;
  const size_t context_len = BleuScoreState::bleu_order - 1;
  const size_t first_idx = (last_idx > context_len) ? (last_idx - context_len) : 0;

  WordsBitmap coverage = cur_hypo.GetWordsBitmap();
  next->m_source_length = coverage.GetNumWordsCovered();
  next->m_words = extended.GetSubString(WordsRange(first_idx, last_idx));
  next->m_target_length += cur_hypo.GetCurrTargetLength();

  // Reference length scaled by the fraction of source words covered so far,
  // so the partial translation is compared against a proportional reference.
  next->m_scaled_ref_length =
    m_cur_ref_length * (static_cast<float>(coverage.GetNumWordsCovered()) / coverage.GetSize());

  // The feature score is the Bleu delta caused by this extension.
  const float bleu_after = CalculateBleu(next);
  accumulator->PlusEquals(this, bleu_after - bleu_before);

  return next;
}
/**
 * Constructor: initialises the base class, then creates one InputPath for
 * every contiguous span of the input sentence.
 *
 * Paths are stored in m_inputPathMatrix indexed by their start position and
 * are also appended to m_phraseDictionaryQueue. Spans are created in order of
 * increasing length so that the path for [start, end-1] already exists when
 * the path for [start, end] is built and linked to it.
 */
TranslationOptionCollectionText::TranslationOptionCollectionText(Sentence const &input, size_t maxNoTransOptPerCoverage, float translationOptionThreshold)
  : TranslationOptionCollection(input, maxNoTransOptPerCoverage, translationOptionThreshold)
{
  const size_t numWords = input.GetSize();
  m_inputPathMatrix.resize(numWords);

  for (size_t spanLen = 1; spanLen <= numWords; ++spanLen) {
    // spanLen <= numWords here, so this subtraction cannot wrap.
    const size_t lastStart = numWords - spanLen;

    for (size_t first = 0; first <= lastStart; ++first) {
      const size_t last = first + spanLen - 1;
      WordsRange range(first, last);
      Phrase subphrase(input.GetSubString(range));
      const NonTerminalSet &labels = input.GetLabelSet(first, last);

      InputPath *path;
      if (range.GetNumWordsCovered() == 1) {
        // Single-word span: no shorter prefix path to link to.
        path = new InputPath(subphrase, labels, range, NULL, NULL);
      } else {
        // Link to the path covering the same span minus its last word.
        const InputPath &prefix = GetInputPath(first, last - 1);
        path = new InputPath(subphrase, labels, range, &prefix, NULL);
      }

      m_inputPathMatrix[first].push_back(path);
      m_phraseDictionaryQueue.push_back(path);
    }
  }
}
// score ngrams around the overlap of two previously scored phrases void BleuScoreFeature::GetNgramMatchCounts_overlap(Phrase& phrase, const NGrams& ref_ngram_counts, std::vector< size_t >& ret_counts, std::vector< size_t >& ret_matches, size_t overlap_index) const { NGrams::const_iterator ref_ngram_counts_iter; size_t ngram_start_idx, ngram_end_idx; // Chiang et al (2008) use unclipped counts of ngram matches for (size_t end_idx = overlap_index; end_idx < phrase.GetSize(); end_idx++) { if (end_idx >= (overlap_index+BleuScoreState::bleu_order-1)) break; for (size_t order = 0; order < BleuScoreState::bleu_order; order++) { if (order > end_idx) break; ngram_end_idx = end_idx; ngram_start_idx = end_idx - order; if (ngram_start_idx >= overlap_index) continue; // only score ngrams that span the overlap point Phrase ngram = phrase.GetSubString(WordsRange(ngram_start_idx, ngram_end_idx), 0); ret_counts[order]++; ref_ngram_counts_iter = ref_ngram_counts.find(ngram); if (ref_ngram_counts_iter != ref_ngram_counts.end()) ret_matches[order]++; } } }
/* * Given a phrase (current translation) calculate its ngram counts and * its ngram matches against the ngrams in the reference translation */ void BleuScoreFeature::GetNgramMatchCounts(Phrase& phrase, const NGrams& ref_ngram_counts, std::vector< size_t >& ret_counts, std::vector< size_t >& ret_matches, size_t skip_first) const { NGrams::const_iterator ref_ngram_counts_iter; size_t ngram_start_idx, ngram_end_idx; // Chiang et al (2008) use unclipped counts of ngram matches for (size_t end_idx = skip_first; end_idx < phrase.GetSize(); end_idx++) { for (size_t order = 0; order < BleuScoreState::bleu_order; order++) { if (order > end_idx) break; ngram_end_idx = end_idx; ngram_start_idx = end_idx - order; Phrase ngram = phrase.GetSubString(WordsRange(ngram_start_idx, ngram_end_idx), 0); ret_counts[order]++; ref_ngram_counts_iter = ref_ngram_counts.find(ngram); if (ref_ngram_counts_iter != ref_ngram_counts.end()) ret_matches[order]++; } } }
// score ngrams of words that have been added before the previous word span void BleuScoreFeature::GetNgramMatchCounts_prefix(Phrase& phrase, const NGrams& ref_ngram_counts, std::vector< size_t >& ret_counts, std::vector< size_t >& ret_matches, size_t new_start_indices, size_t last_end_index) const { NGrams::const_iterator ref_ngram_counts_iter; size_t ngram_start_idx, ngram_end_idx; // Chiang et al (2008) use unclipped counts of ngram matches for (size_t start_idx = 0; start_idx < new_start_indices; start_idx++) { for (size_t order = 0; order < BleuScoreState::bleu_order; order++) { ngram_start_idx = start_idx; ngram_end_idx = start_idx + order; if (order > ngram_end_idx) break; if (ngram_end_idx > last_end_index) break; Phrase ngram = phrase.GetSubString(WordsRange(ngram_start_idx, ngram_end_idx), 0); ret_counts[order]++; ref_ngram_counts_iter = ref_ngram_counts.find(ngram); if (ref_ngram_counts_iter != ref_ngram_counts.end()) ret_matches[order]++; } } }
void ChartParser::CreateInputPaths(const InputType &input) { size_t size = input.GetSize(); m_inputPathMatrix.resize(size); UTIL_THROW_IF2(input.GetType() != SentenceInput && input.GetType() != TreeInputType, "Input must be a sentence or a tree, not lattice or confusion networks"); for (size_t phaseSize = 1; phaseSize <= size; ++phaseSize) { for (size_t startPos = 0; startPos < size - phaseSize + 1; ++startPos) { size_t endPos = startPos + phaseSize -1; vector<InputPath*> &vec = m_inputPathMatrix[startPos]; WordsRange range(startPos, endPos); Phrase subphrase(input.GetSubString(WordsRange(startPos, endPos))); const NonTerminalSet &labels = input.GetLabelSet(startPos, endPos); InputPath *node; if (range.GetNumWordsCovered() == 1) { node = new InputPath(subphrase, labels, range, NULL, NULL); vec.push_back(node); } else { const InputPath &prevNode = GetInputPath(startPos, endPos - 1); node = new InputPath(subphrase, labels, range, &prevNode, NULL); vec.push_back(node); } //m_inputPathQueue.push_back(node); } } }
std::vector<float> LexicalReorderingTableMemory::GetScore(const Phrase& f, const Phrase& e, const Phrase& c) { //rather complicated because of const can't use []... as [] might enter new things into std::map //also can't have to be careful with words range if c is empty can't use c.GetSize()-1 will underflow and be large TableType::const_iterator r; std::string key; if(0 == c.GetSize()) { key = MakeKey(f,e,c); r = m_Table.find(key); if(m_Table.end() != r) { return r->second; } } else { //right try from large to smaller context for(size_t i = 0; i <= c.GetSize(); ++i) { Phrase sub_c(c.GetSubString(WordsRange(i,c.GetSize()-1))); key = MakeKey(f,e,sub_c); r = m_Table.find(key); if(m_Table.end() != r) { return r->second; } } } return Scores(); }
std::vector<float> LexicalReorderingTableCompact::GetScore(const Phrase& f, const Phrase& e, const Phrase& c) { std::string key; Scores scores; if(0 == c.GetSize()) key = MakeKey(f, e, c); else for(size_t i = 0; i <= c.GetSize(); ++i) { Phrase sub_c(c.GetSubString(WordsRange(i,c.GetSize()-1))); key = MakeKey(f,e,sub_c); } size_t index = m_hash[key]; if(m_hash.GetSize() != index) { std::string scoresString; if(m_inMemory) scoresString = m_scoresMemory[index]; else scoresString = m_scoresMapped[index]; BitWrapper<> bitStream(scoresString); for(size_t i = 0; i < m_numScoreComponent; i++) scores.push_back(m_scoreTrees[m_multipleScoreTrees ? i : 0]->Read(bitStream)); return scores; } return Scores(); }
// Return the target-side word range produced by `hypo` within this path.
// m_path is walked from its last element to its first while the target
// lengths of the visited hypotheses are accumulated. `hypo` must be one of
// the hypotheses on the path (compared by address); otherwise CHECK(false)
// fires.
WordsRange TrellisPath::GetTargetWordsRange(const Hypothesis &hypo) const
{
  size_t spanStart = 0;

  for (size_t remaining = m_path.size(); remaining > 0; --remaining) {
    const Hypothesis *edge = m_path[remaining - 1];
    const size_t spanEnd = spanStart + edge->GetCurrTargetLength() - 1;

    if (edge == &hypo) {
      return WordsRange(spanStart, spanEnd);
    }
    spanStart = spanEnd + 1;
  }

  // have to give a hypo in the trellis path, but u didn't.
  CHECK(false);
  return WordsRange(NOT_FOUND, NOT_FOUND);
}
void LexicalReorderingTableTree::Cache(const Sentence& input){ //only works with sentences... size_t prev_cache_size = m_Cache.size(); size_t max_phrase_length = input.GetSize(); for(size_t len = 0; len <= max_phrase_length; ++len){ for(size_t start = 0; start+len <= input.GetSize(); ++start){ Phrase f = input.GetSubString(WordsRange(start, start+len)); auxCacheForSrcPhrase(f); } } std::cerr << "Cached " << m_Cache.size() - prev_cache_size << " new primary reordering table keys\n"; }
/** Score due to flip. Again, left and right refer to order on the <emph>target</emph> side. */ void LexicalReorderingFeatureFunction::doFlipUpdate( const TranslationOption* leftOption, const TranslationOption* rightOption, const TargetGap& leftGap, const TargetGap& rightGap, FVector& scores) { if (leftGap.segment.GetEndPos() + 1 == rightGap.segment.GetStartPos()) { TargetGap gap(leftGap.leftHypo,rightGap.rightHypo, WordsRange(leftGap.segment.GetStartPos(),rightGap.segment.GetEndPos())); doContiguousPairedUpdate(leftOption,rightOption,gap,scores); } else { doDiscontiguousPairedUpdate(leftOption,rightOption,leftGap,rightGap,scores); } }
// Create the distortion state for the empty hypothesis.
// The state holds a fake "previously translated" source range: normally
// (NOT_FOUND, NOT_FOUND), but when the input already has a covered front span
// (can happen with --continue-partial-translation) the range is
// [0, m_frontSpanCoveredLength - 1]. The caller receives a newly allocated
// state object.
const FFState* DistortionScoreProducer::EmptyHypothesisState(const InputType &input) const
{
  if (input.m_frontSpanCoveredLength > 0) {
    const size_t lastCovered = input.m_frontSpanCoveredLength - 1;
    return new DistortionState_traditional(WordsRange(0, lastCovered), NOT_FOUND);
  }

  return new DistortionState_traditional(WordsRange(NOT_FOUND, NOT_FOUND), NOT_FOUND);
}
/** Score due to flip. Again, left and right refer to order on the <emph>target</emph> side. */ void DiscriminativeLMBigramFeatureFunction::doFlipUpdate(const TranslationOption* leftOption,const TranslationOption* rightOption, const TargetGap& leftGap, const TargetGap& rightGap, FVector& scores) { if (leftGap.segment.GetEndPos()+1 == rightGap.segment.GetStartPos()) { //contiguous Phrase gapPhrase(leftOption->GetTargetPhrase()); gapPhrase.Append(rightOption->GetTargetPhrase()); TargetGap gap(leftGap.leftHypo, rightGap.rightHypo, WordsRange(leftGap.segment.GetStartPos(), rightGap.segment.GetEndPos())); doUpdate(gapPhrase,gap,scores); } else { //discontiguous doUpdate(leftOption->GetTargetPhrase(), leftGap,scores); doUpdate(rightOption->GetTargetPhrase(), rightGap,scores); } }
void BleuScoreFeature::GetClippedNgramMatchesAndCounts(Phrase& phrase, const NGrams& ref_ngram_counts, std::vector< size_t >& ret_counts, std::vector< size_t >& ret_matches, size_t skip_first) const { NGrams::const_iterator ref_ngram_counts_iter; size_t ngram_start_idx, ngram_end_idx; Matches ngram_matches; for (size_t end_idx = skip_first; end_idx < phrase.GetSize(); end_idx++) { for (size_t order = 0; order < BleuScoreState::bleu_order; order++) { if (order > end_idx) break; ngram_end_idx = end_idx; ngram_start_idx = end_idx - order; Phrase ngram = phrase.GetSubString(WordsRange(ngram_start_idx, ngram_end_idx), 0); ret_counts[order]++; ref_ngram_counts_iter = ref_ngram_counts.find(ngram); if (ref_ngram_counts_iter != ref_ngram_counts.end()) { ngram_matches[order][ngram]++; } } } // clip ngram matches for (size_t order = 0; order < BleuScoreState::bleu_order; order++) { NGrams::const_iterator iter; // iterate over ngram counts for every ngram order for (iter=ngram_matches[order].begin(); iter != ngram_matches[order].end(); ++iter) { ref_ngram_counts_iter = ref_ngram_counts.find(iter->first); if (iter->second > ref_ngram_counts_iter->second) { ret_matches[order] += ref_ngram_counts_iter->second; } else { ret_matches[order] += iter->second; } } } }
void ChartRuleLookupManagerOnDisk::GetChartRuleCollection( const WordsRange &range, bool adhereTableLimit, ChartTranslationOptionList &outColl) { const StaticData &staticData = StaticData::Instance(); size_t rulesLimit = staticData.GetRuleLimit(); size_t relEndPos = range.GetEndPos() - range.GetStartPos(); size_t absEndPos = range.GetEndPos(); // MAIN LOOP. create list of nodes of target phrases DottedRuleStackOnDisk &expandableDottedRuleList = *m_expandableDottedRuleListVec[range.GetStartPos()]; // sort save nodes so only do nodes with most counts expandableDottedRuleList.SortSavedNodes(); const DottedRuleStackOnDisk::SavedNodeColl &savedNodeColl = expandableDottedRuleList.GetSavedNodeColl(); //cerr << "savedNodeColl=" << savedNodeColl.size() << " "; const ChartCellLabel &sourceWordLabel = GetCellCollection().Get(WordsRange(absEndPos, absEndPos)).GetSourceWordLabel(); for (size_t ind = 0; ind < (savedNodeColl.size()) ; ++ind) { const SavedNodeOnDisk &savedNode = *savedNodeColl[ind]; const DottedRuleOnDisk &prevDottedRule = savedNode.GetDottedRule(); const OnDiskPt::PhraseNode &prevNode = prevDottedRule.GetLastNode(); size_t startPos = prevDottedRule.IsRoot() ? 
range.GetStartPos() : prevDottedRule.GetWordsRange().GetEndPos() + 1; // search for terminal symbol if (startPos == absEndPos) { OnDiskPt::Word *sourceWordBerkeleyDb = m_dbWrapper.ConvertFromMoses(Input, m_inputFactorsVec, sourceWordLabel.GetLabel()); if (sourceWordBerkeleyDb != NULL) { const OnDiskPt::PhraseNode *node = prevNode.GetChild(*sourceWordBerkeleyDb, m_dbWrapper); if (node != NULL) { // TODO figure out why source word is needed from node, not from sentence // prob to do with factors or non-term //const Word &sourceWord = node->GetSourceWord(); DottedRuleOnDisk *dottedRule = new DottedRuleOnDisk(*node, sourceWordLabel, prevDottedRule); expandableDottedRuleList.Add(relEndPos+1, dottedRule); // cache for cleanup m_sourcePhraseNode.push_back(node); } delete sourceWordBerkeleyDb; } } // search for non-terminals size_t endPos, stackInd; if (startPos > absEndPos) continue; else if (startPos == range.GetStartPos() && range.GetEndPos() > range.GetStartPos()) { // start. endPos = absEndPos - 1; stackInd = relEndPos; } else { endPos = absEndPos; stackInd = relEndPos + 1; } // size_t nonTermNumWordsCovered = endPos - startPos + 1; // get target nonterminals in this span from chart const ChartCellLabelSet &chartNonTermSet = GetCellCollection().Get(WordsRange(startPos, endPos)).GetTargetLabelSet(); //const Word &defaultSourceNonTerm = staticData.GetInputDefaultNonTerminal() // ,&defaultTargetNonTerm = staticData.GetOutputDefaultNonTerminal(); // go through each SOURCE lhs const NonTerminalSet &sourceLHSSet = GetSentence().GetLabelSet(startPos, endPos); NonTerminalSet::const_iterator iterSourceLHS; for (iterSourceLHS = sourceLHSSet.begin(); iterSourceLHS != sourceLHSSet.end(); ++iterSourceLHS) { const Word &sourceLHS = *iterSourceLHS; OnDiskPt::Word *sourceLHSBerkeleyDb = m_dbWrapper.ConvertFromMoses(Input, m_inputFactorsVec, sourceLHS); if (sourceLHSBerkeleyDb == NULL) { delete sourceLHSBerkeleyDb; continue; // vocab not in pt. 
node definately won't be in there } const OnDiskPt::PhraseNode *sourceNode = prevNode.GetChild(*sourceLHSBerkeleyDb, m_dbWrapper); delete sourceLHSBerkeleyDb; if (sourceNode == NULL) continue; // didn't find source node // go through each TARGET lhs ChartCellLabelSet::const_iterator iterChartNonTerm; for (iterChartNonTerm = chartNonTermSet.begin(); iterChartNonTerm != chartNonTermSet.end(); ++iterChartNonTerm) { const ChartCellLabel &cellLabel = *iterChartNonTerm; //cerr << sourceLHS << " " << defaultSourceNonTerm << " " << chartNonTerm << " " << defaultTargetNonTerm << endl; //bool isSyntaxNonTerm = (sourceLHS != defaultSourceNonTerm) || (chartNonTerm != defaultTargetNonTerm); bool doSearch = true; //isSyntaxNonTerm ? nonTermNumWordsCovered <= maxSyntaxSpan : // nonTermNumWordsCovered <= maxDefaultSpan; if (doSearch) { OnDiskPt::Word *chartNonTermBerkeleyDb = m_dbWrapper.ConvertFromMoses(Output, m_outputFactorsVec, cellLabel.GetLabel()); if (chartNonTermBerkeleyDb == NULL) continue; const OnDiskPt::PhraseNode *node = sourceNode->GetChild(*chartNonTermBerkeleyDb, m_dbWrapper); delete chartNonTermBerkeleyDb; if (node == NULL) continue; // found matching entry //const Word &sourceWord = node->GetSourceWord(); DottedRuleOnDisk *dottedRule = new DottedRuleOnDisk(*node, cellLabel, prevDottedRule); expandableDottedRuleList.Add(stackInd, dottedRule); m_sourcePhraseNode.push_back(node); } } // for (iterChartNonTerm delete sourceNode; } // for (iterLabelListf // return list of target phrases DottedRuleCollOnDisk &nodes = expandableDottedRuleList.Get(relEndPos + 1); // source LHS DottedRuleCollOnDisk::const_iterator iterDottedRuleColl; for (iterDottedRuleColl = nodes.begin(); iterDottedRuleColl != nodes.end(); ++iterDottedRuleColl) { // node of last source word const DottedRuleOnDisk &prevDottedRule = **iterDottedRuleColl; if (prevDottedRule.Done()) continue; prevDottedRule.Done(true); const OnDiskPt::PhraseNode &prevNode = prevDottedRule.GetLastNode(); //get node for each 
source LHS const NonTerminalSet &lhsSet = GetSentence().GetLabelSet(range.GetStartPos(), range.GetEndPos()); NonTerminalSet::const_iterator iterLabelSet; for (iterLabelSet = lhsSet.begin(); iterLabelSet != lhsSet.end(); ++iterLabelSet) { const Word &sourceLHS = *iterLabelSet; OnDiskPt::Word *sourceLHSBerkeleyDb = m_dbWrapper.ConvertFromMoses(Input, m_inputFactorsVec, sourceLHS); if (sourceLHSBerkeleyDb == NULL) continue; const TargetPhraseCollection *targetPhraseCollection = NULL; const OnDiskPt::PhraseNode *node = prevNode.GetChild(*sourceLHSBerkeleyDb, m_dbWrapper); if (node) { UINT64 tpCollFilePos = node->GetValue(); std::map<UINT64, const TargetPhraseCollection*>::const_iterator iterCache = m_cache.find(tpCollFilePos); if (iterCache == m_cache.end()) { const OnDiskPt::TargetPhraseCollection *tpcollBerkeleyDb = node->GetTargetPhraseCollection(m_dictionary.GetTableLimit(), m_dbWrapper); targetPhraseCollection = tpcollBerkeleyDb->ConvertToMoses(m_inputFactorsVec ,m_outputFactorsVec ,m_dictionary ,m_weight ,m_wpProducer ,*m_languageModels ,m_filePath , m_dbWrapper.GetVocab()); delete tpcollBerkeleyDb; m_cache[tpCollFilePos] = targetPhraseCollection; } else { // just get out of cache targetPhraseCollection = iterCache->second; } assert(targetPhraseCollection); if (!targetPhraseCollection->IsEmpty()) { outColl.Add(*targetPhraseCollection, prevDottedRule, GetCellCollection(), adhereTableLimit, rulesLimit); } } // if (node) delete node; delete sourceLHSBerkeleyDb; } } } // for (size_t ind = 0; ind < savedNodeColl.size(); ++ind) outColl.CreateChartRules(rulesLimit); //cerr << numDerivations << " "; }
int Sentence::Read(std::istream& in,const std::vector<FactorType>& factorOrder) { // const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter(); std::string line; std::map<std::string, std::string> meta; if (getline(in, line, '\n').eof()) return 0; //get covered words - if continual-partial-translation is switched on, parse input const StaticData &staticData = StaticData::Instance(); m_frontSpanCoveredLength = 0; m_sourceCompleted.resize(0); if (staticData.ContinuePartialTranslation()) { string initialTargetPhrase; string sourceCompletedStr; int loc1 = line.find( "|||", 0 ); int loc2 = line.find( "|||", loc1 + 3 ); if (loc1 > -1 && loc2 > -1) { initialTargetPhrase = line.substr(0, loc1); sourceCompletedStr = line.substr(loc1 + 3, loc2 - loc1 - 3); line = line.substr(loc2 + 3); sourceCompletedStr = Trim(sourceCompletedStr); initialTargetPhrase = Trim(initialTargetPhrase); m_initialTargetPhrase = initialTargetPhrase; int len = sourceCompletedStr.size(); m_sourceCompleted.resize(len); int contiguous = 1; for (int i = 0; i < len; ++i) { if (sourceCompletedStr.at(i) == '1') { m_sourceCompleted[i] = true; if (contiguous) m_frontSpanCoveredLength ++; } else { m_sourceCompleted[i] = false; contiguous = 0; } } } } // remove extra spaces line = Trim(line); // if sentences is specified as "<seg id=1> ... 
</seg>", extract id meta = ProcessAndStripSGML(line); if (meta.find("id") != meta.end()) { this->SetTranslationId(atol(meta["id"].c_str())); } if (meta.find("docid") != meta.end()) { this->SetDocumentId(atol(meta["docid"].c_str())); this->SetUseTopicId(false); this->SetUseTopicIdAndProb(false); } if (meta.find("topic") != meta.end()) { vector<string> topic_params; boost::split(topic_params, meta["topic"], boost::is_any_of("\t ")); if (topic_params.size() == 1) { this->SetTopicId(atol(topic_params[0].c_str())); this->SetUseTopicId(true); this->SetUseTopicIdAndProb(false); } else { this->SetTopicIdAndProb(topic_params); this->SetUseTopicId(false); this->SetUseTopicIdAndProb(true); } } if (meta.find("weight-setting") != meta.end()) { this->SetWeightSetting(meta["weight-setting"]); this->SetSpecifiesWeightSetting(true); staticData.SetWeightSetting(meta["weight-setting"]); } else { this->SetSpecifiesWeightSetting(false); } // parse XML markup in translation line //const StaticData &staticData = StaticData::Instance(); std::vector< size_t > xmlWalls; std::vector< std::pair<size_t, std::string> > placeholders; if (staticData.GetXmlInputType() != XmlPassThrough) { int offset = 0; if (staticData.IsChart()) { offset = 1; } if (!ProcessAndStripXMLTags(line, m_xmlOptions, m_reorderingConstraint, xmlWalls, placeholders, offset, staticData.GetXmlBrackets().first, staticData.GetXmlBrackets().second)) { const string msg("Unable to parse XML in line: " + line); TRACE_ERR(msg << endl); throw runtime_error(msg); } } // Phrase::CreateFromString(Input, factorOrder, line, factorDelimiter, NULL); Phrase::CreateFromString(Input, factorOrder, line, NULL); // placeholders ProcessPlaceholders(placeholders); if (staticData.IsChart()) { InitStartEndWord(); } //now that we have final word positions in phrase (from CreateFromString), //we can make input phrase objects to go with our XmlOptions and create TranslationOptions //only fill the vector if we are parsing XML if 
(staticData.GetXmlInputType() != XmlPassThrough ) { for (size_t i=0; i<GetSize(); i++) { m_xmlCoverageMap.push_back(false); } //iterXMLOpts will be empty for XmlIgnore //look at each column for(std::vector<XmlOption*>::const_iterator iterXmlOpts = m_xmlOptions.begin(); iterXmlOpts != m_xmlOptions.end(); iterXmlOpts++) { const XmlOption *xmlOption = *iterXmlOpts; const WordsRange &range = xmlOption->range; for(size_t j=range.GetStartPos(); j<=range.GetEndPos(); j++) { m_xmlCoverageMap[j]=true; } } } // reordering walls and zones m_reorderingConstraint.InitializeWalls( GetSize() ); // set reordering walls, if "-monotone-at-punction" is set if (staticData.UseReorderingConstraint() && GetSize()>0) { m_reorderingConstraint.SetMonotoneAtPunctuation( GetSubString( WordsRange(0,GetSize()-1 ) ) ); } // set walls obtained from xml for(size_t i=0; i<xmlWalls.size(); i++) if( xmlWalls[i] < GetSize() ) // no buggy walls, please m_reorderingConstraint.SetWall( xmlWalls[i], true ); m_reorderingConstraint.FinalizeWalls(); return 1; }
void ChartRuleLookupManagerMemory::GetChartRuleCollection( const WordsRange &range, ChartTranslationOptionList &outColl) { size_t relEndPos = range.GetEndPos() - range.GetStartPos(); size_t absEndPos = range.GetEndPos(); // MAIN LOOP. create list of nodes of target phrases // get list of all rules that apply to spans at same starting position DottedRuleColl &dottedRuleCol = *m_dottedRuleColls[range.GetStartPos()]; const DottedRuleList &expandableDottedRuleList = dottedRuleCol.GetExpandableDottedRuleList(); const ChartCellLabel &sourceWordLabel = GetCellCollection().Get(WordsRange(absEndPos, absEndPos)).GetSourceWordLabel(); // loop through the rules // (note that expandableDottedRuleList can be expanded as the loop runs // through calls to ExtendPartialRuleApplication()) for (size_t ind = 0; ind < expandableDottedRuleList.size(); ++ind) { // rule we are about to extend const DottedRuleInMemory &prevDottedRule = *expandableDottedRuleList[ind]; // we will now try to extend it, starting after where it ended size_t startPos = prevDottedRule.IsRoot() ? 
range.GetStartPos() : prevDottedRule.GetWordsRange().GetEndPos() + 1; // search for terminal symbol // (if only one more word position needs to be covered) if (startPos == absEndPos) { // look up in rule dictionary, if the current rule can be extended // with the source word in the last position const Word &sourceWord = sourceWordLabel.GetLabel(); const PhraseDictionaryNodeSCFG *node = prevDottedRule.GetLastNode().GetChild(sourceWord); // if we found a new rule -> create it and add it to the list if (node != NULL) { // create the rule #ifdef USE_BOOST_POOL DottedRuleInMemory *dottedRule = m_dottedRulePool.malloc(); new (dottedRule) DottedRuleInMemory(*node, sourceWordLabel, prevDottedRule); #else DottedRuleInMemory *dottedRule = new DottedRuleInMemory(*node, sourceWordLabel, prevDottedRule); #endif dottedRuleCol.Add(relEndPos+1, dottedRule); } } // search for non-terminals size_t endPos, stackInd; // span is already complete covered? nothing can be done if (startPos > absEndPos) continue; else if (startPos == range.GetStartPos() && range.GetEndPos() > range.GetStartPos()) { // We're at the root of the prefix tree so won't try to cover the full // span (i.e. we don't allow non-lexical unary rules). However, we need // to match non-unary rules that begin with a non-terminal child, so we // do that in two steps: during this iteration we search for non-terminals // that cover all but the last source word in the span (there won't // already be running nodes for these because that would have required a // non-lexical unary rule match for an earlier span). Any matches will // result in running nodes being appended to the list and on subsequent // iterations (for this same span), we'll extend them to cover the final // word. 
endPos = absEndPos - 1; stackInd = relEndPos; } else { endPos = absEndPos; stackInd = relEndPos + 1; } ExtendPartialRuleApplication(prevDottedRule, startPos, endPos, stackInd, dottedRuleCol); } // list of rules that that cover the entire span DottedRuleList &rules = dottedRuleCol.Get(relEndPos + 1); // look up target sides for the rules DottedRuleList::const_iterator iterRule; for (iterRule = rules.begin(); iterRule != rules.end(); ++iterRule) { const DottedRuleInMemory &dottedRule = **iterRule; const PhraseDictionaryNodeSCFG &node = dottedRule.GetLastNode(); // look up target sides const TargetPhraseCollection *tpc = node.GetTargetPhraseCollection(); // add the fully expanded rule (with lexical target side) if (tpc != NULL) { AddCompletedRule(dottedRule, *tpc, range, outColl); } } dottedRuleCol.Clear(relEndPos+1); outColl.ShrinkToLimit(); }
// Given a partial rule application ending at startPos-1 and given the sets of // source and target non-terminals covering the span [startPos, endPos], // determines the full or partial rule applications that can be produced through // extending the current rule application by a single non-terminal. void ChartRuleLookupManagerMemory::ExtendPartialRuleApplication( const DottedRuleInMemory &prevDottedRule, size_t startPos, size_t endPos, size_t stackInd, DottedRuleColl & dottedRuleColl) { // source non-terminal labels for the remainder const NonTerminalSet &sourceNonTerms = GetSentence().GetLabelSet(startPos, endPos); // target non-terminal labels for the remainder const ChartCellLabelSet &targetNonTerms = GetCellCollection().Get(WordsRange(startPos, endPos)).GetTargetLabelSet(); // note where it was found in the prefix tree of the rule dictionary const PhraseDictionaryNodeSCFG &node = prevDottedRule.GetLastNode(); const PhraseDictionaryNodeSCFG::NonTerminalMap & nonTermMap = node.GetNonTerminalMap(); const size_t numChildren = nonTermMap.size(); if (numChildren == 0) { return; } const size_t numSourceNonTerms = sourceNonTerms.size(); const size_t numTargetNonTerms = targetNonTerms.GetSize(); const size_t numCombinations = numSourceNonTerms * numTargetNonTerms; // We can search by either: // 1. Enumerating all possible source-target NT pairs that are valid for // the span and then searching for matching children in the node, // or // 2. Iterating over all the NT children in the node, searching // for each source and target NT in the span's sets. 
// We'll do whichever minimises the number of lookups: if (numCombinations <= numChildren*2) { // loop over possible source non-terminal labels (as found in input tree) NonTerminalSet::const_iterator p = sourceNonTerms.begin(); NonTerminalSet::const_iterator sEnd = sourceNonTerms.end(); for (; p != sEnd; ++p) { const Word & sourceNonTerm = *p; // loop over possible target non-terminal labels (as found in chart) ChartCellLabelSet::const_iterator q = targetNonTerms.begin(); ChartCellLabelSet::const_iterator tEnd = targetNonTerms.end(); for (; q != tEnd; ++q) { const ChartCellLabel &cellLabel = q->second; // try to match both source and target non-terminal const PhraseDictionaryNodeSCFG * child = node.GetChild(sourceNonTerm, cellLabel.GetLabel()); // nothing found? then we are done if (child == NULL) { continue; } // create new rule #ifdef USE_BOOST_POOL DottedRuleInMemory *rule = m_dottedRulePool.malloc(); new (rule) DottedRuleInMemory(*child, cellLabel, prevDottedRule); #else DottedRuleInMemory *rule = new DottedRuleInMemory(*child, cellLabel, prevDottedRule); #endif dottedRuleColl.Add(stackInd, rule); } } } else { // loop over possible expansions of the rule PhraseDictionaryNodeSCFG::NonTerminalMap::const_iterator p; PhraseDictionaryNodeSCFG::NonTerminalMap::const_iterator end = nonTermMap.end(); for (p = nonTermMap.begin(); p != end; ++p) { // does it match possible source and target non-terminals? 
const PhraseDictionaryNodeSCFG::NonTerminalMapKey &key = p->first; const Word &sourceNonTerm = key.first; if (sourceNonTerms.find(sourceNonTerm) == sourceNonTerms.end()) { continue; } const Word &targetNonTerm = key.second; const ChartCellLabel *cellLabel = targetNonTerms.Find(targetNonTerm); if (!cellLabel) { continue; } // create new rule const PhraseDictionaryNodeSCFG &child = p->second; #ifdef USE_BOOST_POOL DottedRuleInMemory *rule = m_dottedRulePool.malloc(); new (rule) DottedRuleInMemory(child, *cellLabel, prevDottedRule); #else DottedRuleInMemory *rule = new DottedRuleInMemory(child, *cellLabel, prevDottedRule); #endif dottedRuleColl.Add(stackInd, rule); } } }
void SparseHieroReorderingFeature::EvaluateChart( const ChartHypothesis& cur_hypo , ScoreComponentCollection* accumulator) const { // get index map for underlying hypotheses //const AlignmentInfo::NonTermIndexMap &nonTermIndexMap = // cur_hypo.GetCurrTargetPhrase().GetAlignNonTerm().GetNonTermIndexMap(); //The Huck features. For a rule with source side: // abXcdXef //We first have to split into blocks: // ab X cd X ef //Then we extract features based in the boundary words of the neighbouring blocks //For the block pair, we use the right word of the left block, and the left //word of the right block. //Need to get blocks, and their alignment. Each block has a word range (on the // on the source), a non-terminal flag, and a set of alignment points in the target phrase //We need to be able to map source word position to target word position, as //much as possible (don't need interior of non-terminals). The alignment info //objects just give us the mappings between *rule* positions. So if we can //map source word position to source rule position, and target rule position //to target word position, then we can map right through. size_t sourceStart = cur_hypo.GetCurrSourceRange().GetStartPos(); size_t sourceSize = cur_hypo.GetCurrSourceRange().GetNumWordsCovered(); vector<WordsRange> sourceNTSpans; for (size_t prevHypoId = 0; prevHypoId < cur_hypo.GetPrevHypos().size(); ++prevHypoId) { sourceNTSpans.push_back(cur_hypo.GetPrevHypo(prevHypoId)->GetCurrSourceRange()); } //put in source order. Is this necessary? 
sort(sourceNTSpans.begin(), sourceNTSpans.end()); //cerr << "Source NTs: "; //for (size_t i = 0; i < sourceNTSpans.size(); ++i) cerr << sourceNTSpans[i] << " "; //cerr << endl; typedef pair<WordsRange,bool> Block;//flag indicates NT vector<Block> sourceBlocks; sourceBlocks.push_back(Block(cur_hypo.GetCurrSourceRange(),false)); for (vector<WordsRange>::const_iterator i = sourceNTSpans.begin(); i != sourceNTSpans.end(); ++i) { const WordsRange& prevHypoRange = *i; Block lastBlock = sourceBlocks.back(); sourceBlocks.pop_back(); //split this range into before NT, NT and after NT if (prevHypoRange.GetStartPos() > lastBlock.first.GetStartPos()) { sourceBlocks.push_back(Block(WordsRange(lastBlock.first.GetStartPos(),prevHypoRange.GetStartPos()-1),false)); } sourceBlocks.push_back(Block(prevHypoRange,true)); if (prevHypoRange.GetEndPos() < lastBlock.first.GetEndPos()) { sourceBlocks.push_back(Block(WordsRange(prevHypoRange.GetEndPos()+1,lastBlock.first.GetEndPos()), false)); } } /* cerr << "Source Blocks: "; for (size_t i = 0; i < sourceBlocks.size(); ++i) cerr << sourceBlocks[i].first << " " << (sourceBlocks[i].second ? 
"NT" : "T") << " "; cerr << endl; */ //Mapping from source word to target rule position vector<size_t> sourceWordToTargetRulePos(sourceSize); map<size_t,size_t> alignMap; alignMap.insert( cur_hypo.GetCurrTargetPhrase().GetAlignTerm().begin(), cur_hypo.GetCurrTargetPhrase().GetAlignTerm().end()); alignMap.insert( cur_hypo.GetCurrTargetPhrase().GetAlignNonTerm().begin(), cur_hypo.GetCurrTargetPhrase().GetAlignNonTerm().end()); //vector<size_t> alignMapTerm = cur_hypo.GetCurrTargetPhrase().GetAlignNonTerm() size_t sourceRulePos = 0; //cerr << "SW->RP "; for (vector<Block>::const_iterator sourceBlockIt = sourceBlocks.begin(); sourceBlockIt != sourceBlocks.end(); ++sourceBlockIt) { for (size_t sourceWordPos = sourceBlockIt->first.GetStartPos(); sourceWordPos <= sourceBlockIt->first.GetEndPos(); ++sourceWordPos) { sourceWordToTargetRulePos[sourceWordPos - sourceStart] = alignMap[sourceRulePos]; // cerr << sourceWordPos - sourceStart << "-" << alignMap[sourceRulePos] << " "; if (! sourceBlockIt->second) { //T ++sourceRulePos; } } if ( sourceBlockIt->second) { //NT ++sourceRulePos; } } //cerr << endl; //Iterate through block pairs const Sentence& sentence = dynamic_cast<const Sentence&>(cur_hypo.GetManager().GetSource()); //const TargetPhrase& targetPhrase = cur_hypo.GetCurrTargetPhrase(); for (size_t i = 0; i < sourceBlocks.size()-1; ++i) { Block& leftSourceBlock = sourceBlocks[i]; Block& rightSourceBlock = sourceBlocks[i+1]; size_t sourceLeftBoundaryPos = leftSourceBlock.first.GetEndPos(); size_t sourceRightBoundaryPos = rightSourceBlock.first.GetStartPos(); const Word& sourceLeftBoundaryWord = sentence.GetWord(sourceLeftBoundaryPos); const Word& sourceRightBoundaryWord = sentence.GetWord(sourceRightBoundaryPos); sourceLeftBoundaryPos -= sourceStart; sourceRightBoundaryPos -= sourceStart; // Need to figure out where these map to on the target. 
size_t targetLeftRulePos = sourceWordToTargetRulePos[sourceLeftBoundaryPos]; size_t targetRightRulePos = sourceWordToTargetRulePos[sourceRightBoundaryPos]; bool isMonotone = true; if ((sourceLeftBoundaryPos < sourceRightBoundaryPos && targetLeftRulePos > targetRightRulePos) || ((sourceLeftBoundaryPos > sourceRightBoundaryPos && targetLeftRulePos < targetRightRulePos))) { isMonotone = false; } stringstream buf; buf << "h_"; //sparse reordering, Huck if (m_type == SourceLeft || m_type == SourceCombined) { buf << GetFactor(sourceLeftBoundaryWord,m_sourceVocab,m_sourceFactor)->GetString(); buf << "_"; } if (m_type == SourceRight || m_type == SourceCombined) { buf << GetFactor(sourceRightBoundaryWord,m_sourceVocab,m_sourceFactor)->GetString(); buf << "_"; } buf << (isMonotone ? "M" : "S"); accumulator->PlusEquals(this,buf.str(), 1); } // cerr << endl; }
void ChartRuleLookupManagerMemory::GetChartRuleCollection( const WordsRange &range, bool adhereTableLimit, ChartTranslationOptionList &outColl) { size_t relEndPos = range.GetEndPos() - range.GetStartPos(); size_t absEndPos = range.GetEndPos(); // MAIN LOOP. create list of nodes of target phrases ProcessedRuleColl &processedRuleCol = *m_processedRuleColls[range.GetStartPos()]; const ProcessedRuleList &runningNodes = processedRuleCol.GetRunningNodes(); // Note that runningNodes can be expanded as the loop runs (through calls to // ExtendPartialRuleApplication()). for (size_t ind = 0; ind < runningNodes.size(); ++ind) { const ProcessedRule &prevProcessedRule = *runningNodes[ind]; const PhraseDictionaryNodeSCFG &prevNode = prevProcessedRule.GetLastNode(); const WordConsumed *prevWordConsumed = prevProcessedRule.GetLastWordConsumed(); size_t startPos = (prevWordConsumed == NULL) ? range.GetStartPos() : prevWordConsumed->GetWordsRange().GetEndPos() + 1; // search for terminal symbol if (startPos == absEndPos) { const Word &sourceWord = GetSentence().GetWord(absEndPos); const PhraseDictionaryNodeSCFG *node = prevNode.GetChild(sourceWord); if (node != NULL) { WordConsumed *newWordConsumed = new WordConsumed(absEndPos, absEndPos , sourceWord , prevWordConsumed); ProcessedRule *processedRule = new ProcessedRule(*node, newWordConsumed); processedRuleCol.Add(relEndPos+1, processedRule); } } // search for non-terminals size_t endPos, stackInd; if (startPos > absEndPos) continue; else if (startPos == range.GetStartPos() && range.GetEndPos() > range.GetStartPos()) { // start. 
endPos = absEndPos - 1; stackInd = relEndPos; } else { endPos = absEndPos; stackInd = relEndPos + 1; } const NonTerminalSet &sourceNonTerms = GetSentence().GetLabelSet(startPos, endPos); const NonTerminalSet &targetNonTerms = GetCellCollection().GetHeadwords(WordsRange(startPos, endPos)); ExtendPartialRuleApplication(prevNode, prevWordConsumed, startPos, endPos, stackInd, sourceNonTerms, targetNonTerms, processedRuleCol); } // return list of target phrases ProcessedRuleList &nodes = processedRuleCol.Get(relEndPos + 1); size_t rulesLimit = StaticData::Instance().GetRuleLimit(); ProcessedRuleList::const_iterator iterNode; for (iterNode = nodes.begin(); iterNode != nodes.end(); ++iterNode) { const ProcessedRule &processedRule = **iterNode; const PhraseDictionaryNodeSCFG &node = processedRule.GetLastNode(); const WordConsumed *wordConsumed = processedRule.GetLastWordConsumed(); assert(wordConsumed); const TargetPhraseCollection *targetPhraseCollection = node.GetTargetPhraseCollection(); if (targetPhraseCollection != NULL) { outColl.Add(*targetPhraseCollection, *wordConsumed, adhereTableLimit, rulesLimit); } } outColl.CreateChartRules(rulesLimit); }
/** * Process a sentence with xml annotation * Xml tags may specifiy additional/replacing translation options * and reordering constraints * * \param line in: sentence, out: sentence without the xml * \param res vector with translation options specified by xml * \param reorderingConstraint reordering constraint zones specified by xml * \param walls reordering constraint walls specified by xml */ bool TreeInput::ProcessAndStripXMLTags(string &line, std::vector<XMLParseOutput> &sourceLabels, std::vector<XmlOption*> &xmlOptions) { //parse XML markup in translation line // no xml tag? we're done. if (line.find_first_of('<') == string::npos) { return true; } // break up input into a vector of xml tags and text // example: (this), (<b>), (is a), (</b>), (test .) vector<string> xmlTokens = TokenizeXml(line); // we need to store opened tags, until they are closed // tags are stored as tripled (tagname, startpos, contents) typedef pair< string, pair< size_t, string > > OpenedTag; vector< OpenedTag > tagStack; // stack that contains active opened tags string cleanLine; // return string (text without xml) size_t wordPos = 0; // position in sentence (in terms of number of words) // keep this handy for later const vector<FactorType> &outputFactorOrder = StaticData::Instance().GetOutputFactorOrder(); const string &factorDelimiter = StaticData::Instance().GetFactorDelimiter(); // loop through the tokens for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) { // not a xml tag, but regular text (may contain many words) if(!isXmlTag(xmlTokens[xmlTokenPos])) { // add a space at boundary, if necessary if (cleanLine.size()>0 && cleanLine[cleanLine.size() - 1] != ' ' && xmlTokens[xmlTokenPos][0] != ' ') { cleanLine += " "; } cleanLine += xmlTokens[xmlTokenPos]; // add to output wordPos = Tokenize(cleanLine).size(); // count all the words } // process xml tag else { // *** get essential information about tag *** // strip extra boundary spaces and "<" and ">" 
string tag = Trim(TrimXml(xmlTokens[xmlTokenPos])); VERBOSE(3,"XML TAG IS: " << tag << std::endl); if (tag.size() == 0) { TRACE_ERR("ERROR: empty tag name: " << line << endl); return false; } // check if unary (e.g., "<wall/>") bool isUnary = ( tag[tag.size() - 1] == '/' ); // check if opening tag (e.g. "<a>", not "</a>")g bool isClosed = ( tag[0] == '/' ); bool isOpen = !isClosed; if (isClosed && isUnary) { TRACE_ERR("ERROR: can't have both closed and unary tag <" << tag << ">: " << line << endl); return false; } if (isClosed) tag = tag.substr(1); // remove "/" at the beginning if (isUnary) tag = tag.substr(0,tag.size()-1); // remove "/" at the end // find the tag name and contents string::size_type endOfName = tag.find_first_of(' '); string tagName = tag; string tagContent = ""; if (endOfName != string::npos) { tagName = tag.substr(0,endOfName); tagContent = tag.substr(endOfName+1); } // *** process new tag *** if (isOpen || isUnary) { // put the tag on the tag stack OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) ); tagStack.push_back( openedTag ); VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl); } // *** process completed tag *** if (isClosed || isUnary) { // pop last opened tag from stack; if (tagStack.size() == 0) { TRACE_ERR("ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl); return false; } OpenedTag openedTag = tagStack.back(); tagStack.pop_back(); // tag names have to match if (openedTag.first != tagName) { TRACE_ERR("ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl ); return false; } // assemble remaining information about tag size_t startPos = openedTag.second.first; string tagContent = openedTag.second.second; size_t endPos = wordPos; // span attribute overwrites position string span = ParseXmlTagAttribute(tagContent,"span"); if (! 
span.empty()) { vector<string> ij = Tokenize(span, "-"); if (ij.size() != 1 && ij.size() != 2) { TRACE_ERR("ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl); return false; } startPos = atoi(ij[0].c_str()); if (ij.size() == 1) endPos = startPos + 1; else endPos = atoi(ij[1].c_str()) + 1; } VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl); if (startPos >= endPos) { TRACE_ERR("ERROR: tag " << tagName << " must span at least one word: " << line << endl); return false; } // may be either a input span label ("label"), or a specified output translation "translation" string label = ParseXmlTagAttribute(tagContent,"label"); string translation = ParseXmlTagAttribute(tagContent,"translation"); // specified label if (translation.length() == 0 && label.length() > 0) { WordsRange range(startPos,endPos-1); // really? XMLParseOutput item(label, range); sourceLabels.push_back(item); } // specified translations -> vector of phrases, separated by "||" if (translation.length() > 0 && StaticData::Instance().GetXmlInputType() != XmlIgnore) { vector<string> altTexts = TokenizeMultiCharSeparator(translation, "||"); vector<string> altLabel = TokenizeMultiCharSeparator(label, "||"); vector<string> altProbs = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"prob"), "||"); //TRACE_ERR("number of translations: " << altTexts.size() << endl); for (size_t i=0; i<altTexts.size(); ++i) { // set target phrase TargetPhrase targetPhrase; targetPhrase.CreateFromString(Output, outputFactorOrder,altTexts[i],factorDelimiter, NULL); // set constituent label string targetLHSstr; if (altLabel.size() > i && altLabel[i].size() > 0) { targetLHSstr = altLabel[i]; } else { const UnknownLHSList &lhsList = StaticData::Instance().GetUnknownLHS(); UnknownLHSList::const_iterator iterLHS = lhsList.begin(); targetLHSstr = iterLHS->first; } Word *targetLHS = new Word(true); 
targetLHS->CreateFromString(Output, outputFactorOrder, targetLHSstr, true); CHECK(targetLHS->GetFactor(0) != NULL); targetPhrase.SetTargetLHS(targetLHS); // not tested Phrase sourcePhrase = this->GetSubString(WordsRange(startPos,endPos-1)); // get probability float probValue = 1; if (altProbs.size() > i && altProbs[i].size() > 0) { probValue = Scan<float>(altProbs[i]); } // convert from prob to log-prob float scoreValue = FloorScore(TransformScore(probValue)); targetPhrase.SetXMLScore(scoreValue); targetPhrase.Evaluate(sourcePhrase); // set span and create XmlOption WordsRange range(startPos+1,endPos); XmlOption *option = new XmlOption(range,targetPhrase); CHECK(option); xmlOptions.push_back(option); VERBOSE(2,"xml translation = [" << range << "] " << targetLHSstr << " -> " << altTexts[i] << " prob: " << probValue << endl); } altTexts.clear(); altProbs.clear(); } } } } // we are done. check if there are tags that are still open if (tagStack.size() > 0) { TRACE_ERR("ERROR: some opened tags were never closed: " << line << endl); return false; } // return de-xml'ed sentence in line line = cleanLine; return true; }
/**
 * Decode target phrases from an encoded bit stream and append them to tpv.
 *
 * The stream is read by a small state machine. For every phrase it reads
 * target symbols up to the stop symbol, then the scores, then (if the
 * table stores them) alignment points up to the alignment stop symbol, and
 * finally completes the phrase.
 *
 * \param tpv              collection the decoded phrases are appended to
 * \param encodedBitStream stream to decode from
 * \param sourcePhrase     source phrase the collection belongs to
 * \param topLevel         false for nested sub-phrase decoding, which
 *                         stops once m_maxRank phrases are in tpv
 * \return tpv, or an empty pointer when a consistency check on the decoded
 *         symbols fails
 */
TargetPhraseVectorPtr PhraseDecoder::DecodeCollection(
  TargetPhraseVectorPtr tpv, BitWrapper<> &encodedBitStream,
  const Phrase &sourcePhrase, bool topLevel)
{
  // True when the caller passed in an already partially filled collection.
  const bool extending = (tpv->size() != 0);
  size_t bitsLeft = encodedBitStream.TellFromEnd();

  typedef std::pair<size_t, size_t> AlignPointSizeT;

  // REnc symbols refer to source words by position, so their ids are
  // looked up once up front.
  std::vector<int> sourceWords;
  if(m_coding == REnc) {
    const size_t numSourceWords = sourcePhrase.GetSize();
    for(size_t pos = 0; pos < numSourceWords; ++pos) {
      std::string sourceWord
        = sourcePhrase.GetWord(pos).GetString(*m_input, false);
      unsigned idx = GetSourceSymbolId(sourceWord);
      sourceWords.push_back(idx);
    }
  }

  const unsigned phraseStopSymbol = 0;
  const AlignPoint alignStopSymbol(-1, -1);

  std::vector<float> scores;
  std::set<AlignPointSizeT> alignment;

  enum DecodeState { New, Symbol, Score, Alignment, Add } state = New;

  const size_t srcSize = sourcePhrase.GetSize();

  // Points at the phrase under construction, i.e. the last element of tpv.
  TargetPhrase* targetPhrase = NULL;

  while(encodedBitStream.TellFromEnd()) {

    if(state == New) {
      // Start a fresh target phrase at the back of the collection.
      tpv->push_back(TargetPhrase(Output));
      targetPhrase = &tpv->back();
      targetPhrase->SetSourcePhrase(sourcePhrase);

      alignment.clear();
      scores.clear();

      state = Symbol;
    }

    // Deliberately not an "else if": a new phrase reads its first symbol
    // in the same iteration.
    if(state == Symbol) {
      const unsigned symbol = m_symbolTree->Read(encodedBitStream);

      if(symbol == phraseStopSymbol) {
        state = Score;
      } else if(m_coding == REnc) {
        std::string wordString;

        switch(GetREncType(symbol)) {
        case 1: {
          // plain target word
          unsigned decodedSymbol = DecodeREncSymbol1(symbol);
          wordString = GetTargetSymbol(decodedSymbol);
          break;
        }
        case 2: {
          // rank-th translation of the source word at an explicit position
          size_t rank = DecodeREncSymbol2Rank(symbol);
          size_t srcPos = DecodeREncSymbol2Position(symbol);

          if(srcPos >= sourceWords.size())
            return TargetPhraseVectorPtr();

          wordString = GetTargetSymbol(GetTranslation(sourceWords[srcPos], rank));
          if(m_phraseDictionary.m_useAlignmentInfo) {
            size_t trgPos = targetPhrase->GetSize();
            alignment.insert(AlignPoint(srcPos, trgPos));
          }
          break;
        }
        case 3: {
          // rank-th translation of the source word at the current target
          // position
          size_t rank = DecodeREncSymbol3(symbol);
          size_t srcPos = targetPhrase->GetSize();

          if(srcPos >= sourceWords.size())
            return TargetPhraseVectorPtr();

          wordString = GetTargetSymbol(GetTranslation(sourceWords[srcPos], rank));
          if(m_phraseDictionary.m_useAlignmentInfo) {
            alignment.insert(AlignPoint(srcPos, srcPos));
          }
          break;
        }
        default:
          // any other type leaves wordString empty
          break;
        }

        Word word;
        word.CreateFromString(Output, *m_output, wordString, false);
        targetPhrase->AddWord(word);
      } else if(m_coding == PREnc) {
        if(GetPREncType(symbol) == 1) {
          // the symbol is just a word
          unsigned decodedSymbol = DecodePREncSymbol1(symbol);

          Word word;
          word.CreateFromString(Output, *m_output,
                                GetTargetSymbol(decodedSymbol), false);
          targetPhrase->AddWord(word);
        } else {
          // the symbol is a pointer to a sub-phrase
          int left = DecodePREncSymbol2Left(symbol);
          int right = DecodePREncSymbol2Right(symbol);
          unsigned rank = DecodePREncSymbol2Rank(symbol);

          int srcStart = left + targetPhrase->GetSize();
          int srcEnd = srcSize - right - 1;

          // false positive consistency check
          if(0 > srcStart || srcStart > srcEnd || unsigned(srcEnd) >= srcSize)
            return TargetPhraseVectorPtr();

          // false positive consistency check
          if(m_maxRank && rank > m_maxRank)
            return TargetPhraseVectorPtr();

          // By default the sub-phrase comes from the collection being
          // built; a proper sub-span gets its own collection instead.
          TargetPhraseVectorPtr subTpv = tpv;
          if(unsigned(srcEnd - srcStart + 1) != srcSize) {
            Phrase subPhrase = sourcePhrase.GetSubString(WordsRange(srcStart, srcEnd));
            subTpv = CreateTargetPhraseCollection(subPhrase, false);
          }

          // false positive consistency check
          const bool rankAvailable = (subTpv != NULL && rank < subTpv->size());
          if(!rankAvailable)
            return TargetPhraseVectorPtr();

          // insert the subphrase into the main target phrase
          TargetPhrase& subTp = subTpv->at(rank);
          if(m_phraseDictionary.m_useAlignmentInfo) {
            // rebuild the alignment from that of the sub-phrase, shifted
            // to its position in the source and target phrase
            for(AlignmentInfo::const_iterator it = subTp.GetAlignmentInfo().begin();
                it != subTp.GetAlignmentInfo().end(); ++it) {
              alignment.insert(AlignPointSizeT(srcStart + it->first,
                                               targetPhrase->GetSize() + it->second));
            }
          }
          targetPhrase->Append(subTp);
        }
      } else {
        // no special coding: the symbol is the target word itself
        Word word;
        word.CreateFromString(Output, *m_output,
                              GetTargetSymbol(symbol), false);
        targetPhrase->AddWord(word);
      }
    } else if(state == Score) {
      // one tree per score component, or a single shared tree
      const size_t treeIdx = m_multipleScoreTrees ? scores.size() : 0;
      const float score = m_scoreTrees[treeIdx]->Read(encodedBitStream);
      scores.push_back(score);

      if(scores.size() == m_numScoreComponent) {
        targetPhrase->SetScore(m_feature, scores, ScoreComponentCollection() /*sparse*/,*m_weight, m_weightWP, *m_languageModels);

        state = m_containsAlignmentInfo ? Alignment : Add;
      }
    } else if(state == Alignment) {
      const AlignPoint alignPoint = m_alignTree->Read(encodedBitStream);
      if(alignPoint == alignStopSymbol) {
        state = Add;
      } else if(m_phraseDictionary.m_useAlignmentInfo) {
        alignment.insert(AlignPointSizeT(alignPoint));
      }
    }

    // Again a separate "if": a phrase is completed in the same iteration
    // in which its last score / alignment stop symbol was read.
    if(state == Add) {
      if(m_phraseDictionary.m_useAlignmentInfo)
        targetPhrase->SetAlignmentInfo(alignment);

      if(m_coding == PREnc) {
        // remember the stream position while within the rank limit
        if(!m_maxRank || tpv->size() <= m_maxRank)
          bitsLeft = encodedBitStream.TellFromEnd();

        if(!topLevel && m_maxRank && tpv->size() >= m_maxRank)
          break;
      }

      // 8 or fewer bits left: stop decoding
      if(encodedBitStream.TellFromEnd() <= 8)
        break;

      state = New;
    }
  }

  if(m_coding == PREnc && !extending) {
    if(bitsLeft <= 8)
      bitsLeft = 0;
    m_decodingCache.Cache(sourcePhrase, tpv, bitsLeft, m_maxRank);
  }

  return tpv;
}
/*
 * Given the states of the previous (sub-)hypotheses, compute the Bleu state
 * of the chart hypothesis that combines them and add the resulting change
 * in Bleu score (new score minus the summed scores of the previous states)
 * to the accumulator.
 *
 * \param cur_hypo    hypothesis being scored; at most two previous
 *                    hypotheses are supported (asserted)
 * \param featureID   index used to fetch this feature's state from the
 *                    previous hypotheses
 * \param accumulator receives the Bleu score difference
 * \return newly allocated BleuScoreState for cur_hypo
 *
 * The process is terminated via exit(1) if the output phrase length matches
 * none of the supported combination patterns.
 */
FFState* BleuScoreFeature::EvaluateChart(const ChartHypothesis& cur_hypo, int featureID,
    ScoreComponentCollection* accumulator ) const
{
  if (!m_enabled) return new BleuScoreState();

  const Phrase& curr_target_phrase = static_cast<const Phrase&>(cur_hypo.GetCurrTargetPhrase());
  // cerr << "\nCur target phrase: " << cur_hypo.GetTargetLHS() << " --> " << curr_target_phrase << endl;

  // Calculate old bleu of previous states
  float old_bleu = 0, new_bleu = 0;
  size_t num_old_words = 0, num_words_first_prev = 0;
  size_t num_words_added_left = 0, num_words_added_right = 0;

  // double-check cases where more than two previous hypotheses were combined
  assert(cur_hypo.GetPrevHypos().size() <= 2);
  BleuScoreState* new_state;
  if (cur_hypo.GetPrevHypos().size() == 0)
    new_state = new BleuScoreState();
  else {
    // start from a copy of the first previous state ...
    const FFState* prev_state_zero = cur_hypo.GetPrevHypo(0)->GetFFState(featureID);
    const BleuScoreState& ps_zero = dynamic_cast<const BleuScoreState&>(*prev_state_zero);
    new_state = new BleuScoreState(ps_zero);
    num_words_first_prev = ps_zero.m_target_length;

    for (size_t i = 0; i < cur_hypo.GetPrevHypos().size(); ++i) {
      const FFState* prev_state = cur_hypo.GetPrevHypo(i)->GetFFState(featureID);
      // Reference cast, as for ps_zero: a state of the wrong type raises
      // std::bad_cast instead of producing a null pointer that would be
      // dereferenced below.
      const BleuScoreState& ps = dynamic_cast<const BleuScoreState&>(*prev_state);
      // CalculateBleu() is called with a non-const state pointer
      BleuScoreState* ps_nonConst = const_cast<BleuScoreState*>(&ps);
      // cerr << "prev phrase: " << cur_hypo.GetPrevHypo(i)->GetOutputPhrase()
      //      << " ( " << cur_hypo.GetPrevHypo(i)->GetTargetLHS() << ")" << endl;

      old_bleu += CalculateBleu(ps_nonConst);
      num_old_words += ps.m_target_length;

      // ... and add ngram matches from the other previous states
      if (i > 0)
        new_state->AddNgramCountAndMatches(ps_nonConst->m_ngram_counts, ps_nonConst->m_ngram_matches);
    }
  }

  // check if we are already done (don't add <s> and </s>)
  const size_t numWordsCovered = cur_hypo.GetCurrSourceRange().GetNumWordsCovered();
  if (numWordsCovered == m_cur_source_length) {
    // Bleu score stays the same, do not need to add anything
    //accumulator->PlusEquals(this, 0);
    return new_state;
  }

  // The output phrase is built once and reused for the ngram counting, the
  // context and the target length.
  const Phrase new_words = cur_hypo.GetOutputPhrase();
  const size_t num_curr_words = new_words.GetSize();

  // get ngram matches for new words
  if (num_old_words == 0) {
    // no previous words: count ngrams over the whole output
    GetNgramMatchCounts(new_words,
                        m_cur_ref_ngrams,
                        new_state->m_ngram_counts,
                        new_state->m_ngram_matches,
                        0);
  } else if (num_curr_words == num_old_words) {
    // two hypotheses were glued together without new words: score around
    // the overlap point, i.e. the end of the first hypothesis
    GetNgramMatchCounts_overlap(new_words,
                                m_cur_ref_ngrams,
                                new_state->m_ngram_counts,
                                new_state->m_ngram_matches,
                                num_words_first_prev);
  } else if (num_old_words + curr_target_phrase.GetNumTerminals() == num_curr_words) {
    assert(curr_target_phrase.GetSize() == curr_target_phrase.GetNumTerminals()+1);
    // previous hypothesis + rule with 1 non-terminal were combined (NT substituted by Ts);
    // the position of the NT tells how many words were added on each side
    for (size_t i = 0; i < curr_target_phrase.GetSize(); ++i)
      if (curr_target_phrase.GetWord(i).IsNonTerminal()) {
        num_words_added_left = i;
        num_words_added_right = curr_target_phrase.GetSize() - (i+1);
        break;
      }

    // left context
    if (num_words_added_left > 0)
      GetNgramMatchCounts_prefix(new_words,
                                 m_cur_ref_ngrams,
                                 new_state->m_ngram_counts,
                                 new_state->m_ngram_matches,
                                 num_words_added_left,
                                 num_curr_words - num_words_added_right - 1);

    // right context
    if (num_words_added_right > 0)
      GetNgramMatchCounts(new_words,
                          m_cur_ref_ngrams,
                          new_state->m_ngram_counts,
                          new_state->m_ngram_matches,
                          num_words_added_left + num_old_words);
  } else {
    cerr << "undefined state.. " << endl;
    exit(1);
  }

  // Update state variables: keep only the last bleu_order words as context.
  // The index arithmetic is done only for a non-empty output, because
  // "size - 1" would wrap around for an empty phrase.
  const size_t bleu_context_length = BleuScoreState::bleu_order -1;
  if (num_curr_words > 0) {
    size_t ctx_start_idx = 0;
    const size_t ctx_end_idx = num_curr_words - 1;
    if (ctx_end_idx > bleu_context_length) {
      ctx_start_idx = ctx_end_idx - bleu_context_length;
    }
    new_state->m_words = new_words.GetSubString(WordsRange(ctx_start_idx, ctx_end_idx));
  } else {
    new_state->m_words = new_words;
  }

  new_state->m_source_length = numWordsCovered;
  new_state->m_target_length = num_curr_words;

  // we need a scaled reference length to compare the current target phrase to the corresponding
  // reference phrase
  size_t cur_source_length = m_cur_source_length;
  new_state->m_scaled_ref_length = m_cur_ref_length * (float(new_state->m_source_length)/cur_source_length);

  // Calculate new bleu.
  new_bleu = CalculateBleu(new_state);

  // Set score to the change in Bleu score
  accumulator->PlusEquals(this, new_bleu - old_bleu);
  return new_state;
}
// Create the InputTree::Node objects but do not connect them. void InputTreeBuilder::CreateNodes(const TreeInput &in, const std::string &topLevelLabel, InputTree &out) { // Get the input sentence word count. This includes the <s> and </s> symbols. const std::size_t numWords = in.GetSize(); // Get the parse tree non-terminal nodes. The parse tree covers the original // sentence only, not the <s> and </s> symbols, so at this point there is // no top-level node. std::vector<XMLParseOutput> xmlNodes = in.GetLabelledSpans(); // Sort the XML nodes into post-order. Prior to sorting they will be in the // order that TreeInput created them. Usually that will be post-order, but // if, for example, the tree was binarized by relax-parse then it won't be. // In all cases, we assume that if two nodes cover the same span then the // first one is the lowest. SortXmlNodesIntoPostOrder(xmlNodes); // Copy the parse tree non-terminal nodes, but offset the ranges by 1 (to // allow for the <s> symbol at position 0). std::vector<XMLParseOutput> nonTerms; nonTerms.reserve(xmlNodes.size()+1); for (std::vector<XMLParseOutput>::const_iterator p = xmlNodes.begin(); p != xmlNodes.end(); ++p) { std::size_t start = p->m_range.GetStartPos(); std::size_t end = p->m_range.GetEndPos(); nonTerms.push_back(XMLParseOutput(p->m_label, WordsRange(start+1, end+1))); } // Add a top-level node that also covers <s> and </s>. nonTerms.push_back(XMLParseOutput(topLevelLabel, WordsRange(0, numWords-1))); // Allocate space for the InputTree nodes. In the case of out.nodes, this // step is essential because once created the PVertex objects must not be // moved around (through vector resizing) because InputTree keeps pointers // to them. out.nodes.reserve(numWords + nonTerms.size()); out.nodesAtPos.resize(numWords); // Create the InputTree::Node objects. 
int prevStart = -1; int prevEnd = -1; for (std::vector<XMLParseOutput>::const_iterator p = nonTerms.begin(); p != nonTerms.end(); ++p) { int start = static_cast<int>(p->m_range.GetStartPos()); int end = static_cast<int>(p->m_range.GetEndPos()); // Check if we've started ascending a new subtree. if (start != prevStart && end != prevEnd) { // Add a node for each terminal to the left of or below the first // nonTerm child of the subtree. for (int i = prevEnd+1; i <= end; ++i) { PVertex v(WordsRange(i, i), in.GetWord(i)); out.nodes.push_back(InputTree::Node(v)); out.nodesAtPos[i].push_back(&out.nodes.back()); } } // Add a node for the non-terminal. Word w(true); w.CreateFromString(Moses::Output, m_outputFactorOrder, p->m_label, true); PVertex v(WordsRange(start, end), w); out.nodes.push_back(InputTree::Node(v)); out.nodesAtPos[start].push_back(&out.nodes.back()); prevStart = start; prevEnd = end; } }