// Collect target words aligned to source words longer than maxLength characters
// (maxLength == 0 disables the length filter). Returns the number of words added.
uint AddWord(const uint sentno, const AnchoredPhrasePair &app, const uint maxLength) {
    WordAlignment wa = app.second.get().getWordAlignment();
    PhraseData sd = app.second.get().getSourcePhrase().get();
    PhraseData td = app.second.get().getTargetPhrase().get();
    uint addCount = 0;
    for(uint j = 0; j < sd.size(); ++j) {
        // just for testing: words longer than maxLength characters (should use some other criterion here!)
        if((maxLength == 0) || (sd[j].size() > maxLength)) {
            for(WordAlignment::const_iterator wit = wa.begin_for_source(j); wit != wa.end_for_source(j); ++wit) {
                sentWords[sentno].push_back(td[*wit]);
                addCount++;
                // LOG(logger_, debug, "add word " << td[*wit] << " aligned to " << sd[j]);
            }
        }
    }
    return addCount;
}
PhraseSegmentation BeamSearchAdapter::search(boost::shared_ptr<const PhrasePairCollection> ppairs,
        const std::vector<Word> &sentence) const {
    if(sentence.empty())
        return PhraseSegmentation();

    // Turn the input sentence into a space-separated string and feed it to Moses.
    std::stringstream sntstream;
    std::copy(sentence.begin(), sentence.end() - 1, std::ostream_iterator<Word>(sntstream, " "));
    sntstream << sentence.back();
    boost::scoped_ptr<Moses::Sentence> msent(new Moses::Sentence());
    std::vector<Moses::FactorType> ftype(1, 0);
    //msent.Read(sntstream, ftype);
    msent->CreateFromString(ftype, sntstream.str(), "|");

    // Run the Moses beam search on the sentence.
    const Moses::TranslationSystem &system =
        Moses::StaticData::Instance().GetTranslationSystem(Moses::TranslationSystem::DEFAULT);
    boost::scoped_ptr<Moses::Manager> manager(
        new Moses::Manager(0, *msent, Moses::StaticData::Instance().GetSearchAlgorithm(), &system));
    manager->ProcessSentence();
    const Moses::Hypothesis *hypo = manager->GetBestHypothesis();

    // Sort our own phrase pairs so they can be looked up by (coverage, source, target).
    CompareAnchoredPhrasePairs comparePhrasePairs;
    typedef std::vector<AnchoredPhrasePair> PPVector;
    PPVector ppvec;
    ppairs->copyPhrasePairs(std::back_inserter(ppvec));
    std::sort(ppvec.begin(), ppvec.end(), comparePhrasePairs);

    PhraseSegmentation seg;
    if(hypo == NULL)
        LOG(logger_, error, "No answer from Moses.");

    // Walk the best hypothesis chain backwards and map each Moses phrase pair
    // back onto the corresponding AnchoredPhrasePair from our collection.
    while(hypo && hypo->GetPrevHypo() != NULL) {
        CoverageBitmap cov(sentence.size());
        const Moses::WordsRange &mrange = hypo->GetCurrSourceWordsRange();
        for(uint i = mrange.GetStartPos(); i <= mrange.GetEndPos(); i++)
            cov.set(i);

        PhraseData srcpd;
        const Moses::Phrase *msrcphr = hypo->GetSourcePhrase();
        for(uint i = 0; i < msrcphr->GetSize(); i++)
            srcpd.push_back(msrcphr->GetFactor(i, 0)->GetString());

        PhraseData tgtpd;
        const Moses::Phrase &mtgtphr = hypo->GetCurrTargetPhrase();
        for(uint i = 0; i < mtgtphr.GetSize(); i++)
            tgtpd.push_back(mtgtphr.GetFactor(i, 0)->GetString());

        PPVector::const_iterator it = std::lower_bound(ppvec.begin(), ppvec.end(),
            CompareAnchoredPhrasePairs::PhrasePairKey(cov, srcpd, tgtpd), comparePhrasePairs);
        seg.push_front(*it);
        hypo = hypo->GetPrevHypo();
    }

    return seg;
}
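// Illustrative sketch (not part of the original sources): one way the segmentation
// returned by search() could be consumed. It assumes only what the listing above shows,
// namely that a PhraseSegmentation can be iterated over its AnchoredPhrasePair elements
// in target order, and that each pair exposes its target phrase via
// app.second.get().getTargetPhrase().get(). The helper name JoinTargetSide is hypothetical.
std::string JoinTargetSide(const PhraseSegmentation &seg) {
    std::ostringstream out;
    for(PhraseSegmentation::const_iterator it = seg.begin(); it != seg.end(); ++it) {
        // Append the target side of each anchored phrase pair, word by word.
        PhraseData td = it->second.get().getTargetPhrase().get();
        for(uint i = 0; i < td.size(); ++i)
            out << td[i] << " ";
    }
    return out.str();
}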
// Overload of AddWord: collect word pairs whose source word carries the given POS tag,
// look up their vectors and score them against the recent history.
// Returns the number of words added.
uint AddWord(const uint sentno, const uint phrno, const AnchoredPhrasePair &app,
        const std::string pos, const int historySize) {
    WordAlignment wa = app.second.get().getWordAlignment();
    PhraseData sd = app.second.get().getSourcePhrase().get();
    PhraseData td = app.second.get().getTargetPhrase().get();
    uint wordno = app.first.find_first();
    uint addCount = 0;
    for(uint j = 0; j < sd.size(); ++j) {
        // TODO: we could support other conditions here as well!
        if(posTags[sentno][wordno] == pos) {
            for(WordAlignment::const_iterator wit = wa.begin_for_source(j); wit != wa.end_for_source(j); ++wit) {
                // Look up the source_target word pair in the vocabulary; skip it if unknown.
                std::string wordPair = sd[j] + "_" + td[*wit];
                long long b = FindVocabularyPosition(wordPair);
                if(b < 0)
                    continue;
                SelectedWordVector word(phrno, *wit, sd[j], td[*wit], wordno, size);
                FindVector(b, word.vec);
                selectedWords[sentno].push_back(word);
                selectedWords[sentno].back().similarity =
                    MaxSimilarityWithHistory(sentno, selectedWords[sentno].size() - 1, historySize);
                // currentScore += word.similarity;
                addCount++;
                //LOG(logger_, debug, "add word " << td[*wit] << " aligned to " << sd[j]);
            }
        }
        wordno++;
    }
    return addCount;
}
PhraseSegmentation NistXmlStateInitialiser::initSegmentation(
        boost::shared_ptr<const PhrasePairCollection> phraseTranslations,
        const std::vector<Word> &sentence, int documentNumber, int sentenceNumber) const {
    if(sentence.empty())
        return PhraseSegmentation();

    // Sort the available phrase pairs so they can be looked up by (coverage, source, target).
    std::vector<AnchoredPhrasePair> ppvec;
    phraseTranslations->copyPhrasePairs(std::back_inserter(ppvec));
    CompareAnchoredPhrasePairs ppComparator;
    std::sort(ppvec.begin(), ppvec.end(), ppComparator);

    PhraseSegmentation seg;
    PhraseData tgtpd;
    for(PlainTextDocument::const_word_iterator it = documents_[documentNumber].sentence_begin(sentenceNumber);
            it != documents_[documentNumber].sentence_end(sentenceNumber); ++it) {
        if((*it).substr(0, 1) != "|") {
            // target word
            tgtpd.push_back(*it);
            continue;
        }

        // end of hypothesis: a metadata token of the form |i-j| giving the covered source range
        Word token((*it).substr(1, (*it).length() - 2));
        std::vector<Word> srctokenrange;
        boost::split(srctokenrange, token, boost::is_any_of("-"), boost::token_compress_on);
        PhraseData srcpd;
        CoverageBitmap cov(sentence.size());
        try {
            if(srctokenrange.size() != 2)
                BOOST_THROW_EXCEPTION(FileFormatException());
            for(uint i = boost::lexical_cast<uint>(srctokenrange.front());
                    i <= boost::lexical_cast<uint>(srctokenrange.back()); ++i) {
                srcpd.push_back(sentence[i]);
                cov.set(i);
            }
        } catch(boost::exception &) {
            LOG(logger_, error, "Invalid alignment data in raw-translation file "
                "(document " << documentNumber << ", sentence " << sentenceNumber << "): " << *it);
            throw;
        }

        // Find the matching AnchoredPhrasePair and append it to the segmentation.
        std::vector<AnchoredPhrasePair>::const_iterator appit = std::lower_bound(ppvec.begin(), ppvec.end(),
            CompareAnchoredPhrasePairs::PhrasePairKey(cov, srcpd, tgtpd), ppComparator);
        seg.push_back(*appit);
        tgtpd.clear();
    }

    return seg;
}
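// Illustrative note (inferred from initSegmentation above, not taken from an actual file):
// the raw-translation input is a token stream in which plain tokens are target words and
// each |i-j| token closes the preceding target words as one phrase aligned to the inclusive
// source range i..j. For example, a hypothetical line such as
//   the white house |0-1| said |2-2| on tuesday |3-4|
// would be mapped back onto three AnchoredPhrasePairs covering source positions 0-1, 2-2 and 3-4.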