Ejemplo n.º 1
0
	// Appends to sentWords[sentno] the target words aligned to selected source
	// words of one phrase pair.
	//
	// A source word is selected when maxLength is 0 (no filtering) or when its
	// size() is strictly greater than maxLength. The length test is only a
	// placeholder criterion; a better selection rule is still to be decided.
	//
	// sentno     index into sentWords receiving the words
	// app        anchored phrase pair providing source phrase, target phrase
	//            and word alignment (all read from app.second)
	// maxLength  size threshold for source words; 0 disables the filter
	//
	// Returns the number of target words appended by this call.
	uint AddWord(
		const uint sentno,
		const AnchoredPhrasePair &app,
		const uint maxLength
	) {
		WordAlignment alignment = app.second.get().getWordAlignment();
		PhraseData sourceWords = app.second.get().getSourcePhrase().get();
		PhraseData targetWords = app.second.get().getTargetPhrase().get();

		uint appended = 0;
		for(uint srcIdx = 0; srcIdx < sourceWords.size(); ++srcIdx) {
			// Skip source words that do not pass the (optional) length filter.
			const bool filterActive = (maxLength != 0);
			if(filterActive && !(sourceWords[srcIdx].size() > maxLength))
				continue;

			// Copy every target word linked to this source word.
			WordAlignment::const_iterator tgt = alignment.begin_for_source(srcIdx);
			while(tgt != alignment.end_for_source(srcIdx)) {
				sentWords[sentno].push_back(targetWords[*tgt]);
				++appended;
				++tgt;
			}
		}
		return appended;
	}
Ejemplo n.º 2
0
// Decodes one sentence with Moses and converts the best hypothesis into a
// PhraseSegmentation built from the phrase pairs in ppairs.
//
// ppairs    collection of anchored phrase pairs available for this sentence;
//           each phrase used by Moses is looked up in a sorted copy of it
// sentence  source words; an empty sentence yields an empty segmentation
//
// Returns the segmentation in source-to-target hypothesis order (hypotheses
// are walked backwards and pushed to the front). An empty segmentation is
// returned, after logging an error, when Moses produces no hypothesis or when
// a phrase used by Moses has no candidate in ppairs.
PhraseSegmentation BeamSearchAdapter::search(boost::shared_ptr<const PhrasePairCollection> ppairs, const std::vector<Word> &sentence) const {
    if(sentence.empty())
        return PhraseSegmentation();

    // Serialise the sentence as blank-separated words without a trailing blank.
    std::stringstream sntstream;
    std::copy(sentence.begin(), sentence.end() - 1, std::ostream_iterator<Word>(sntstream, " "));
    sntstream << sentence.back();

    boost::scoped_ptr<Moses::Sentence> msent(new Moses::Sentence());
    std::vector<Moses::FactorType> ftype(1, 0);
    msent->CreateFromString(ftype, sntstream.str(), "|");

    const Moses::TranslationSystem &system =
        Moses::StaticData::Instance().GetTranslationSystem(Moses::TranslationSystem::DEFAULT);
    boost::scoped_ptr<Moses::Manager> manager(new Moses::Manager(0, *msent, Moses::StaticData::Instance().GetSearchAlgorithm(), &system));
    manager->ProcessSentence();
    const Moses::Hypothesis *hypo = manager->GetBestHypothesis();

    // Sorted copy of the phrase pairs so that each hypothesis step can be
    // located by binary search.
    CompareAnchoredPhrasePairs comparePhrasePairs;
    typedef std::vector<AnchoredPhrasePair> PPVector;
    PPVector ppvec;
    ppairs->copyPhrasePairs(std::back_inserter(ppvec));
    std::sort(ppvec.begin(), ppvec.end(), comparePhrasePairs);

    PhraseSegmentation seg;
    if(hypo == NULL)
        LOG(logger_, error, "No answer from moses.");

    // Walk the hypothesis chain backwards; the hypothesis without a
    // predecessor is not converted.
    while(hypo && hypo->GetPrevHypo() != NULL) {
        // Source coverage of this hypothesis step.
        CoverageBitmap cov(sentence.size());
        const Moses::WordsRange &mrange = hypo->GetCurrSourceWordsRange();
        for(uint i = mrange.GetStartPos(); i <= mrange.GetEndPos(); i++)
            cov.set(i);

        // Surface strings (factor 0) of the source and target phrases.
        PhraseData srcpd;
        const Moses::Phrase *msrcphr = hypo->GetSourcePhrase();
        for(uint i = 0; i < msrcphr->GetSize(); i++)
            srcpd.push_back(msrcphr->GetFactor(i, 0)->GetString());

        PhraseData tgtpd;
        const Moses::Phrase &mtgtphr = hypo->GetCurrTargetPhrase();
        for(uint i = 0; i < mtgtphr.GetSize(); i++)
            tgtpd.push_back(mtgtphr.GetFactor(i, 0)->GetString());

        PPVector::const_iterator it = std::lower_bound(ppvec.begin(), ppvec.end(),
                                      CompareAnchoredPhrasePairs::PhrasePairKey(cov, srcpd, tgtpd),
                                      comparePhrasePairs);
        // lower_bound returns end() when every stored pair orders before the
        // key; dereferencing it would be undefined behaviour. Fail the same
        // way as the "no answer" case: log and return an empty segmentation.
        // NOTE(review): a non-end result is still not verified to be an exact
        // match for the key - confirm whether the comparator allows that check.
        if(it == ppvec.end()) {
            LOG(logger_, error, "Phrase pair chosen by moses not found in phrase pair collection.");
            return PhraseSegmentation();
        }
        seg.push_front(*it);

        hypo = hypo->GetPrevHypo();
    }

    return seg;
}
Ejemplo n.º 3
0
	uint AddWord(
		const uint sentno,
		const uint phrno,
		const AnchoredPhrasePair &app,
		const std::string pos,
		const int historySize
	) {
		WordAlignment wa = app.second.get().getWordAlignment();
		PhraseData sd = app.second.get().getSourcePhrase().get();
		PhraseData td = app.second.get().getTargetPhrase().get();

		uint wordno = app.first.find_first();

		uint addCount = 0;
		for(uint j = 0; j < sd.size(); ++j) {
			// TODO: we could support other conditins here as well!
			if(posTags[sentno][wordno] == pos) {
				for(WordAlignment::const_iterator
					wit = wa.begin_for_source(j);
					wit != wa.end_for_source(j);
					++wit
				) {
					std::string wordPair = sd[j] + "_" + td[*wit];
					long long b = FindVocabularyPosition(wordPair);
					if(b < 0)
						continue;
					SelectedWordVector word(phrno,*wit,sd[j],td[*wit],wordno,size);
					FindVector(b,word.vec);
					selectedWords[sentno].push_back(word);
					selectedWords[sentno].back().similarity = MaxSimilarityWithHistory(
						sentno,
						selectedWords[sentno].size()-1,
						historySize
					);
					// currentScore += word.similarity;
					addCount++;
					//LOG(logger_, debug, "add word " << td[*wit] << " aligned to " << sd[j]);
				}
			}
			wordno++;
		}
		return addCount;
	};
Ejemplo n.º 4
0
// Rebuilds the phrase segmentation of one sentence from a stored raw
// translation.
//
// The words of sentence sentenceNumber in documents_[documentNumber] are read
// in order. A token that does not start with "|" is collected as a target
// word. A token that starts with "|" closes the current segment: its first
// and last characters are stripped and the remainder is split on "-" into a
// first and last source index (inclusive). The source words and coverage for
// that range, together with the collected target words, are looked up in a
// sorted copy of phraseTranslations and the match is appended to the result.
// Target words after the last marker are not used.
//
// phraseTranslations  phrase pairs available for this sentence
// sentence            source words; an empty sentence yields an empty result
// documentNumber      index into documents_
// sentenceNumber      sentence index inside that document
//
// Throws FileFormatException (after logging an error) when a marker does not
// contain exactly two fields, when the source range is inverted or exceeds
// the sentence, or when no phrase pair can be located for a segment. A failed
// numeric conversion is logged and rethrown.
PhraseSegmentation
NistXmlStateInitialiser::initSegmentation(
	boost::shared_ptr<const PhrasePairCollection> phraseTranslations,
	const std::vector<Word> &sentence,
	int documentNumber,
	int sentenceNumber
) const {
	if(sentence.empty())
		return PhraseSegmentation();

	// Sorted copy so that segments can be found by binary search.
	std::vector<AnchoredPhrasePair> ppvec;
	phraseTranslations->copyPhrasePairs(std::back_inserter(ppvec));

	CompareAnchoredPhrasePairs ppComparator;
	std::sort(ppvec.begin(), ppvec.end(), ppComparator);

	PhraseSegmentation seg;
	PhraseData tgtpd;
	for(PlainTextDocument::const_word_iterator
		it = documents_[documentNumber].sentence_begin(sentenceNumber);
		it != documents_[documentNumber].sentence_end(sentenceNumber);
		++it
	) {
		if((*it).substr(0, 1) != "|") { // word
			tgtpd.push_back(*it);
			continue;
		}
		// end of hypothesis: the token holds the source range as metadata
		Word token((*it).substr(1, (*it).length()-2));
		std::vector<Word> srctokenrange; // metadata
		boost::split(
			srctokenrange,
			token,
			boost::is_any_of("-"),
			boost::token_compress_on
		);
		PhraseData srcpd;
		CoverageBitmap cov(sentence.size());
		try {
			if(srctokenrange.size() != 2) {
				BOOST_THROW_EXCEPTION(FileFormatException());
			}
			// Parse both bounds once instead of re-parsing the upper bound
			// on every loop iteration.
			const uint first = boost::lexical_cast<uint>(srctokenrange.front());
			const uint last = boost::lexical_cast<uint>(srctokenrange.back());
			// The indices come from a file: reject ranges that are inverted
			// or would index past the end of the sentence / coverage bitmap.
			if(first > last || last >= sentence.size()) {
				BOOST_THROW_EXCEPTION(FileFormatException());
			}
			for(uint i = first; i <= last; ++i) {
				srcpd.push_back(sentence[i]);
				cov.set(i);
			}
		} catch(boost::exception &) {
			LOG(logger_, error,
				"Invalid alignment data in raw-translation file "
				"(document " << documentNumber << ", "
				" sentence " << sentenceNumber << "): "
				<< *it
			);
			throw;
		}
		std::vector<AnchoredPhrasePair>::const_iterator
			appit = std::lower_bound(
				ppvec.begin(),
				ppvec.end(),
				CompareAnchoredPhrasePairs::PhrasePairKey(cov, srcpd, tgtpd),
				ppComparator
			);
		// lower_bound yields end() when every stored pair orders before the
		// key; dereferencing it would be undefined behaviour.
		// NOTE(review): a non-end result is not verified to match the key
		// exactly - confirm whether the comparator allows that check.
		if(appit == ppvec.end()) {
			LOG(logger_, error,
				"No matching phrase pair for raw-translation segment "
				"(document " << documentNumber << ", "
				" sentence " << sentenceNumber << "): "
				<< *it
			);
			BOOST_THROW_EXCEPTION(FileFormatException());
		}
		seg.push_back(*appit);
		tgtpd.clear();
	}
	return seg;
}