/**
 * Look up the target phrases for a source phrase in the compact phrase table.
 *
 * Returns NULL when the source phrase is longer than the decoder's maximum
 * source phrase length, or when the decoder yields no (or an empty) result.
 * Otherwise returns a newly allocated collection which is also handed to
 * CacheForCleanup() together with the source phrase.
 */
const TargetPhraseCollection*
PhraseDictionaryCompact::GetTargetPhraseCollection(const Phrase &sourcePhrase) const
{
  // Guard: longer than the longest source phrase the decoder reports.
  if(sourcePhrase.GetSize() > m_phraseDecoder->GetMaxSourcePhraseLength())
    return NULL;

  // Ask the phrase decoder for the decoded target phrases.
  TargetPhraseVectorPtr decoded
  = m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true);

  // Guard: nothing decoded.
  if(!(decoded != NULL && decoded->size()))
    return NULL;

  // Work on a private copy so that reordering does not touch the decoder's vector.
  TargetPhraseVectorPtr candidates(new TargetPhraseVector(*decoded));
  TargetPhraseCollection* result = new TargetPhraseCollection();

  // Choose the cut-off position: everything when no limit applies (limit of 0
  // or fewer candidates than the limit), otherwise the first m_tableLimit.
  TargetPhraseVector::iterator cutoff;
  if(m_tableLimit == 0 || candidates->size() < m_tableLimit) {
    cutoff = candidates->end();
  } else {
    cutoff = candidates->begin() + m_tableLimit;
  }

  // Partition so that the elements before the cut-off are the ones selected
  // by CompareTargetPhrase.
  std::nth_element(candidates->begin(), cutoff, candidates->end(),
                   CompareTargetPhrase());

  TargetPhraseVector::iterator cur = candidates->begin();
  while(cur != cutoff) {
    result->Add(new TargetPhrase(*cur));
    ++cur;
  }

  // Register the phrase pair for clean-up or retrieval with PREnc.
  const_cast<PhraseDictionaryCompact*>(this)->CacheForCleanup(sourcePhrase, result);

  return result;
}
/**
 * Add a single target phrase (used for an out-of-vocabulary item) to this list.
 *
 * The phrase is wrapped in a freshly allocated TargetPhraseCollection. The
 * collection pointer is appended to waste_memory (presumably so the caller can
 * release it later -- confirm against the caller) and then passed to Add()
 * with an empty StackVec for the given range.
 */
void ChartTranslationOptionList::AddPhraseOOV(TargetPhrase &phrase, std::list<TargetPhraseCollection*> &waste_memory, const WordsRange &range)
{
  TargetPhraseCollection *oovCollection = new TargetPhraseCollection();
  oovCollection->Add(&phrase);

  // Record the allocation in the caller-supplied list.
  waste_memory.push_back(oovCollection);

  StackVec noStack;
  Add(*oovCollection, noStack, range);
}
/**
 * Build a target phrase collection by linearly interpolating per-model scores.
 *
 * For every entry in allStats, score component i is computed as
 * TransformScore(inner_product(statistics->p[i], multimodelweights[i])), the
 * resulting vector is assigned to the entry's target phrase, the phrase is
 * re-evaluated in isolation with this dictionary as the only feature, and a
 * copy of the phrase is added to the returned collection.
 *
 * @param src               source phrase passed to EvaluateInIsolation
 * @param allStats          per-target-phrase statistics; entries are updated in place
 * @param multimodelweights one weight vector per score component; indexed
 *                          0..m_numScoreComponents-1 without bounds checking
 * @return newly allocated collection
 */
TargetPhraseCollection* PhraseDictionaryMultiModel::CreateTargetPhraseCollectionLinearInterpolation(const Phrase& src, std::map<std::string,multiModelStatistics*>* allStats, std::vector<std::vector<float> > &multimodelweights) const
{
  TargetPhraseCollection *ret = new TargetPhraseCollection();

  // The feature list used to correct future cost estimates and the total score
  // is the same for every phrase, so build it once instead of per iteration.
  vector<FeatureFunction*> pd_feature;
  pd_feature.push_back(const_cast<PhraseDictionaryMultiModel*>(this));
  const vector<FeatureFunction*> &pd_feature_const = pd_feature;

  for ( std::map< std::string, multiModelStatistics*>::const_iterator iter = allStats->begin(); iter != allStats->end(); ++iter ) {

    multiModelStatistics * statistics = iter->second;

    Scores scoreVector(m_numScoreComponents);

    for(size_t i = 0; i < m_numScoreComponents; ++i) {
      // Weighted sum of the per-model values for component i (accumulated as double).
      scoreVector[i] = TransformScore(std::inner_product(statistics->p[i].begin(), statistics->p[i].end(), multimodelweights[i].begin(), 0.0));
    }

    statistics->targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);

    // correct future cost estimates and total score
    statistics->targetPhrase->EvaluateInIsolation(src, pd_feature_const);

    ret->Add(new TargetPhrase(*statistics->targetPhrase));
  }
  return ret;
}
/**
 * Attach transliteration target phrases to inputPath.
 *
 * The cache returned by GetCache() is consulted first, keyed by the hash of the
 * source phrase. On a miss, the source phrase is written to a temporary file,
 * the external prepare-transliteration-phrase-table.pl script is run, the
 * resulting target phrases are collected, cached and set on inputPath, and the
 * temporary file and output directory are removed.
 *
 * Throws (via UTIL_THROW_IF2) if a temporary name cannot be generated or if
 * the script returns a non-zero status; in the latter case the temporary file
 * and directory are removed before throwing.
 *
 * NOTE(review): the cache is keyed by hash only, so two different phrases with
 * the same hash would share an entry.
 * NOTE(review): tmpnam() is race-prone and the command line is assembled
 * without shell quoting; paths containing spaces or shell metacharacters will
 * break or be interpreted by the shell. Consider a safer temp-file facility.
 */
void PhraseDictionaryTransliteration::GetTargetPhraseCollection(InputPath &inputPath) const
{
  const Phrase &sourcePhrase = inputPath.GetPhrase();
  size_t hash = hash_value(sourcePhrase);

  CacheColl &cache = GetCache();

  std::map<size_t, std::pair<const TargetPhraseCollection*, clock_t> >::iterator iter;
  iter = cache.find(hash);

  if (iter != cache.end()) {
    // already in cache
    const TargetPhraseCollection *tpColl = iter->second.first;
    inputPath.SetTargetPhrases(*this, tpColl, NULL);
  } else {
    // TRANSLITERATE
    // tmpnam() may return NULL; constructing a std::string from NULL is undefined.
    char *ptr = tmpnam(NULL);
    UTIL_THROW_IF2(ptr == NULL, "Unable to generate temporary file name");
    string inFile(ptr);

    ptr = tmpnam(NULL);
    UTIL_THROW_IF2(ptr == NULL, "Unable to generate temporary directory name");
    string outDir(ptr);

    ofstream inStream(inFile.c_str());
    inStream << sourcePhrase.ToString() << endl;
    inStream.close();

    string cmd = m_scriptDir + "/Transliteration/prepare-transliteration-phrase-table.pl" +
                 " --transliteration-model-dir " + m_filePath +
                 " --moses-src-dir " + m_mosesDir +
                 " --external-bin-dir " + m_externalDir +
                 " --input-extension " + m_inputLang +
                 " --output-extension " + m_outputLang +
                 " --oov-file " + inFile +
                 " --out-dir " + outDir;

    int ret = system(cmd.c_str());
    if (ret != 0) {
      // Do not leave temporary files behind when the script fails.
      remove(inFile.c_str());
      string cleanupCmd = "rm -rf " + outDir;
      system(cleanupCmd.c_str());
    }
    UTIL_THROW_IF2(ret != 0, "Transliteration script error");

    TargetPhraseCollection *tpColl = new TargetPhraseCollection();
    vector<TargetPhrase*> targetPhrases = CreateTargetPhrases(sourcePhrase, outDir);
    vector<TargetPhrase*>::const_iterator tpIter;
    for (tpIter = targetPhrases.begin(); tpIter != targetPhrases.end(); ++tpIter) {
      TargetPhrase *tp = *tpIter;
      tpColl->Add(tp);
    }

    // Remember the collection together with the time it was created.
    std::pair<const TargetPhraseCollection*, clock_t> value(tpColl, clock());
    cache[hash] = value;

    inputPath.SetTargetPhrases(*this, tpColl, NULL);

    // clean up temporary files
    remove(inFile.c_str());

    cmd = "rm -rf " + outDir;
    system(cmd.c_str());
  }
}
void SkeletonPT::GetTargetPhraseCollectionBatch(const InputPathList &phraseDictionaryQueue) const { InputPathList::const_iterator iter; for (iter = phraseDictionaryQueue.begin(); iter != phraseDictionaryQueue.end(); ++iter) { InputPath &inputPath = **iter; TargetPhrase *tp = CreateTargetPhrase(inputPath.GetPhrase()); TargetPhraseCollection *tpColl = new TargetPhraseCollection(); tpColl->Add(tp); m_allTPColl.push_back(tpColl); inputPath.SetTargetPhrases(*this, tpColl, NULL); } }
const TargetPhraseCollection* PhraseDictionaryDynSuffixArray:: GetTargetPhraseCollectionLEGACY(const Phrase& src) const { typedef map<SAPhrase, vector<float> >::value_type pstat_entry; map<SAPhrase, vector<float> > pstats; // phrase (pair) statistics m_biSA->GatherCands(src,pstats); TargetPhraseCollection *ret = new TargetPhraseCollection(); BOOST_FOREACH(pstat_entry & e, pstats) { TargetPhrase* tp = m_biSA->GetMosesFactorIDs(e.first, src, this); tp->GetScoreBreakdown().Assign(this,e.second); tp->EvaluateInIsolation(src); ret->Add(tp); }
std::vector <ChartTranslationOptions*> Sentence::GetXmlChartTranslationOptions() const { const StaticData &staticData = StaticData::Instance(); std::vector <ChartTranslationOptions*> ret; // XML Options // this code is a copy of the 1 in Sentence. //only fill the vector if we are parsing XML if (staticData.GetXmlInputType() != XmlPassThrough ) { //TODO: needed to handle exclusive //for (size_t i=0; i<GetSize(); i++) { // m_xmlCoverageMap.push_back(false); //} //iterXMLOpts will be empty for XmlIgnore //look at each column for(std::vector<XmlOption*>::const_iterator iterXmlOpts = m_xmlOptions.begin(); iterXmlOpts != m_xmlOptions.end(); iterXmlOpts++) { const XmlOption &xmlOption = **iterXmlOpts; TargetPhrase *targetPhrase = new TargetPhrase(xmlOption.targetPhrase); WordsRange *range = new WordsRange(xmlOption.range); StackVec emptyStackVec; // hmmm... maybe dangerous, but it is never consulted TargetPhraseCollection *tpc = new TargetPhraseCollection; tpc->Add(targetPhrase); ChartTranslationOptions *transOpt = new ChartTranslationOptions(*tpc, emptyStackVec, *range, 0.0f); ret.push_back(transOpt); //TODO: needed to handle exclusive //for(size_t j=transOpt->GetSourceWordsRange().GetStartPos(); j<=transOpt->GetSourceWordsRange().GetEndPos(); j++) { // m_xmlCoverageMap[j]=true; //} } } return ret; }
/**
 * Build a target phrase collection from count statistics.
 *
 * For each entry of allStats a five-component score vector is computed from
 * the combined counts (m_combineFunction), the two weighted lexical
 * translation values and a constant, assigned to the entry's target phrase,
 * and a copy of the phrase is added to the result. Entries for which an
 * AlignmentException is raised are skipped.
 *
 * Throws util::Exception if a target phrase carries no alignment information.
 * On normal completion the contents of allStats are released with
 * RemoveAllInMap() and the map itself is deleted.
 *
 * @return newly allocated collection
 */
TargetPhraseCollection* PhraseDictionaryMultiModelCounts::CreateTargetPhraseCollectionCounts(const Phrase &src, vector<float> &fs, map<string,multiModelCountsStatistics*>* allStats, vector<vector<float> > &multimodelweights) const
{
  TargetPhraseCollection *ret = new TargetPhraseCollection();

  typedef map< string, multiModelCountsStatistics*>::const_iterator StatsIterator;
  for (StatsIterator entry = allStats->begin(); entry != allStats->end(); ++entry) {

    multiModelCountsStatistics *stats = entry->second;

    // Lexical weights cannot be computed without term alignments.
    if (stats->targetPhrase->GetAlignTerm().GetSize() == 0) {
      UTIL_THROW(util::Exception, " alignment information empty\ncount-tables need to include alignment information for computation of lexical weights.\nUse --phrase-word-alignment during training; for on-disk tables, also set -alignment-info when creating on-disk tables.");
    }

    try {
      // View of the target phrase as a plain Phrase, as required by the helpers below.
      const Phrase &target = static_cast<const Phrase&>(*stats->targetPhrase);

      pair<vector< set<size_t> >, vector< set<size_t> > > alignment =
        GetAlignmentsForLexWeights(src, target, stats->targetPhrase->GetAlignTerm());
      vector< set<size_t> > alignedToT(alignment.first);
      vector< set<size_t> > alignedToS(alignment.second);

      double lexst = ComputeWeightedLexicalTranslation(target, src, alignedToS, m_lexTable_e2f, multimodelweights[1], false );
      double lexts = ComputeWeightedLexicalTranslation(src, target, alignedToT, m_lexTable_f2e, multimodelweights[3], true );

      Scores scoreVector(5);
      scoreVector[0] = FloorScore(TransformScore(m_combineFunction(stats->fst, stats->ft, multimodelweights[0])));
      scoreVector[1] = FloorScore(TransformScore(lexst));
      scoreVector[2] = FloorScore(TransformScore(m_combineFunction(stats->fst, fs, multimodelweights[2])));
      scoreVector[3] = FloorScore(TransformScore(lexts));
      scoreVector[4] = FloorScore(TransformScore(2.718));

      stats->targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
      stats->targetPhrase->Evaluate(src, GetFeaturesToApply());
    } catch (AlignmentException& e) {
      // Skip this target phrase.
      continue;
    }

    ret->Add(new TargetPhrase(*stats->targetPhrase));
  }

  // The statistics are no longer needed once the collection has been built.
  RemoveAllInMap(*allStats);
  delete allStats;

  return ret;
}
//! populate this InputType with data from in stream int TreeInput::Read(std::istream& in,const std::vector<FactorType>& factorOrder) { const StaticData &staticData = StaticData::Instance(); string line; if (getline(in, line, '\n').eof()) return 0; // remove extra spaces //line = Trim(line); std::vector<XMLParseOutput> sourceLabels; std::vector<XmlOption*> xmlOptionsList; ProcessAndStripXMLTags(line, sourceLabels, xmlOptionsList); // do words 1st - hack stringstream strme; strme << line << endl; Sentence::Read(strme, factorOrder); // size input chart size_t sourceSize = GetSize(); m_sourceChart.resize(sourceSize); for (size_t pos = 0; pos < sourceSize; ++pos) { m_sourceChart[pos].resize(sourceSize - pos); } // do source labels vector<XMLParseOutput>::const_iterator iterLabel; for (iterLabel = sourceLabels.begin(); iterLabel != sourceLabels.end(); ++iterLabel) { const XMLParseOutput &labelItem = *iterLabel; const WordsRange &range = labelItem.m_range; const string &label = labelItem.m_label; AddChartLabel(range.GetStartPos() + 1, range.GetEndPos() + 1, label, factorOrder); } // default label for (size_t startPos = 0; startPos < sourceSize; ++startPos) { for (size_t endPos = startPos; endPos < sourceSize; ++endPos) { AddChartLabel(startPos, endPos, staticData.GetInputDefaultNonTerminal(), factorOrder); } } // XML Options //only fill the vector if we are parsing XML if (staticData.GetXmlInputType() != XmlPassThrough ) { //TODO: needed to handle exclusive //for (size_t i=0; i<GetSize(); i++) { // m_xmlCoverageMap.push_back(false); //} //iterXMLOpts will be empty for XmlIgnore //look at each column for(std::vector<XmlOption*>::const_iterator iterXmlOpts = xmlOptionsList.begin(); iterXmlOpts != xmlOptionsList.end(); iterXmlOpts++) { const XmlOption *xmlOption = *iterXmlOpts; TargetPhrase *targetPhrase = new TargetPhrase(xmlOption->targetPhrase); *targetPhrase = xmlOption->targetPhrase; // copy everything WordsRange *range = new WordsRange(xmlOption->range); const 
StackVec emptyStackVec; // hmmm... maybe dangerous, but it is never consulted TargetPhraseCollection *tpc = new TargetPhraseCollection; tpc->Add(targetPhrase); ChartTranslationOptions *transOpt = new ChartTranslationOptions(*tpc, emptyStackVec, *range, 0.0f); m_xmlChartOptionsList.push_back(transOpt); //TODO: needed to handle exclusive //for(size_t j=transOpt->GetSourceWordsRange().GetStartPos(); j<=transOpt->GetSourceWordsRange().GetEndPos(); j++) { // m_xmlCoverageMap[j]=true; //} delete xmlOption; } } return 1; }
bool PhraseDictionaryMemory::Load(const std::vector<FactorType> &input , const std::vector<FactorType> &output , const string &filePath , const vector<float> &weight , size_t tableLimit , const LMList &languageModels , float weightWP) { const_cast<LMList&>(languageModels).InitializeBeforeSentenceProcessing(); const StaticData &staticData = StaticData::Instance(); m_tableLimit = tableLimit; util::FilePiece inFile(filePath.c_str(), staticData.GetVerboseLevel() >= 1 ? &std::cerr : NULL); size_t line_num = 0; size_t numElement = NOT_FOUND; // 3=old format, 5=async format which include word alignment info const std::string& factorDelimiter = staticData.GetFactorDelimiter(); Phrase sourcePhrase(0); std::vector<float> scv; scv.reserve(m_numScoreComponent); TargetPhraseCollection *preSourceNode = NULL; std::string preSourceString; while(true) { ++line_num; StringPiece line; try { line = inFile.ReadLine(); } catch (util::EndOfFileException &e) { break; } util::TokenIter<util::MultiCharacter> pipes(line, util::MultiCharacter("|||")); StringPiece sourcePhraseString(GrabOrDie(pipes, filePath, line_num)); StringPiece targetPhraseString(GrabOrDie(pipes, filePath, line_num)); StringPiece scoreString(GrabOrDie(pipes, filePath, line_num)); bool isLHSEmpty = !util::TokenIter<util::AnyCharacter, true>(sourcePhraseString, util::AnyCharacter(" \t")); if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) { TRACE_ERR( filePath << ":" << line_num << ": pt entry contains empty source, skipping\n"); continue; } //target std::auto_ptr<TargetPhrase> targetPhrase(new TargetPhrase()); targetPhrase->CreateFromString(output, targetPhraseString, factorDelimiter); scv.clear(); for (util::TokenIter<util::AnyCharacter, true> token(scoreString, util::AnyCharacter(" \t")); token; ++token) { char *err_ind; // Token is always delimited by some form of space. Also, apparently strtod is portable but strtof isn't. 
scv.push_back(FloorScore(TransformScore(static_cast<float>(strtod(token->data(), &err_ind))))); if (err_ind == token->data()) { stringstream strme; strme << "Bad number " << token << " on line " << line_num; UserMessage::Add(strme.str()); abort(); } } if (scv.size() != m_numScoreComponent) { stringstream strme; strme << "Size of scoreVector != number (" <<scv.size() << "!=" <<m_numScoreComponent<<") of score components on line " << line_num; UserMessage::Add(strme.str()); abort(); } size_t consumed = 3; if (pipes) { targetPhrase->SetAlignmentInfo(*pipes++); ++consumed; } ScoreComponentCollection sparse; if (pipes) pipes++; //counts if (pipes) { //sparse features SparsePhraseDictionaryFeature* spdf = GetFeature()->GetSparsePhraseDictionaryFeature(); if (spdf) { sparse.Assign(spdf,(pipes++)->as_string()); } } // scv good to go sir! targetPhrase->SetScore(m_feature, scv, sparse, weight, weightWP, languageModels); // Check number of entries delimited by ||| agrees across all lines. for (; pipes; ++pipes, ++consumed) {} if (numElement != consumed) { if (numElement == NOT_FOUND) { numElement = consumed; } else { stringstream strme; strme << "Syntax error at " << filePath << ":" << line_num; UserMessage::Add(strme.str()); abort(); } } //TODO: Would be better to reuse source phrases, but ownership has to be //consistent across phrase table implementations sourcePhrase.Clear(); sourcePhrase.CreateFromString(input, sourcePhraseString, factorDelimiter); //Now that the source phrase is ready, we give the target phrase a copy targetPhrase->SetSourcePhrase(sourcePhrase); if (preSourceString == sourcePhraseString && preSourceNode) { preSourceNode->Add(targetPhrase.release()); } else { preSourceNode = CreateTargetPhraseCollection(sourcePhrase); preSourceNode->Add(targetPhrase.release()); preSourceString.assign(sourcePhraseString.data(), sourcePhraseString.size()); } } // sort each target phrase collection m_collection.Sort(m_tableLimit); /* // TODO ASK OLIVER WHY THIS IS NEEDED 
const_cast<LMList&>(languageModels).CleanUpAfterSentenceProcessing(); */ return true; }
/**
 * Append targetPhrase to targetPhraseColl by forwarding to
 * TargetPhraseCollection::Add.
 */
void PhraseDictionaryNewFormat::AddEquivPhrase(TargetPhraseCollection &targetPhraseColl, TargetPhrase *targetPhrase)
{
  TargetPhraseCollection &destination = targetPhraseColl;
  destination.Add(targetPhrase);
}