std::vector<TargetPhrase*> PhraseDictionaryTransliteration::CreateTargetPhrases(const Phrase &sourcePhrase, const string &outDir) const { std::vector<TargetPhrase*> ret; string outPath = outDir + "/out.txt"; ifstream outStream(outPath.c_str()); string line; while (getline(outStream, line)) { vector<string> toks; Tokenize(toks, line, "\t"); UTIL_THROW_IF2(toks.size() != 2, "Error in transliteration output file. Expecting word\tscore"); TargetPhrase *tp = new TargetPhrase(); Word &word = tp->AddWord(); word.CreateFromString(Output, m_output, toks[0], false); float score = Scan<float>(toks[1]); tp->GetScoreBreakdown().PlusEquals(this, score); // score of all other ff when this rule is being loaded tp->Evaluate(sourcePhrase, GetFeaturesToApply()); ret.push_back(tp); } outStream.close(); return ret; }
void PhraseDictionaryMultiModelCounts::CollectSufficientStatistics(const Phrase& src, vector<float> &fs, map<string,multiModelCountsStatistics*>* allStats) const //fill fs and allStats with statistics from models { for(size_t i = 0; i < m_numModels; ++i) { const PhraseDictionary &pd = *m_pd[i]; TargetPhraseCollection *ret_raw = (TargetPhraseCollection*) pd.GetTargetPhraseCollection( src); if (ret_raw != NULL) { TargetPhraseCollection::iterator iterTargetPhrase; for (iterTargetPhrase = ret_raw->begin(); iterTargetPhrase != ret_raw->end(); ++iterTargetPhrase) { TargetPhrase * targetPhrase = *iterTargetPhrase; vector<float> raw_scores = targetPhrase->GetScoreBreakdown().GetScoresForProducer(&pd); string targetString = targetPhrase->GetStringRep(m_output); if (allStats->find(targetString) == allStats->end()) { multiModelCountsStatistics * statistics = new multiModelCountsStatistics; statistics->targetPhrase = new TargetPhrase(*targetPhrase); //make a copy so that we don't overwrite the original phrase table info // zero out scores from original phrase table statistics->targetPhrase->GetScoreBreakdown().ZeroDenseFeatures(&pd); statistics->fst.resize(m_numModels); statistics->ft.resize(m_numModels); Scores scoreVector(5); scoreVector[0] = -raw_scores[0]; scoreVector[1] = -raw_scores[1]; scoreVector[2] = -raw_scores[2]; statistics->targetPhrase->GetScoreBreakdown().Assign(this, scoreVector); // set scores to 0 statistics->targetPhrase->Evaluate(src, GetFeaturesToApply()); (*allStats)[targetString] = statistics; } multiModelCountsStatistics * statistics = (*allStats)[targetString]; statistics->fst[i] = UntransformScore(raw_scores[0]); statistics->ft[i] = UntransformScore(raw_scores[1]); fs[i] = UntransformScore(raw_scores[2]); (*allStats)[targetString] = statistics; } } } // get target phrase frequency for models which have not seen the phrase pair for ( map< string, multiModelCountsStatistics*>::const_iterator iter = allStats->begin(); iter != allStats->end(); ++iter ) { 
multiModelCountsStatistics * statistics = iter->second; for (size_t i = 0; i < m_numModels; ++i) { if (!statistics->ft[i]) { statistics->ft[i] = GetTargetCount(static_cast<const Phrase&>(*statistics->targetPhrase), i); } } } }
/**
 * Create a target phrase from the 1st word of the source, prefixed with
 * 'SkeletonPT:'.
 *
 * Factor 0 of source word 0 is read, the prefix is prepended, and the result
 * becomes the single word of a newly allocated TargetPhrase. Every one of the
 * m_numScoreComponents scores of this phrase table is set to 1.3 before the
 * phrase is evaluated with the features from GetFeaturesToApply().
 *
 * @param sourcePhrase source phrase; CHECKed to be non-empty
 * @return heap-allocated target phrase; it is not freed here
 */
TargetPhrase *SkeletonPT::CreateTargetPhrase(const Phrase &sourcePhrase) const
{
  CHECK(sourcePhrase.GetSize());
  CHECK(m_output.size() == 1);

  string firstWord = sourcePhrase.GetWord(0).GetFactor(0)->GetString().as_string();
  string labelled = "SkeletonPT:" + firstWord;

  TargetPhrase *result = new TargetPhrase();
  Word &onlyWord = result->AddWord();
  onlyWord.CreateFromString(Output, m_output, labelled, false);

  // score for this phrase table
  vector<float> scores(m_numScoreComponents, 1.3);
  result->GetScoreBreakdown().PlusEquals(this, scores);

  // score of all other ff when this rule is being loaded
  result->Evaluate(sourcePhrase, GetFeaturesToApply());

  return result;
}
/**
 * Turn the collected per-model statistics into a scored target phrase
 * collection.
 *
 * For every entry of allStats the five scores of this feature are computed:
 *   [0] m_combineFunction(fst, ft, multimodelweights[0])
 *   [1] weighted lexical translation over m_lexTable_e2f with multimodelweights[1]
 *   [2] m_combineFunction(fst, fs, multimodelweights[2])
 *   [3] weighted lexical translation over m_lexTable_f2e with multimodelweights[3]
 *   [4] the constant 2.718
 * each passed through TransformScore and FloorScore. The entry's target phrase
 * is assigned these scores, evaluated, and a copy of it is added to the result.
 * Entries for which an AlignmentException is raised are skipped.
 *
 * @param src               source phrase
 * @param fs                per-model values used for score [2]
 * @param allStats          statistics per target string; emptied with
 *                          RemoveAllInMap and deleted before returning
 * @param multimodelweights weight vectors, indexed 0..3 as listed above
 * @return newly allocated collection; it is not freed here
 * @throws util::Exception when a target phrase has no terminal alignment
 *         (nothing allocated so far is released on that path)
 */
TargetPhraseCollection* PhraseDictionaryMultiModelCounts::CreateTargetPhraseCollectionCounts(const Phrase &src, vector<float> &fs, map<string,multiModelCountsStatistics*>* allStats, vector<vector<float> > &multimodelweights) const
{
  TargetPhraseCollection *combined = new TargetPhraseCollection();

  typedef map<string, multiModelCountsStatistics*>::const_iterator StatsIter;
  for (StatsIter statsIt = allStats->begin(); statsIt != allStats->end(); ++statsIt) {
    multiModelCountsStatistics *stats = statsIt->second;

    if (stats->targetPhrase->GetAlignTerm().GetSize() == 0) {
      UTIL_THROW(util::Exception, " alignment information empty\ncount-tables need to include alignment information for computation of lexical weights.\nUse --phrase-word-alignment during training; for on-disk tables, also set -alignment-info when creating on-disk tables.");
    }

    try {
      const Phrase &targetAsPhrase = static_cast<const Phrase&>(*stats->targetPhrase);

      pair<vector< set<size_t> >, vector< set<size_t> > > alignment =
        GetAlignmentsForLexWeights(src, targetAsPhrase, stats->targetPhrase->GetAlignTerm());
      vector< set<size_t> > &alignedToT = alignment.first;
      vector< set<size_t> > &alignedToS = alignment.second;

      double lexst = ComputeWeightedLexicalTranslation(targetAsPhrase, src, alignedToS, m_lexTable_e2f, multimodelweights[1], false );
      double lexts = ComputeWeightedLexicalTranslation(src, targetAsPhrase, alignedToT, m_lexTable_f2e, multimodelweights[3], true );

      Scores combinedScores(5);
      combinedScores[0] = FloorScore(TransformScore(m_combineFunction(stats->fst, stats->ft, multimodelweights[0])));
      combinedScores[1] = FloorScore(TransformScore(lexst));
      combinedScores[2] = FloorScore(TransformScore(m_combineFunction(stats->fst, fs, multimodelweights[2])));
      combinedScores[3] = FloorScore(TransformScore(lexts));
      combinedScores[4] = FloorScore(TransformScore(2.718));

      stats->targetPhrase->GetScoreBreakdown().Assign(this, combinedScores);
      stats->targetPhrase->Evaluate(src, GetFeaturesToApply());
    } catch (AlignmentException&) {
      // this phrase pair is left out of the result
      continue;
    }

    combined->Add(new TargetPhrase(*stats->targetPhrase));
  }

  // the statistics are consumed here: free the entries, then the map itself
  RemoveAllInMap(*allStats);
  delete allStats;

  return combined;
}
/**
 * Build the per-sentence rule table for inputSentence.
 *
 * Steps:
 *  1. Create a temporary directory from the template /tmp/moses.XXXXXX.
 *  2. Write the words of the sentence, excluding the first and last position,
 *     to "<dir>/in".
 *  3. Call m_FuzzyMatchWrapper->Extract(translationId, dir), which returns the
 *     name of a phrase-table file.
 *  4. Parse that file line by line ("|||"-separated: source, target, scores,
 *     alignment, optionally a fifth field) and add each rule to
 *     m_collection[translationId].
 *  5. Sort and prune every target phrase collection below the root node.
 *
 * @param inputSentence sentence to prepare rules for
 * @throws via UTIL_THROW_IF2 when the temporary directory cannot be created
 *
 * A line with the wrong number of fields or the wrong number of scores
 * registers a user message and calls abort().
 *
 * NOTE(review): the temporary directory is never removed; the cleanup call at
 * the end of the function was already commented out.
 */
void PhraseDictionaryFuzzyMatch::InitializeForInput(InputType const& inputSentence)
{
  char dirName[] = "/tmp/moses.XXXXXX";
  char *temp = mkdtemp(dirName);
  UTIL_THROW_IF2(temp == NULL, "Couldn't create temporary directory " << dirName);

  string dirNameStr(dirName);

  string inFileName(dirNameStr + "/in");
  ofstream inFile(inFileName.c_str());

  // Skip the first and last position. Written as i + 1 < size because
  // GetSize() - 1 would wrap around to a huge unsigned value for an empty input.
  const size_t inputSize = inputSentence.GetSize();
  for (size_t i = 1; i + 1 < inputSize; ++i) {
    inFile << inputSentence.GetWord(i);
  }
  inFile << endl;
  inFile.close();

  long translationId = inputSentence.GetTranslationId();
  string ptFileName = m_FuzzyMatchWrapper->Extract(translationId, dirNameStr);

  // populate with rules for this sentence
  PhraseDictionaryNodeMemory &rootNode = m_collection[translationId];

  // data from file
  InputFileStream inStream(ptFileName);

  // copied from class LoaderStandard
  PrintUserTime("Start loading fuzzy-match phrase model");

  const StaticData &staticData = StaticData::Instance();
  const std::string& factorDelimiter = staticData.GetFactorDelimiter();
  const size_t numScoreComponents = GetNumScoreComponents();

  string lineOrig;
  // Number of rules loaded so far. Skipped lines do not advance it, so it can
  // lag behind the real line number in the messages below.
  size_t count = 0;

  while(getline(inStream, lineOrig)) {
    // The table is always read in Moses format, so the line is used as is.
    const string &line = lineOrig;

    vector<string> tokens;
    vector<float> scoreVector;

    TokenizeMultiCharSeparator(tokens, line , "|||" );

    if (tokens.size() != 4 && tokens.size() != 5) {
      stringstream strme;
      strme << "Syntax error at " << ptFileName << ":" << count;
      UserMessage::Add(strme.str());
      abort();
    }

    const string &sourcePhraseString = tokens[0]
                 , &targetPhraseString = tokens[1]
                 , &scoreString        = tokens[2]
                 , &alignString        = tokens[3];

    // NOTE(review): the test looks at the source side while the message says
    // "empty target" - confirm which of the two is intended.
    bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
    if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
      TRACE_ERR( ptFileName << ":" << count << ": pt entry contains empty target, skipping\n");
      continue;
    }

    Tokenize<float>(scoreVector, scoreString);
    if (scoreVector.size() != numScoreComponents) {
      stringstream strme;
      strme << "Size of scoreVector != number (" << scoreVector.size() << "!="
            << numScoreComponents << ") of score components on line " << count;
      UserMessage::Add(strme.str());
      abort();
    }

    // parse source & find pt node

    // constituent labels, filled in by the CreateFromString calls below
    Word *sourceLHS = NULL;
    Word *targetLHS = NULL;

    // source
    Phrase sourcePhrase( 0);
    sourcePhrase.CreateFromString(Input, m_input, sourcePhraseString, factorDelimiter, &sourceLHS);

    // create target phrase obj
    TargetPhrase *targetPhrase = new TargetPhrase();
    targetPhrase->CreateFromString(Output, m_output, targetPhraseString, factorDelimiter, &targetLHS);

    // rest of target phrase
    targetPhrase->SetAlignmentInfo(alignString);
    targetPhrase->SetTargetLHS(targetLHS);

    // component score, for n-best output
    std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore);
    std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore);

    targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
    targetPhrase->Evaluate(sourcePhrase, GetFeaturesToApply());

    TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(rootNode, sourcePhrase, *targetPhrase, sourceLHS);
    phraseColl.Add(targetPhrase);

    count++;
  }

  // sort and prune each target phrase collection
  SortAndPrune(rootNode);

  //removedirectoryrecursively(dirName);
}