float GlobalLexicalModel::ScorePhrase( const TargetPhrase& targetPhrase ) const { const Sentence& input = *(m_local->input); float score = 0; for(size_t targetIndex = 0; targetIndex < targetPhrase.GetSize(); targetIndex++ ) { float sum = 0; const Word& targetWord = targetPhrase.GetWord( targetIndex ); VERBOSE(2,"glm " << targetWord << ": "); const DoubleHash::const_iterator targetWordHash = m_hash.find( &targetWord ); if( targetWordHash != m_hash.end() ) { SingleHash::const_iterator inputWordHash = targetWordHash->second.find( m_bias ); if( inputWordHash != targetWordHash->second.end() ) { VERBOSE(2,"*BIAS* " << inputWordHash->second); sum += inputWordHash->second; } set< const Word*, WordComparer > alreadyScored; // do not score a word twice for(size_t inputIndex = 0; inputIndex < input.GetSize(); inputIndex++ ) { const Word& inputWord = input.GetWord( inputIndex ); if ( alreadyScored.find( &inputWord ) == alreadyScored.end() ) { SingleHash::const_iterator inputWordHash = targetWordHash->second.find( &inputWord ); if( inputWordHash != targetWordHash->second.end() ) { VERBOSE(2," " << inputWord << " " << inputWordHash->second); sum += inputWordHash->second; } alreadyScored.insert( &inputWord ); } } } // Hal Daume says: 1/( 1 + exp [ - sum_i w_i * f_i ] ) VERBOSE(2," p=" << FloorScore( log(1/(1+exp(-sum))) ) << endl); score += FloorScore( log(1/(1+exp(-sum))) ); } return score; }
/**
 * Objective value for one candidate weight vector.
 *
 * The candidate weights are copied into a float vector (and passed through
 * normalizeWeights() when the model mode is "interpolate"). For every entry
 * of m_optimizerStats a score is computed for the feature selected by
 * m_iFeature; the entry contributes
 * -(FloorScore(TransformScore(score)) / TransformScore(2)) * f, where f is
 * the entry's frequency field.
 *
 * @param arg candidate weights, one row per model
 * @return accumulated contribution divided by the accumulated frequency
 * @throws util::Exception when m_iFeature is not 0, 1, 2 or 3
 */
double CrossEntropyCounts::operator() ( const dlib::matrix<double,0,1>& arg) const
{
  std::vector<float> weight_vector (m_model->m_numModels);
  for (int row = 0; row < arg.nr(); ++row) {
    weight_vector[row] = arg(row);
  }
  if (m_model->m_mode == "interpolate") {
    weight_vector = m_model->normalizeWeights(weight_vector);
  }

  double total = 0.0;
  double n = 0.0;
  typedef std::vector<multiModelCountsStatisticsOptimization*>::const_iterator StatsIter;
  for (StatsIter it = m_optimizerStats.begin(); it != m_optimizerStats.end(); ++it) {
    multiModelCountsStatisticsOptimization* entry = *it;
    const size_t f = entry->f;

    double score = 0;
    switch (m_iFeature) {
    case 0:
      score = m_model->m_combineFunction(entry->fst, entry->ft, weight_vector);
      break;
    case 1:
      score = m_model->ComputeWeightedLexicalTranslationFromCache(entry->lexCachee2f, weight_vector);
      break;
    case 2:
      score = m_model->m_combineFunction(entry->fst, entry->fs, weight_vector);
      break;
    case 3:
      score = m_model->ComputeWeightedLexicalTranslationFromCache(entry->lexCachef2e, weight_vector);
      break;
    default:
      UTIL_THROW(util::Exception, "Trying to optimize feature that I don't know. Aborting");
    }

    total -= (FloorScore(TransformScore(score))/TransformScore(2))*f;
    n += f;
  }
  return total/n;
}
//TODO this should be a factory function! TranslationOption::TranslationOption(const WordsRange &wordsRange , const TargetPhrase &targetPhrase , const InputType &inputType , int /*whatever*/) : m_targetPhrase(targetPhrase) , m_sourceWordsRange (wordsRange) , m_futureScore(0) { const UnknownWordPenaltyProducer *up = StaticData::Instance().GetUnknownWordPenaltyProducer(); if (up) { const ScoreProducer *scoreProducer = (const ScoreProducer *)up; // not sure why none of the c++ cast works vector<float> score(1); score[0] = FloorScore(-numeric_limits<float>::infinity()); m_scoreBreakdown.Assign(scoreProducer, score); } if (inputType.GetType() == SentenceInput) { Phrase phrase = inputType.GetSubString(wordsRange); m_sourcePhrase = new Phrase(phrase); } else { // TODO lex reordering with confusion network m_sourcePhrase = new Phrase(*targetPhrase.GetSourcePhrase()); //the target phrase from a confusion network/lattice has input scores that we want to keep m_scoreBreakdown.PlusEquals(targetPhrase.GetScoreBreakdown()); } }
/**
 * Query the SRILM model for one word.
 *
 * @param wordId  vocabulary index of the word being scored
 * @param context forwarded unchanged as the context argument of wordProb()
 * @return score: wordProb() passed through TransformLMScore and FloorScore;
 *         unknown: true when wordId equals m_unknownId
 */
LMResult LanguageModelSRI::GetValue(VocabIndex wordId, VocabIndex *context) const
{
  LMResult result;
  result.unknown = (m_unknownId == wordId);
  result.score = FloorScore(TransformLMScore(m_srilmModel->wordProb(wordId, context)));
  return result;
}
/**
 * Score the last word of contextFactor with the factored SRILM model.
 *
 * Each context word fills one row of widMatrix: column 0 holds a word-type id
 * (m_wtbid / m_wteid / m_wtid, chosen by comparing column 1 with the ids of
 * the sentence-start and sentence-end factors), and column j+1 holds the LM
 * id of the j-th ordered factor, or 0 when the word has no such factor.
 *
 * @param contextFactor context words, the word to score last
 * @return score: wordProb() passed through TransformLMScore and FloorScore;
 *         unknown: true when column 0 of the last row equals m_unknownId.
 *         For an empty context the score is 0 and unknown is false.
 */
LMResult LanguageModelParallelBackoff::GetValueForgotState(const std::vector<const Word*> &contextFactor, FFState & /*outState */) const
{
  LMResult ret;
  const size_t contextSize = contextFactor.size();
  if (contextSize == 0) {
    // Nothing to score. Without this guard contextSize - 1 wraps around and
    // is handed to wordProb() as a position.
    ret.score = 0;
    ret.unknown = false;
    return ret;
  }

  // NOTE(review): function-local static buffer reused across calls, so
  // concurrent calls would share it -- confirm callers are single-threaded.
  static WidMatrix widMatrix;
  const size_t numFactors = m_factorTypesOrdered.size();

  for (size_t i = 0; i < contextSize; i++) {
    // clear the row before filling it
    ::memset(widMatrix[i], 0, (numFactors + 1) * sizeof(VocabIndex));

    const Word &word = *contextFactor[i];
    for (size_t j = 0; j < numFactors; j++) {
      const Factor *factor = word[ m_factorTypesOrdered[j] ];
      if (factor == NULL)
        widMatrix[i][j + 1] = 0;
      else
        widMatrix[i][j + 1] = GetLmID(factor, j);
    }

    if (widMatrix[i][1] == GetLmID(m_sentenceStartArray[0], 0) ) {
      widMatrix[i][0] = m_wtbid;
    } else if (widMatrix[i][1] == GetLmID(m_sentenceEndArray[0], 0 )) {
      widMatrix[i][0] = m_wteid;
    } else {
      widMatrix[i][0] = m_wtid;
    }
  }

  ret.score = m_srilmModel->wordProb( widMatrix, contextSize - 1, contextSize );
  ret.score = FloorScore(TransformLMScore(ret.score));
  ret.unknown = (widMatrix[contextSize - 1][0] == m_unknownId);
  return ret;
}
/**
 * Objective value for one candidate weight vector.
 *
 * The candidate weights are copied into a float vector (and passed through
 * normalizeWeights() when the model mode is "interpolate"). For every entry
 * of m_optimizerStats the score is the inner product of the entry's
 * p[m_iFeature] values with the weights; the entry contributes
 * -(FloorScore(TransformScore(score)) / TransformScore(2)) * f, where f is
 * the entry's frequency field.
 *
 * @param arg candidate weights, one row per model
 * @return accumulated contribution divided by the accumulated frequency
 */
double CrossEntropy::operator() ( const dlib::matrix<double,0,1>& arg) const
{
  std::vector<float> weight_vector (m_model->m_numModels);
  for (int row = 0; row < arg.nr(); ++row) {
    weight_vector[row] = arg(row);
  }
  if (m_model->m_mode == "interpolate") {
    weight_vector = m_model->normalizeWeights(weight_vector);
  }

  double total = 0.0;
  double n = 0.0;
  typedef std::vector<multiModelStatisticsOptimization*>::const_iterator StatsIter;
  for (StatsIter it = m_optimizerStats.begin(); it != m_optimizerStats.end(); ++it) {
    multiModelStatisticsOptimization* entry = *it;
    const size_t f = entry->f;

    const double score = std::inner_product(entry->p[m_iFeature].begin(),
                                            entry->p[m_iFeature].end(),
                                            weight_vector.begin(),
                                            0.0);

    total -= (FloorScore(TransformScore(score))/TransformScore(2))*f;
    n += f;
  }
  return total/n;
}
/**
 * Transform, floor and serialise every score of a phrase pair.
 *
 * Each score is passed through TransformScore and FloorScore, written to os
 * as the raw bytes of a float, and registered with a score counter: counter
 * idx when m_multipleScoreTrees is set, counter 0 otherwise.
 *
 * @param scores scores in their on-disk (untransformed) form
 * @param os     stream receiving the binary floats
 */
void PhraseTableCreator::EncodeScores(std::vector<float>& scores, std::ostream& os)
{
  for (size_t idx = 0; idx < scores.size(); ++idx) {
    float encoded = FloorScore(TransformScore(scores[idx]));
    os.write(reinterpret_cast<const char*>(&encoded), sizeof(encoded));
    m_scoreCounters[m_multipleScoreTrees ? idx : 0]->Increase(encoded);
  }
}
/**
 * Build the target phrase collection for src from the collected count
 * statistics.
 *
 * For every entry of allStats the two lexical weights and the two combined
 * translation scores are computed, transformed, floored and assigned to the
 * entry's target phrase (the fifth score is FloorScore(TransformScore(2.718))).
 * A copy of the scored target phrase is added to the result. Entries for
 * which an AlignmentException is raised are skipped.
 *
 * @param src               source phrase
 * @param fs                forwarded to m_combineFunction for score 2
 * @param allStats          statistics per target phrase; emptied with
 *                          RemoveAllInMap and deleted before a normal return
 * @param multimodelweights weight vectors, indexed by feature (0..3)
 * @return newly allocated collection
 * @throws util::Exception when a target phrase carries no alignment info
 */
TargetPhraseCollection* PhraseDictionaryMultiModelCounts::CreateTargetPhraseCollectionCounts(const Phrase &src, vector<float> &fs, map<string,multiModelCountsStatistics*>* allStats, vector<vector<float> > &multimodelweights) const
{
  TargetPhraseCollection *ret = new TargetPhraseCollection();
  for ( map< string, multiModelCountsStatistics*>::const_iterator iter = allStats->begin(); iter != allStats->end(); ++iter ) {

    multiModelCountsStatistics * statistics = iter->second;

    if (statistics->targetPhrase->GetAlignTerm().GetSize() == 0) {
      // the partially built result is owned only by this function; release
      // it before the exception leaves
      delete ret;
      UTIL_THROW(util::Exception, " alignment information empty\ncount-tables need to include alignment information for computation of lexical weights.\nUse --phrase-word-alignment during training; for on-disk tables, also set -alignment-info when creating on-disk tables.");
    }

    try {
      pair<vector< set<size_t> >, vector< set<size_t> > > alignment = GetAlignmentsForLexWeights(src, static_cast<const Phrase&>(*statistics->targetPhrase), statistics->targetPhrase->GetAlignTerm());
      // refer to the two halves in place instead of deep-copying them
      vector< set<size_t> > &alignedToT = alignment.first;
      vector< set<size_t> > &alignedToS = alignment.second;

      double lexst = ComputeWeightedLexicalTranslation(static_cast<const Phrase&>(*statistics->targetPhrase), src, alignedToS, m_lexTable_e2f, multimodelweights[1], false );
      double lexts = ComputeWeightedLexicalTranslation(src, static_cast<const Phrase&>(*statistics->targetPhrase), alignedToT, m_lexTable_f2e, multimodelweights[3], true );

      Scores scoreVector(5);
      scoreVector[0] = FloorScore(TransformScore(m_combineFunction(statistics->fst, statistics->ft, multimodelweights[0])));
      scoreVector[1] = FloorScore(TransformScore(lexst));
      scoreVector[2] = FloorScore(TransformScore(m_combineFunction(statistics->fst, fs, multimodelweights[2])));
      scoreVector[3] = FloorScore(TransformScore(lexts));
      scoreVector[4] = FloorScore(TransformScore(2.718));

      statistics->targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
      statistics->targetPhrase->Evaluate(src, GetFeaturesToApply());
    } catch (const AlignmentException&) {
      // this target phrase cannot be scored; leave it out of the result
      continue;
    }

    ret->Add(new TargetPhrase(*statistics->targetPhrase));
  }

  RemoveAllInMap(*allStats);
  delete allStats;
  return ret;
}
//TODO this should be a factory function! TranslationOption::TranslationOption(const WordsRange &wordsRange , const TargetPhrase &targetPhrase , const InputType &inputType , const UnknownWordPenaltyProducer* up) : m_targetPhrase(targetPhrase) , m_sourceWordsRange (wordsRange) , m_futureScore(0) { if (up) { const ScoreProducer *scoreProducer = (const ScoreProducer *)up; // not sure why none of the c++ cast works vector<float> score(1); score[0] = FloorScore(-numeric_limits<float>::infinity()); m_scoreBreakdown.Assign(scoreProducer, score); } }
float LanguageModelInternal::GetValue(const Factor *factor0, State* finalState) const { float prob; const NGramNode *nGram = GetLmID(factor0); if (nGram == NULL) { if (finalState != NULL) *finalState = NULL; prob = -numeric_limits<float>::infinity(); } else { if (finalState != NULL) *finalState = static_cast<const void*>(nGram); prob = nGram->GetScore(); } return FloorScore(prob); }
float LanguageModelInternal::GetValue(const Factor *factor0, const Factor *factor1, State* finalState) const { float score; const NGramNode *nGram[2]; nGram[1] = GetLmID(factor1); if (nGram[1] == NULL) { if (finalState != NULL) *finalState = NULL; score = -numeric_limits<float>::infinity(); } else { nGram[0] = nGram[1]->GetNGram(factor0); if (nGram[0] == NULL) { // something unigram if (finalState != NULL) *finalState = static_cast<const void*>(nGram[1]); nGram[0] = GetLmID(factor0); if (nGram[0] == NULL) { // stops at unigram score = nGram[1]->GetScore(); } else { // unigram unigram score = nGram[1]->GetScore() + nGram[0]->GetLogBackOff(); } } else { // bigram if (finalState != NULL) *finalState = static_cast<const void*>(nGram[0]); score = nGram[0]->GetScore(); } } return FloorScore(score); }
/**
 * Score the last word of contextFactor with the RandLM model.
 *
 * @param contextFactor context words, the word to score last
 * @param finalState    forwarded unchanged to the model's getProb()
 * @return score: getProb() passed through TransformLMScore and FloorScore;
 *         unknown: true when the context is non-empty and the id of its last
 *         word equals m_oov_id
 */
LMResult LanguageModelRandLM::GetValue(const vector<const Word*> &contextFactor, State* finalState) const
{
  const FactorType factorType = GetFactorType();
  const int count = contextFactor.size();

  // set up context
  // NOTE(review): count is not checked against MAX_NGRAM_SIZE here -- confirm
  // callers never pass a longer context.
  randlm::WordID ngram[MAX_NGRAM_SIZE];
  for (int pos = 0; pos < count; ++pos) {
    const Word &word = *contextFactor[pos];
    ngram[pos] = GetLmID(word[factorType]);
  }

  int found = 0;
  LMResult ret;
  ret.score = FloorScore(TransformLMScore(m_lm->getProb(&ngram[0], count, &found, finalState)));
  ret.unknown = (count != 0) && (ngram[count - 1] == m_oov_id);
  return ret;
}
std::string LexicalReorderingTableCreator::EncodeLine(std::vector<std::string>& tokens) { std::string scoresString = tokens.back(); std::stringstream scoresStream; std::vector<float> scores; Tokenize<float>(scores, scoresString); if(!m_numScoreComponent) { m_numScoreComponent = scores.size(); m_scoreCounters.resize(m_multipleScoreTrees ? m_numScoreComponent : 1); for(std::vector<ScoreCounter*>::iterator it = m_scoreCounters.begin(); it != m_scoreCounters.end(); it++) *it = new ScoreCounter(); m_scoreTrees.resize(m_multipleScoreTrees ? m_numScoreComponent : 1); } if(m_numScoreComponent != scores.size()) { std::cerr << "Error: Wrong number of scores detected (" << scores.size() << " != " << m_numScoreComponent << ") :" << std::endl; std::cerr << "Line: " << tokens[0] << " ||| ... ||| " << scoresString << std::endl; LOGE("[mgjang] before abort\n"); abort(); } size_t c = 0; float score; while(c < m_numScoreComponent) { score = scores[c]; score = FloorScore(TransformScore(score)); scoresStream.write((char*)&score, sizeof(score)); m_scoreCounters[m_multipleScoreTrees ? c : 0]->Increase(score); c++; } return scoresStream.str(); }
bool RuleTableLoaderStandard::Load(FormatType format , const std::vector<FactorType> &input , const std::vector<FactorType> &output , const std::string &inFile , size_t /* tableLimit */ , RuleTableTrie &ruleTable) { PrintUserTime(string("Start loading text SCFG phrase table. ") + (format==MosesFormat?"Moses ":"Hiero ") + " format"); const StaticData &staticData = StaticData::Instance(); const std::string& factorDelimiter = staticData.GetFactorDelimiter(); string lineOrig; size_t count = 0; std::ostream *progress = NULL; IFVERBOSE(1) progress = &std::cerr; util::FilePiece in(inFile.c_str(), progress); // reused variables vector<float> scoreVector; StringPiece line; std::string hiero_before, hiero_after; double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan"); while(true) { try { line = in.ReadLine(); } catch (const util::EndOfFileException &e) { break; } if (format == HieroFormat) { // inefficiently reformat line hiero_before.assign(line.data(), line.size()); ReformatHieroRule(hiero_before, hiero_after); line = hiero_after; } util::TokenIter<util::MultiCharacter> pipes(line, "|||"); StringPiece sourcePhraseString(*pipes); StringPiece targetPhraseString(*++pipes); StringPiece scoreString(*++pipes); StringPiece alignString; if (++pipes) { StringPiece temp(*pipes); alignString = temp; } if (++pipes) { StringPiece str(*pipes); //counts } bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos); if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) { TRACE_ERR( ruleTable.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n"); continue; } scoreVector.clear(); for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) { int processed; float score = converter.StringToFloat(s->data(), s->length(), &processed); UTIL_THROW_IF(isnan(score), util::Exception, "Bad score " << *s << " on line " << count); 
scoreVector.push_back(FloorScore(TransformScore(score))); } const size_t numScoreComponents = ruleTable.GetNumScoreComponents(); if (scoreVector.size() != numScoreComponents) { stringstream strme; strme << "Size of scoreVector != number (" << scoreVector.size() << "!=" << numScoreComponents << ") of score components on line " << count; UserMessage::Add(strme.str()); abort(); } // parse source & find pt node // constituent labels Word *sourceLHS; Word *targetLHS; // create target phrase obj TargetPhrase *targetPhrase = new TargetPhrase(); targetPhrase->CreateFromString(Output, output, targetPhraseString, factorDelimiter, &targetLHS); // source Phrase sourcePhrase; sourcePhrase.CreateFromString(Input, input, sourcePhraseString, factorDelimiter, &sourceLHS); // rest of target phrase targetPhrase->SetAlignmentInfo(alignString); targetPhrase->SetTargetLHS(targetLHS); //targetPhrase->SetDebugOutput(string("New Format pt ") + line); if (++pipes) { StringPiece sparseString(*pipes); targetPhrase->SetSparseScore(&ruleTable, sparseString); } if (++pipes) { StringPiece propertiesString(*pipes); targetPhrase->SetProperties(propertiesString); } targetPhrase->GetScoreBreakdown().Assign(&ruleTable, scoreVector); targetPhrase->Evaluate(sourcePhrase, ruleTable.GetFeaturesToApply()); TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(ruleTable, sourcePhrase, *targetPhrase, sourceLHS); phraseColl.Add(targetPhrase); count++; } // sort and prune each target phrase collection SortAndPrune(ruleTable); return true; }
bool PhraseDictionaryMemory::Load(const std::vector<FactorType> &input , const std::vector<FactorType> &output , const string &filePath , const vector<float> &weight , size_t tableLimit , const LMList &languageModels , float weightWP) { const_cast<LMList&>(languageModels).InitializeBeforeSentenceProcessing(); const StaticData &staticData = StaticData::Instance(); m_tableLimit = tableLimit; util::FilePiece inFile(filePath.c_str(), staticData.GetVerboseLevel() >= 1 ? &std::cerr : NULL); size_t line_num = 0; size_t numElement = NOT_FOUND; // 3=old format, 5=async format which include word alignment info const std::string& factorDelimiter = staticData.GetFactorDelimiter(); Phrase sourcePhrase(0); std::vector<float> scv; scv.reserve(m_numScoreComponent); TargetPhraseCollection *preSourceNode = NULL; std::string preSourceString; while(true) { ++line_num; StringPiece line; try { line = inFile.ReadLine(); } catch (util::EndOfFileException &e) { break; } util::TokenIter<util::MultiCharacter> pipes(line, util::MultiCharacter("|||")); StringPiece sourcePhraseString(GrabOrDie(pipes, filePath, line_num)); StringPiece targetPhraseString(GrabOrDie(pipes, filePath, line_num)); StringPiece scoreString(GrabOrDie(pipes, filePath, line_num)); bool isLHSEmpty = !util::TokenIter<util::AnyCharacter, true>(sourcePhraseString, util::AnyCharacter(" \t")); if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) { TRACE_ERR( filePath << ":" << line_num << ": pt entry contains empty source, skipping\n"); continue; } //target std::auto_ptr<TargetPhrase> targetPhrase(new TargetPhrase()); targetPhrase->CreateFromString(output, targetPhraseString, factorDelimiter); scv.clear(); for (util::TokenIter<util::AnyCharacter, true> token(scoreString, util::AnyCharacter(" \t")); token; ++token) { char *err_ind; // Token is always delimited by some form of space. Also, apparently strtod is portable but strtof isn't. 
scv.push_back(FloorScore(TransformScore(static_cast<float>(strtod(token->data(), &err_ind))))); if (err_ind == token->data()) { stringstream strme; strme << "Bad number " << token << " on line " << line_num; UserMessage::Add(strme.str()); abort(); } } if (scv.size() != m_numScoreComponent) { stringstream strme; strme << "Size of scoreVector != number (" <<scv.size() << "!=" <<m_numScoreComponent<<") of score components on line " << line_num; UserMessage::Add(strme.str()); abort(); } size_t consumed = 3; if (pipes) { targetPhrase->SetAlignmentInfo(*pipes++); ++consumed; } ScoreComponentCollection sparse; if (pipes) pipes++; //counts if (pipes) { //sparse features SparsePhraseDictionaryFeature* spdf = GetFeature()->GetSparsePhraseDictionaryFeature(); if (spdf) { sparse.Assign(spdf,(pipes++)->as_string()); } } // scv good to go sir! targetPhrase->SetScore(m_feature, scv, sparse, weight, weightWP, languageModels); // Check number of entries delimited by ||| agrees across all lines. for (; pipes; ++pipes, ++consumed) {} if (numElement != consumed) { if (numElement == NOT_FOUND) { numElement = consumed; } else { stringstream strme; strme << "Syntax error at " << filePath << ":" << line_num; UserMessage::Add(strme.str()); abort(); } } //TODO: Would be better to reuse source phrases, but ownership has to be //consistent across phrase table implementations sourcePhrase.Clear(); sourcePhrase.CreateFromString(input, sourcePhraseString, factorDelimiter); //Now that the source phrase is ready, we give the target phrase a copy targetPhrase->SetSourcePhrase(sourcePhrase); if (preSourceString == sourcePhraseString && preSourceNode) { preSourceNode->Add(targetPhrase.release()); } else { preSourceNode = CreateTargetPhraseCollection(sourcePhrase); preSourceNode->Add(targetPhrase.release()); preSourceString.assign(sourcePhraseString.data(), sourcePhraseString.size()); } } // sort each target phrase collection m_collection.Sort(m_tableLimit); /* // TODO ASK OLIVER WHY THIS IS NEEDED 
const_cast<LMList&>(languageModels).CleanUpAfterSentenceProcessing(); */ return true; }
bool HyperTreeLoader::Load(AllOptions const& opts, const std::vector<FactorType> &input, const std::vector<FactorType> &output, const std::string &inFile, const RuleTableFF &ff, HyperTree &trie, boost::unordered_set<std::size_t> &sourceTermSet) { PrintUserTime(std::string("Start loading HyperTree")); sourceTermSet.clear(); std::size_t count = 0; std::ostream *progress = NULL; IFVERBOSE(1) progress = &std::cerr; util::FilePiece in(inFile.c_str(), progress); // reused variables std::vector<float> scoreVector; StringPiece line; double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan"); HyperPathLoader hyperPathLoader; Phrase dummySourcePhrase; { Word *lhs = NULL; dummySourcePhrase.CreateFromString(Input, input, "hello", &lhs); delete lhs; } while(true) { try { line = in.ReadLine(); } catch (const util::EndOfFileException &e) { break; } util::TokenIter<util::MultiCharacter> pipes(line, "|||"); StringPiece sourceString(*pipes); StringPiece targetString(*++pipes); StringPiece scoreString(*++pipes); StringPiece alignString; if (++pipes) { StringPiece temp(*pipes); alignString = temp; } ++pipes; // counts scoreVector.clear(); for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) { int processed; float score = converter.StringToFloat(s->data(), s->length(), &processed); UTIL_THROW_IF2(std::isnan(score), "Bad score " << *s << " on line " << count); scoreVector.push_back(FloorScore(TransformScore(score))); } const std::size_t numScoreComponents = ff.GetNumScoreComponents(); if (scoreVector.size() != numScoreComponents) { UTIL_THROW2("Size of scoreVector != number (" << scoreVector.size() << "!=" << numScoreComponents << ") of score components on line " << count); } // Source-side HyperPath sourceFragment; hyperPathLoader.Load(sourceString, sourceFragment); ExtractSourceTerminalSetFromHyperPath(sourceFragment, sourceTermSet); // Target-side TargetPhrase *targetPhrase = new 
TargetPhrase(&ff); Word *targetLHS = NULL; targetPhrase->CreateFromString(Output, output, targetString, &targetLHS); targetPhrase->SetTargetLHS(targetLHS); targetPhrase->SetAlignmentInfo(alignString); if (++pipes) { StringPiece sparseString(*pipes); targetPhrase->SetSparseScore(&ff, sparseString); } if (++pipes) { StringPiece propertiesString(*pipes); targetPhrase->SetProperties(propertiesString); } targetPhrase->GetScoreBreakdown().Assign(&ff, scoreVector); targetPhrase->EvaluateInIsolation(dummySourcePhrase, ff.GetFeaturesToApply()); // Add rule to trie. TargetPhraseCollection::shared_ptr phraseColl = GetOrCreateTargetPhraseCollection(trie, sourceFragment); phraseColl->Add(targetPhrase); count++; } // sort and prune each target phrase collection if (ff.GetTableLimit()) { SortAndPrune(trie, ff.GetTableLimit()); } return true; }
void GenerationDictionary::Load() { FactorCollection &factorCollection = FactorCollection::Instance(); const size_t numFeatureValuesInConfig = this->GetNumScoreComponents(); // data from file InputFileStream inFile(m_filePath); UTIL_THROW_IF2(!inFile.good(), "Couldn't read " << m_filePath); string line; size_t lineNum = 0; while(getline(inFile, line)) { ++lineNum; vector<string> token = Tokenize( line ); // add each line in generation file into class Word *inputWord = new Word(); // deleted in destructor Word outputWord; // create word with certain factors filled out // inputs vector<string> factorString = Tokenize( token[0], "|" ); for (size_t i = 0 ; i < GetInput().size() ; i++) { FactorType factorType = GetInput()[i]; const Factor *factor = factorCollection.AddFactor( Output, factorType, factorString[i]); inputWord->SetFactor(factorType, factor); } factorString = Tokenize( token[1], "|" ); for (size_t i = 0 ; i < GetOutput().size() ; i++) { FactorType factorType = GetOutput()[i]; const Factor *factor = factorCollection.AddFactor( Output, factorType, factorString[i]); outputWord.SetFactor(factorType, factor); } size_t numFeaturesInFile = token.size() - 2; if (numFeaturesInFile < numFeatureValuesInConfig) { stringstream strme; strme << m_filePath << ":" << lineNum << ": expected " << numFeatureValuesInConfig << " feature values, but found " << numFeaturesInFile << std::endl; throw strme.str(); } std::vector<float> scores(numFeatureValuesInConfig, 0.0f); for (size_t i = 0; i < numFeatureValuesInConfig; i++) scores[i] = FloorScore(TransformScore(Scan<float>(token[2+i]))); Collection::iterator iterWord = m_collection.find(inputWord); if (iterWord == m_collection.end()) { m_collection[inputWord][outputWord].Assign(this, scores); } else { // source word already in there. delete input word to avoid mem leak (iterWord->second)[outputWord].Assign(this, scores); delete inputWord; } } inFile.Close(); }
bool RuleTrieLoader::Load(const std::vector<FactorType> &input, const std::vector<FactorType> &output, const std::string &inFile, const RuleTableFF &ff, RuleTrie &trie) { PrintUserTime(std::string("Start loading text phrase table. Moses format")); const StaticData &staticData = StaticData::Instance(); // const std::string &factorDelimiter = staticData.GetFactorDelimiter(); std::size_t count = 0; std::ostream *progress = NULL; IFVERBOSE(1) progress = &std::cerr; util::FilePiece in(inFile.c_str(), progress); // reused variables std::vector<float> scoreVector; StringPiece line; double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan"); while(true) { try { line = in.ReadLine(); } catch (const util::EndOfFileException &e) { break; } util::TokenIter<util::MultiCharacter> pipes(line, "|||"); StringPiece sourcePhraseString(*pipes); StringPiece targetPhraseString(*++pipes); StringPiece scoreString(*++pipes); StringPiece alignString; if (++pipes) { StringPiece temp(*pipes); alignString = temp; } if (++pipes) { StringPiece str(*pipes); //counts } bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == std::string::npos); if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) { TRACE_ERR( ff.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n"); continue; } scoreVector.clear(); for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) { int processed; float score = converter.StringToFloat(s->data(), s->length(), &processed); UTIL_THROW_IF2(std::isnan(score), "Bad score " << *s << " on line " << count); scoreVector.push_back(FloorScore(TransformScore(score))); } const std::size_t numScoreComponents = ff.GetNumScoreComponents(); if (scoreVector.size() != numScoreComponents) { UTIL_THROW2("Size of scoreVector != number (" << scoreVector.size() << "!=" << numScoreComponents << ") of score components on line " << count); } // parse source & find pt 
node // constituent labels Word *sourceLHS = NULL; Word *targetLHS; // create target phrase obj TargetPhrase *targetPhrase = new TargetPhrase(&ff); // targetPhrase->CreateFromString(Output, output, targetPhraseString, factorDelimiter, &targetLHS); targetPhrase->CreateFromString(Output, output, targetPhraseString, &targetLHS); // source Phrase sourcePhrase; // sourcePhrase.CreateFromString(Input, input, sourcePhraseString, factorDelimiter, &sourceLHS); sourcePhrase.CreateFromString(Input, input, sourcePhraseString, &sourceLHS); // rest of target phrase targetPhrase->SetAlignmentInfo(alignString); targetPhrase->SetTargetLHS(targetLHS); //targetPhrase->SetDebugOutput(string("New Format pt ") + line); if (++pipes) { StringPiece sparseString(*pipes); targetPhrase->SetSparseScore(&ff, sparseString); } if (++pipes) { StringPiece propertiesString(*pipes); targetPhrase->SetProperties(propertiesString); } targetPhrase->GetScoreBreakdown().Assign(&ff, scoreVector); targetPhrase->EvaluateInIsolation(sourcePhrase, ff.GetFeaturesToApply()); TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection( trie, *sourceLHS, sourcePhrase); phraseColl.Add(targetPhrase); // not implemented correctly in memory pt. just delete it for now delete sourceLHS; count++; } // sort and prune each target phrase collection if (ff.GetTableLimit()) { SortAndPrune(trie, ff.GetTableLimit()); } return true; }
/**
 * Score the last word of contextFactor, asking the remote LM over the socket
 * on a cache miss.
 *
 * The context is walked through the m_cache tree (a missing factor maps to
 * BOS for history words and EOS for the scored word). A node whose prob is
 * non-zero is answered from the cache. Otherwise a "prob <word> <history...>"
 * line is written to sock, with at most m_nGramOrder words in total and the
 * history most recent first, and up to 6 reply bytes are read back. The
 * first sizeof(float) of them are taken as the raw probability, which is
 * transformed, floored and stored in the cache node.
 *
 * @param contextFactor context words, the word to score last
 * @param finalState    optional out-parameter; receives the cache node's
 *                      back-off state (NULL for an empty context)
 * @return score as described; unknown is always false. An empty context
 *         yields score 0.
 */
LMResult LanguageModelRemote::GetValue(const std::vector<const Word*> &contextFactor, State* finalState) const
{
  LMResult ret;
  ret.unknown = false;
  const size_t count = contextFactor.size();
  if (count == 0) {
    if (finalState) *finalState = NULL;
    ret.score = 0.0;
    return ret;
  }

  size_t max = m_nGramOrder;
  const FactorType factor = GetFactorType();
  if (max > count) max = count;

  // locate (or create) the cache node for this context
  Cache* cur = &m_cache;
  const int pc = static_cast<int>(count) - 1;
  for (int i = 0; i < pc; ++i) {
    const Factor* f = contextFactor[i]->GetFactor(factor);
    cur = &cur->tree[f ? f : BOS];
  }
  const Factor* event_word = contextFactor[pc]->GetFactor(factor);
  cur = &cur->tree[event_word ? event_word : EOS];
  if (cur->prob) {
    if (finalState) *finalState = cur->boState;
    ret.score = cur->prob;
    return ret;
  }
  cur->boState = *reinterpret_cast<const State*>(&m_curId);
  ++m_curId;

  // build the request line
  std::ostringstream os;
  os << "prob ";
  if (event_word == NULL) {
    os << "</s>";
  } else {
    os << event_word->GetString();
  }
  for (size_t i=1; i<max; i++) {
    const Factor* f = contextFactor[count-1-i]->GetFactor(factor);
    if (f == NULL) {
      os << " <s>";
    } else {
      os << ' ' << f->GetString();
    }
  }
  os << std::endl;
  const std::string out = os.str();
  // NOTE(review): the result of write() is not checked (short or failed
  // writes go unnoticed).
  write(sock, out.c_str(), out.size());

  // Read the reply. Every pass issues a fresh read() at the current offset,
  // so short reads are continued and failed reads are actually retried.
  const int kReplySize = 6;
  char res[kReplySize] = {0};
  int cnt = 0;
  int errors = 0;
  while (cnt < kReplySize) {
    const int r = read(sock, &res[cnt], kReplySize - cnt);
    if (r < 0) {
      errors++;
      sleep(1);
      //std::cerr << "Error: read()\n";
      if (errors > 5) exit(1);
    } else if (r == 0) {
      break; // peer closed the connection
    } else {
      cnt += r;
      if (res[cnt - 1] == '\n') break; // reply terminated early
    }
  }

  // copy the bytes out instead of dereferencing res through a float pointer
  float rawProb = 0;
  char *rawBytes = reinterpret_cast<char*>(&rawProb);
  for (size_t b = 0; b < sizeof(rawProb); ++b) {
    rawBytes[b] = res[b];
  }

  cur->prob = FloorScore(TransformLMScore(rawProb));
  if (finalState) {
    *finalState = cur->boState;
  }
  ret.score = cur->prob;
  return ret;
}
bool RuleTableLoaderCompact::LoadRuleSection( LineReader &reader, const std::vector<Word> &vocab, const std::vector<Phrase> &sourcePhrases, const std::vector<Phrase> &targetPhrases, const std::vector<size_t> &targetLhsIds, const std::vector<const AlignmentInfo *> &alignmentSets, RuleTableTrie &ruleTable) { // Read rule count. reader.ReadLine(); const size_t ruleCount = std::atoi(reader.m_line.c_str()); // Read rules and add to table. const size_t numScoreComponents = ruleTable.GetNumScoreComponents(); std::vector<float> scoreVector(numScoreComponents); std::vector<size_t> tokenPositions; for (size_t i = 0; i < ruleCount; ++i) { reader.ReadLine(); tokenPositions.clear(); FindTokens(tokenPositions, reader.m_line); const char *charLine = reader.m_line.c_str(); // The first three tokens are IDs for the source phrase, target phrase, // and alignment set. const int sourcePhraseId = std::atoi(charLine+tokenPositions[0]); const int targetPhraseId = std::atoi(charLine+tokenPositions[1]); const int alignmentSetId = std::atoi(charLine+tokenPositions[2]); const Phrase &sourcePhrase = sourcePhrases[sourcePhraseId]; const Phrase &targetPhrasePhrase = targetPhrases[targetPhraseId]; const Word *targetLhs = new Word(vocab[targetLhsIds[targetPhraseId]]); Word sourceLHS("X"); // TODO not implemented for compact const AlignmentInfo *alignNonTerm = alignmentSets[alignmentSetId]; // Then there should be one score for each score component. for (size_t j = 0; j < numScoreComponents; ++j) { float score = std::atof(charLine+tokenPositions[3+j]); scoreVector[j] = FloorScore(TransformScore(score)); } if (reader.m_line[tokenPositions[3+numScoreComponents]] != ':') { std::stringstream msg; msg << "Size of scoreVector != number (" << scoreVector.size() << "!=" << numScoreComponents << ") of score components on line " << reader.m_lineNum; UserMessage::Add(msg.str()); return false; } // The remaining columns are currently ignored. // Create and score target phrase. 
TargetPhrase *targetPhrase = new TargetPhrase(targetPhrasePhrase); targetPhrase->SetAlignNonTerm(alignNonTerm); targetPhrase->SetTargetLHS(targetLhs); targetPhrase->SetSourcePhrase(sourcePhrase); targetPhrase->Evaluate(sourcePhrase, ruleTable.GetFeaturesToApply()); // Insert rule into table. TargetPhraseCollection &coll = GetOrCreateTargetPhraseCollection( ruleTable, sourcePhrase, *targetPhrase, &sourceLHS); coll.Add(targetPhrase); } return true; }
/** * Process a sentence with xml annotation * Xml tags may specifiy additional/replacing translation options * and reordering constraints * * \param line in: sentence, out: sentence without the xml * \param res vector with translation options specified by xml * \param reorderingConstraint reordering constraint zones specified by xml * \param walls reordering constraint walls specified by xml * \param lbrackStr xml tag's left bracket string, typically "<" * \param rbrackStr xml tag's right bracket string, typically ">" */ bool ProcessAndStripXMLTags(string &line, vector<XmlOption*> &res, ReorderingConstraint &reorderingConstraint, vector< size_t > &walls, std::vector< std::pair<size_t, std::string> > &placeholders, int offset, const std::string& lbrackStr, const std::string& rbrackStr) { //parse XML markup in translation line const StaticData &staticData = StaticData::Instance(); // hack. What pt should XML trans opt be assigned to? PhraseDictionary *firstPt = NULL; if (PhraseDictionary::GetColl().size() == 0) { firstPt = PhraseDictionary::GetColl()[0]; } // no xml tag? we're done. //if (line.find_first_of('<') == string::npos) { if (line.find(lbrackStr) == string::npos) { return true; } // break up input into a vector of xml tags and text // example: (this), (<b>), (is a), (</b>), (test .) 
vector<string> xmlTokens = TokenizeXml(line, lbrackStr, rbrackStr); // we need to store opened tags, until they are closed // tags are stored as tripled (tagname, startpos, contents) typedef pair< string, pair< size_t, string > > OpenedTag; vector< OpenedTag > tagStack; // stack that contains active opened tags string cleanLine; // return string (text without xml) size_t wordPos = 0; // position in sentence (in terms of number of words) const vector<FactorType> &outputFactorOrder = staticData.GetOutputFactorOrder(); // const string &factorDelimiter = staticData.GetFactorDelimiter(); // loop through the tokens for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) { // not a xml tag, but regular text (may contain many words) if(!isXmlTag(xmlTokens[xmlTokenPos], lbrackStr, rbrackStr)) { // add a space at boundary, if necessary if (cleanLine.size()>0 && cleanLine[cleanLine.size() - 1] != ' ' && xmlTokens[xmlTokenPos][0] != ' ') { cleanLine += " "; } cleanLine += xmlTokens[xmlTokenPos]; // add to output wordPos = Tokenize(cleanLine).size(); // count all the words } // process xml tag else { // *** get essential information about tag *** // strip extra boundary spaces and "<" and ">" string tag = Trim(TrimXml(xmlTokens[xmlTokenPos], lbrackStr, rbrackStr)); VERBOSE(3,"XML TAG IS: " << tag << std::endl); if (tag.size() == 0) { TRACE_ERR("ERROR: empty tag name: " << line << endl); return false; } // check if unary (e.g., "<wall/>") bool isUnary = ( tag[tag.size() - 1] == '/' ); // check if opening tag (e.g. 
"<a>", not "</a>")g bool isClosed = ( tag[0] == '/' ); bool isOpen = !isClosed; if (isClosed && isUnary) { TRACE_ERR("ERROR: can't have both closed and unary tag " << lbrackStr << tag << rbrackStr << ": " << line << endl); return false; } if (isClosed) tag = tag.substr(1); // remove "/" at the beginning if (isUnary) tag = tag.substr(0,tag.size()-1); // remove "/" at the end // find the tag name and contents string::size_type endOfName = tag.find_first_of(' '); string tagName = tag; string tagContent = ""; if (endOfName != string::npos) { tagName = tag.substr(0,endOfName); tagContent = tag.substr(endOfName+1); } // *** process new tag *** if (isOpen || isUnary) { // put the tag on the tag stack OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) ); tagStack.push_back( openedTag ); VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl); } // *** process completed tag *** if (isClosed || isUnary) { // pop last opened tag from stack; if (tagStack.size() == 0) { TRACE_ERR("ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl); return false; } OpenedTag openedTag = tagStack.back(); tagStack.pop_back(); // tag names have to match if (openedTag.first != tagName) { TRACE_ERR("ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl ); return false; } // assemble remaining information about tag size_t startPos = openedTag.second.first; string tagContent = openedTag.second.second; size_t endPos = wordPos; // span attribute overwrites position string span = ParseXmlTagAttribute(tagContent,"span"); if (! 
span.empty()) { vector<string> ij = Tokenize(span, "-"); if (ij.size() != 1 && ij.size() != 2) { TRACE_ERR("ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl); return false; } startPos = atoi(ij[0].c_str()); if (ij.size() == 1) endPos = startPos + 1; else endPos = atoi(ij[1].c_str()) + 1; } VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl); // special tag: wall if (tagName == "wall") { size_t start = (startPos == 0) ? 0 : startPos-1; for(size_t pos = start; pos < endPos; pos++) walls.push_back( pos ); } // special tag: zone else if (tagName == "zone") { if (startPos >= endPos) { TRACE_ERR("ERROR: zone must span at least one word: " << line << endl); return false; } reorderingConstraint.SetZone( startPos, endPos-1 ); } // name-entity placeholder else if (tagName == "ne") { if (startPos != (endPos - 1)) { TRACE_ERR("ERROR: Placeholder must only span 1 word: " << line << endl); return false; } string entity = ParseXmlTagAttribute(tagContent,"entity"); placeholders.push_back(std::pair<size_t, std::string>(startPos, entity)); } // update: add new aligned sentence pair to Mmsapt identified by name else if (tagName == "update") { #if PT_UG // get model name and aligned sentence pair string pdName = ParseXmlTagAttribute(tagContent,"name"); string source = ParseXmlTagAttribute(tagContent,"source"); string target = ParseXmlTagAttribute(tagContent,"target"); string alignment = ParseXmlTagAttribute(tagContent,"alignment"); // find PhraseDictionary by name const vector<PhraseDictionary*> &pds = PhraseDictionary::GetColl(); PhraseDictionary* pd = NULL; for (vector<PhraseDictionary*>::const_iterator i = pds.begin(); i != pds.end(); ++i) { PhraseDictionary* curPd = *i; if (curPd->GetScoreProducerDescription() == pdName) { pd = curPd; break; } } if (pd == NULL) { TRACE_ERR("ERROR: No PhraseDictionary with name " << pdName << ", no update" << endl); 
return false; } // update model VERBOSE(3,"Updating " << pdName << " ||| " << source << " ||| " << target << " ||| " << alignment << endl); Mmsapt* pdsa = reinterpret_cast<Mmsapt*>(pd); pdsa->add(source, target, alignment); #else TRACE_ERR("ERROR: recompile with --with-mm to update PhraseDictionary at runtime" << endl); return false; #endif } // weight-overwrite: update feature weights, unspecified weights remain unchanged // IMPORTANT: translation models that cache phrases or apply table-limit during load // based on initial weights need to be reset. Sending an empty update will do this // for PhraseDictionaryBitextSampling (Mmsapt) models: // <update name="TranslationModelName" source=" " target=" " alignment=" " /> else if (tagName == "weight-overwrite") { // is a name->ff map stored anywhere so we don't have to build it every time? const vector<FeatureFunction*> &ffs = FeatureFunction::GetFeatureFunctions(); boost::unordered_map<string, FeatureFunction*> map; BOOST_FOREACH(FeatureFunction* const& ff, ffs) { map[ff->GetScoreProducerDescription()] = ff; } // update each weight listed ScoreComponentCollection allWeights = StaticData::Instance().GetAllWeights(); boost::unordered_map<string, FeatureFunction*>::iterator ffi; string ffName(""); vector<float> ffWeights; vector<string> toks = Tokenize(ParseXmlTagAttribute(tagContent,"weights")); BOOST_FOREACH(string const& tok, toks) { if (tok.substr(tok.size() - 1, 1) == "=") { // start new feature if (ffName != "") { // set previous feature weights if (ffi != map.end()) { allWeights.Assign(ffi->second, ffWeights); } ffWeights.clear(); } ffName = tok.substr(0, tok.size() - 1); ffi = map.find(ffName); if (ffi == map.end()) { TRACE_ERR("ERROR: No FeatureFunction with name " << ffName << ", no weight update" << endl); } } else { // weight for current feature ffWeights.push_back(Scan<float>(tok)); } } if (ffi != map.end()) { allWeights.Assign(ffi->second, ffWeights); } 
StaticData::InstanceNonConst().SetAllWeights(allWeights); } // default: opening tag that specifies translation options else { if (startPos > endPos) { TRACE_ERR("ERROR: tag " << tagName << " startPos > endPos: " << line << endl); return false; } else if (startPos == endPos) { TRACE_ERR("WARNING: tag " << tagName << " 0 span: " << line << endl); continue; } // specified translations -> vector of phrases // multiple translations may be specified, separated by "||" vector<string> altTexts = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"translation"), "||"); if( altTexts.size() == 1 && altTexts[0] == "" ) altTexts.pop_back(); // happens when nothing specified // deal with legacy annotations: "translation" was called "english" vector<string> moreAltTexts = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"english"), "||"); if (moreAltTexts.size()>1 || moreAltTexts[0] != "") { for(vector<string>::iterator translation=moreAltTexts.begin(); translation != moreAltTexts.end(); translation++) { string t = *translation; altTexts.push_back( t ); } } // specified probabilities for the translations -> vector of probs vector<string> altProbs = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"prob"), "||"); if( altProbs.size() == 1 && altProbs[0] == "" ) altProbs.pop_back(); // happens when nothing specified // report what we have processed so far VERBOSE(3,"XML TAG NAME IS: '" << tagName << "'" << endl); VERBOSE(3,"XML TAG TRANSLATION IS: '" << altTexts[0] << "'" << endl); VERBOSE(3,"XML TAG PROB IS: '" << altProbs[0] << "'" << endl); VERBOSE(3,"XML TAG SPAN IS: " << startPos << "-" << (endPos-1) << endl); if (altProbs.size() > 0 && altTexts.size() != altProbs.size()) { TRACE_ERR("ERROR: Unequal number of probabilities and translation alternatives: " << line << endl); return false; } // store translation options into members if (staticData.GetXmlInputType() != XmlIgnore) { // only store options if we aren't ignoring them for (size_t i=0; 
i<altTexts.size(); ++i) { Phrase sourcePhrase; // TODO don't know what the source phrase is // set default probability float probValue = 1; if (altProbs.size() > 0) probValue = Scan<float>(altProbs[i]); // convert from prob to log-prob float scoreValue = FloorScore(TransformScore(probValue)); WordsRange range(startPos + offset,endPos-1 + offset); // span covered by phrase TargetPhrase targetPhrase(firstPt); // targetPhrase.CreateFromString(Output, outputFactorOrder,altTexts[i],factorDelimiter, NULL); targetPhrase.CreateFromString(Output, outputFactorOrder,altTexts[i], NULL); // lhs const UnknownLHSList &lhsList = staticData.GetUnknownLHS(); if (!lhsList.empty()) { const Factor *factor = FactorCollection::Instance().AddFactor(lhsList[0].first, true); Word *targetLHS = new Word(true); targetLHS->SetFactor(0, factor); // TODO - other factors too? targetPhrase.SetTargetLHS(targetLHS); } targetPhrase.SetXMLScore(scoreValue); targetPhrase.EvaluateInIsolation(sourcePhrase); XmlOption *option = new XmlOption(range,targetPhrase); assert(option); res.push_back(option); } altTexts.clear(); altProbs.clear(); } } } } }
/** * Process a sentence with xml annotation * Xml tags may specifiy additional/replacing translation options * and reordering constraints * * \param line in: sentence, out: sentence without the xml * \param res vector with translation options specified by xml * \param reorderingConstraint reordering constraint zones specified by xml * \param walls reordering constraint walls specified by xml */ bool TreeInput::ProcessAndStripXMLTags(string &line, std::vector<XMLParseOutput> &sourceLabels, std::vector<XmlOption*> &xmlOptions) { //parse XML markup in translation line // no xml tag? we're done. if (line.find_first_of('<') == string::npos) { return true; } // break up input into a vector of xml tags and text // example: (this), (<b>), (is a), (</b>), (test .) vector<string> xmlTokens = TokenizeXml(line); // we need to store opened tags, until they are closed // tags are stored as tripled (tagname, startpos, contents) typedef pair< string, pair< size_t, string > > OpenedTag; vector< OpenedTag > tagStack; // stack that contains active opened tags string cleanLine; // return string (text without xml) size_t wordPos = 0; // position in sentence (in terms of number of words) // keep this handy for later const vector<FactorType> &outputFactorOrder = StaticData::Instance().GetOutputFactorOrder(); const string &factorDelimiter = StaticData::Instance().GetFactorDelimiter(); // loop through the tokens for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) { // not a xml tag, but regular text (may contain many words) if(!isXmlTag(xmlTokens[xmlTokenPos])) { // add a space at boundary, if necessary if (cleanLine.size()>0 && cleanLine[cleanLine.size() - 1] != ' ' && xmlTokens[xmlTokenPos][0] != ' ') { cleanLine += " "; } cleanLine += xmlTokens[xmlTokenPos]; // add to output wordPos = Tokenize(cleanLine).size(); // count all the words } // process xml tag else { // *** get essential information about tag *** // strip extra boundary spaces and "<" and ">" 
string tag = Trim(TrimXml(xmlTokens[xmlTokenPos])); VERBOSE(3,"XML TAG IS: " << tag << std::endl); if (tag.size() == 0) { TRACE_ERR("ERROR: empty tag name: " << line << endl); return false; } // check if unary (e.g., "<wall/>") bool isUnary = ( tag[tag.size() - 1] == '/' ); // check if opening tag (e.g. "<a>", not "</a>")g bool isClosed = ( tag[0] == '/' ); bool isOpen = !isClosed; if (isClosed && isUnary) { TRACE_ERR("ERROR: can't have both closed and unary tag <" << tag << ">: " << line << endl); return false; } if (isClosed) tag = tag.substr(1); // remove "/" at the beginning if (isUnary) tag = tag.substr(0,tag.size()-1); // remove "/" at the end // find the tag name and contents string::size_type endOfName = tag.find_first_of(' '); string tagName = tag; string tagContent = ""; if (endOfName != string::npos) { tagName = tag.substr(0,endOfName); tagContent = tag.substr(endOfName+1); } // *** process new tag *** if (isOpen || isUnary) { // put the tag on the tag stack OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) ); tagStack.push_back( openedTag ); VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl); } // *** process completed tag *** if (isClosed || isUnary) { // pop last opened tag from stack; if (tagStack.size() == 0) { TRACE_ERR("ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl); return false; } OpenedTag openedTag = tagStack.back(); tagStack.pop_back(); // tag names have to match if (openedTag.first != tagName) { TRACE_ERR("ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl ); return false; } // assemble remaining information about tag size_t startPos = openedTag.second.first; string tagContent = openedTag.second.second; size_t endPos = wordPos; // span attribute overwrites position string span = ParseXmlTagAttribute(tagContent,"span"); if (! 
span.empty()) { vector<string> ij = Tokenize(span, "-"); if (ij.size() != 1 && ij.size() != 2) { TRACE_ERR("ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl); return false; } startPos = atoi(ij[0].c_str()); if (ij.size() == 1) endPos = startPos + 1; else endPos = atoi(ij[1].c_str()) + 1; } VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl); if (startPos >= endPos) { TRACE_ERR("ERROR: tag " << tagName << " must span at least one word: " << line << endl); return false; } // may be either a input span label ("label"), or a specified output translation "translation" string label = ParseXmlTagAttribute(tagContent,"label"); string translation = ParseXmlTagAttribute(tagContent,"translation"); // specified label if (translation.length() == 0 && label.length() > 0) { WordsRange range(startPos,endPos-1); // really? XMLParseOutput item(label, range); sourceLabels.push_back(item); } // specified translations -> vector of phrases, separated by "||" if (translation.length() > 0 && StaticData::Instance().GetXmlInputType() != XmlIgnore) { vector<string> altTexts = TokenizeMultiCharSeparator(translation, "||"); vector<string> altLabel = TokenizeMultiCharSeparator(label, "||"); vector<string> altProbs = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"prob"), "||"); //TRACE_ERR("number of translations: " << altTexts.size() << endl); for (size_t i=0; i<altTexts.size(); ++i) { // set target phrase TargetPhrase targetPhrase(Output); targetPhrase.CreateFromString(outputFactorOrder,altTexts[i],factorDelimiter); // set constituent label string targetLHSstr; if (altLabel.size() > i && altLabel[i].size() > 0) { targetLHSstr = altLabel[i]; } else { const UnknownLHSList &lhsList = StaticData::Instance().GetUnknownLHS(); UnknownLHSList::const_iterator iterLHS = lhsList.begin(); targetLHSstr = iterLHS->first; } Word targetLHS(true); 
targetLHS.CreateFromString(Output, outputFactorOrder, targetLHSstr, true); CHECK(targetLHS.GetFactor(0) != NULL); targetPhrase.SetTargetLHS(targetLHS); // get probability float probValue = 1; if (altProbs.size() > i && altProbs[i].size() > 0) { probValue = Scan<float>(altProbs[i]); } // convert from prob to log-prob float scoreValue = FloorScore(TransformScore(probValue)); targetPhrase.SetScore(scoreValue); // set span and create XmlOption WordsRange range(startPos+1,endPos); XmlOption *option = new XmlOption(range,targetPhrase); CHECK(option); xmlOptions.push_back(option); VERBOSE(2,"xml translation = [" << range << "] " << targetLHSstr << " -> " << altTexts[i] << " prob: " << probValue << endl); } altTexts.clear(); altProbs.clear(); } } } } // we are done. check if there are tags that are still open if (tagStack.size() > 0) { TRACE_ERR("ERROR: some opened tags were never closed: " << line << endl); return false; } // return de-xml'ed sentence in line line = cleanLine; return true; }
void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range, ChartParserCallback &to) { // unknown word, add as trans opt const StaticData &staticData = StaticData::Instance(); const UnknownWordPenaltyProducer &unknownWordPenaltyProducer = UnknownWordPenaltyProducer::Instance(); size_t isDigit = 0; if (staticData.GetDropUnknown()) { const Factor *f = sourceWord[0]; // TODO hack. shouldn't know which factor is surface const StringPiece s = f->GetString(); isDigit = s.find_first_of("0123456789"); if (isDigit == string::npos) isDigit = 0; else isDigit = 1; // modify the starting bitmap } Phrase* unksrc = new Phrase(1); unksrc->AddWord() = sourceWord; Word &newWord = unksrc->GetWord(0); newWord.SetIsOOV(true); m_unksrcs.push_back(unksrc); //TranslationOption *transOpt; if (! staticData.GetDropUnknown() || isDigit) { // loop const UnknownLHSList &lhsList = staticData.GetUnknownLHS(); UnknownLHSList::const_iterator iterLHS; for (iterLHS = lhsList.begin(); iterLHS != lhsList.end(); ++iterLHS) { const string &targetLHSStr = iterLHS->first; float prob = iterLHS->second; // lhs //const Word &sourceLHS = staticData.GetInputDefaultNonTerminal(); Word *targetLHS = new Word(true); targetLHS->CreateFromString(Output, staticData.GetOutputFactorOrder(), targetLHSStr, true); UTIL_THROW_IF2(targetLHS->GetFactor(0) == NULL, "Null factor for target LHS"); // add to dictionary TargetPhrase *targetPhrase = new TargetPhrase(); Word &targetWord = targetPhrase->AddWord(); targetWord.CreateUnknownWord(sourceWord); // scores float unknownScore = FloorScore(TransformScore(prob)); targetPhrase->GetScoreBreakdown().Assign(&unknownWordPenaltyProducer, unknownScore); targetPhrase->Evaluate(*unksrc); targetPhrase->SetTargetLHS(targetLHS); targetPhrase->SetAlignmentInfo("0-0"); if (staticData.IsDetailedTreeFragmentsTranslationReportingEnabled()) { targetPhrase->SetProperty("Tree","[ " + (*targetLHS)[0]->GetString().as_string() + " "+sourceWord[0]->GetString().as_string()+" ]"); 
} // chart rule to.AddPhraseOOV(*targetPhrase, m_cacheTargetPhraseCollection, range); } // for (iterLHS } else { // drop source word. create blank trans opt float unknownScore = FloorScore(-numeric_limits<float>::infinity()); TargetPhrase *targetPhrase = new TargetPhrase(); // loop const UnknownLHSList &lhsList = staticData.GetUnknownLHS(); UnknownLHSList::const_iterator iterLHS; for (iterLHS = lhsList.begin(); iterLHS != lhsList.end(); ++iterLHS) { const string &targetLHSStr = iterLHS->first; //float prob = iterLHS->second; Word *targetLHS = new Word(true); targetLHS->CreateFromString(Output, staticData.GetOutputFactorOrder(), targetLHSStr, true); UTIL_THROW_IF2(targetLHS->GetFactor(0) == NULL, "Null factor for target LHS"); targetPhrase->GetScoreBreakdown().Assign(&unknownWordPenaltyProducer, unknownScore); targetPhrase->Evaluate(*unksrc); targetPhrase->SetTargetLHS(targetLHS); // chart rule to.AddPhraseOOV(*targetPhrase, m_cacheTargetPhraseCollection, range); } } }
float LanguageModelInternal::GetValue(const Factor *factor0, const Factor *factor1, const Factor *factor2, State* finalState) const { float score; const NGramNode *nGram[3]; nGram[2] = GetLmID(factor2); if (nGram[2] == NULL) { if (finalState != NULL) *finalState = NULL; score = -numeric_limits<float>::infinity(); } else { nGram[1] = nGram[2]->GetNGram(factor1); if (nGram[1] == NULL) { // something unigram if (finalState != NULL) *finalState = static_cast<const void*>(nGram[2]); nGram[1] = GetLmID(factor1); if (nGram[1] == NULL) { // stops at unigram score = nGram[2]->GetScore(); } else { nGram[0] = nGram[1]->GetNGram(factor0); if (nGram[0] == NULL) { // unigram unigram score = nGram[2]->GetScore() + nGram[1]->GetLogBackOff(); } else { // unigram bigram score = nGram[2]->GetScore() + nGram[1]->GetLogBackOff() + nGram[0]->GetLogBackOff(); } } } else { // trigram, or something bigram nGram[0] = nGram[1]->GetNGram(factor0); if (nGram[0] != NULL) { // trigram if (finalState != NULL) *finalState = static_cast<const void*>(nGram[0]); score = nGram[0]->GetScore(); } else { if (finalState != NULL) *finalState = static_cast<const void*>(nGram[1]); score = nGram[1]->GetScore(); nGram[1] = nGram[1]->GetRootNGram(); nGram[0] = nGram[1]->GetNGram(factor0); if (nGram[0] == NULL) { // just bigram // do nothing } else { score += nGram[0]->GetLogBackOff(); } } // else do nothing. just use 1st bigram } } return FloorScore(score); }