bool OnDiskWrapper::OpenForLoad(const std::string &filePath)
{
  m_fileSource.open((filePath + "/Source.dat").c_str(), ios::in | ios::binary);
  UTIL_THROW_IF(!m_fileSource.is_open(), util::FileOpenException,
                "Couldn't open file " << filePath << "/Source.dat");

  m_fileTargetInd.open((filePath + "/TargetInd.dat").c_str(), ios::in | ios::binary);
  UTIL_THROW_IF(!m_fileTargetInd.is_open(), util::FileOpenException,
                "Couldn't open file " << filePath << "/TargetInd.dat");

  m_fileTargetColl.open((filePath + "/TargetColl.dat").c_str(), ios::in | ios::binary);
  UTIL_THROW_IF(!m_fileTargetColl.is_open(), util::FileOpenException,
                "Couldn't open file " << filePath << "/TargetColl.dat");

  m_fileVocab.open((filePath + "/Vocab.dat").c_str(), ios::in);
  UTIL_THROW_IF(!m_fileVocab.is_open(), util::FileOpenException,
                "Couldn't open file " << filePath << "/Vocab.dat");

  m_fileMisc.open((filePath + "/Misc.dat").c_str(), ios::in);
  UTIL_THROW_IF(!m_fileMisc.is_open(), util::FileOpenException,
                "Couldn't open file " << filePath << "/Misc.dat");

  // set up root node
  LoadMisc();
  m_numSourceFactors = GetMisc("NumSourceFactors");
  m_numTargetFactors = GetMisc("NumTargetFactors");
  m_numScores = GetMisc("NumScores");

  return true;
}
void TargetPhrase::SetAlignmentInfo(const StringPiece &alignString)
{
  AlignmentInfo::CollType alignTerm, alignNonTerm;
  for (util::TokenIter<util::AnyCharacter, true> token(alignString, util::AnyCharacter(" \t")); token; ++token) {
    util::TokenIter<util::SingleCharacter, false> dash(*token, util::SingleCharacter('-'));

    char *endptr;
    size_t sourcePos = strtoul(dash->data(), &endptr, 10);
    UTIL_THROW_IF(endptr != dash->data() + dash->size(), util::ErrnoException,
                  "Error parsing alignment " << *dash);
    ++dash;
    size_t targetPos = strtoul(dash->data(), &endptr, 10);
    UTIL_THROW_IF(endptr != dash->data() + dash->size(), util::ErrnoException,
                  "Error parsing alignment " << *dash);
    UTIL_THROW_IF(++dash, util::Exception, "Extra gunk in alignment " << *token);

    if (GetWord(targetPos).IsNonTerminal()) {
      alignNonTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
    } else {
      alignTerm.insert(std::pair<size_t,size_t>(sourcePos, targetPos));
    }
  }
  SetAlignTerm(alignTerm);
  SetAlignNonTerm(alignNonTerm);
}
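// Illustrative sketch (not Moses code): the same strtoul-based parsing of
// "sourcePos-targetPos" alignment pairs that SetAlignmentInfo above performs,
// written as a self-contained program. The input format assumed here is the
// usual whitespace-separated pair list, e.g. "0-0 1-2 2-1".
#include <cstdio>
#include <cstdlib>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

std::vector<std::pair<size_t, size_t> > ParseAlignments(const std::string &s) {
  std::vector<std::pair<size_t, size_t> > out;
  std::istringstream in(s);
  std::string token;
  while (in >> token) {
    char *endptr;
    // strtoul parses the source index; endptr is left at the '-' separator.
    size_t src = strtoul(token.c_str(), &endptr, 10);
    if (*endptr != '-') break;      // malformed pair: stop parsing
    size_t tgt = strtoul(endptr + 1, &endptr, 10);
    if (*endptr != '\0') break;     // trailing junk: stop parsing
    out.push_back(std::make_pair(src, tgt));
  }
  return out;
}

int main() {
  std::vector<std::pair<size_t, size_t> > a = ParseAlignments("0-0 1-2 2-1");
  for (size_t i = 0; i < a.size(); ++i)
    std::printf("%zu-%zu\n", a[i].first, a[i].second);
  return 0;
}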
float sentenceLevelBackgroundBleu(const std::vector<float>& sent, const std::vector<float>& bg)
{
  // Sum sent and background
  UTIL_THROW_IF(sent.size() != bg.size(), util::Exception,
                "Mismatch between sentence and background BLEU stats sizes");
  UTIL_THROW_IF(sent.size() != kBleuNgramOrder * 2 + 1, util::Exception,
                "BLEU stats vector must have length kBleuNgramOrder * 2 + 1");
  std::vector<float> stats(sent.size());
  for (size_t i = 0; i < sent.size(); i++)
    stats[i] = sent[i] + bg[i];

  // Calculate BLEU
  float logbleu = 0.0;
  for (int j = 0; j < kBleuNgramOrder; j++) {
    logbleu += log(stats[2 * j]) - log(stats[2 * j + 1]);
  }
  logbleu /= kBleuNgramOrder;
  const float brevity = 1.0 - stats[kBleuNgramOrder * 2] / stats[1];
  if (brevity < 0.0) {
    logbleu += brevity;
  }

  // Exponentiate and scale by reference length (as per Chiang et al 08)
  return exp(logbleu) * stats[kBleuNgramOrder * 2];
}
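// Worked example (illustrative, not Moses code): the BLEU stats layout shared
// by the BLEU functions in this section. stats[2j] holds n-gram matches and
// stats[2j+1] holds n-gram totals for order j+1; the last slot holds the
// reference length, and stats[1] doubles as the hypothesis length. The
// kBleuNgramOrder value of 4 is assumed, as in standard BLEU.
#include <cmath>
#include <cstdio>
#include <vector>

static const int kBleuNgramOrder = 4;

float bleuFromStats(const std::vector<float> &stats) {
  float logbleu = 0.0f;
  for (int j = 0; j < kBleuNgramOrder; ++j)
    logbleu += std::log(stats[2 * j]) - std::log(stats[2 * j + 1]);
  logbleu /= kBleuNgramOrder;
  // Brevity penalty applies only when the hypothesis is shorter than the reference.
  const float brevity = 1.0f - stats[kBleuNgramOrder * 2] / stats[1];
  if (brevity < 0.0f) logbleu += brevity;
  return std::exp(logbleu);
}

int main() {
  // 5/6 unigrams, 3/5 bigrams, 2/4 trigrams, 1/3 4-grams, reference length 7;
  // since 7 > 6, the brevity penalty fires.
  float raw[] = {5, 6, 3, 5, 2, 4, 1, 3, 7};
  std::vector<float> stats(raw, raw + 9);
  float bleu = bleuFromStats(stats);
  // sentenceLevelBackgroundBleu above additionally scales by the reference length.
  std::printf("BLEU = %f, scaled by ref length = %f\n", bleu, bleu * stats[8]);
  return 0;
}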
void ScoreFeatureManager::configure(const std::vector<std::string> args)
{
  bool domainAdded = false;
  bool sparseDomainAdded = false;

  for (size_t i = 0; i < args.size(); ++i) {
    if (args[i] == "--IgnoreSentenceId") {
      m_includeSentenceId = true;
    } else if (args[i].substr(0,8) == "--Domain") {
      string type = args[i].substr(8);
      ++i;
      UTIL_THROW_IF(i == args.size(), ScoreFeatureArgumentException, "Missing domain file");
      string domainFile = args[i];
      UTIL_THROW_IF(domainAdded, ScoreFeatureArgumentException,
                    "Only allowed one domain feature");
      if (type == "Subset") {
        m_features.push_back(ScoreFeaturePtr(new SubsetDomainFeature(domainFile)));
      } else if (type == "Ratio") {
        m_features.push_back(ScoreFeaturePtr(new RatioDomainFeature(domainFile)));
      } else if (type == "Indicator") {
        m_features.push_back(ScoreFeaturePtr(new IndicatorDomainFeature(domainFile)));
      } else {
        UTIL_THROW(ScoreFeatureArgumentException, "Unknown domain feature type " << type);
      }
      domainAdded = true;
      m_includeSentenceId = true;
    } else if (args[i].substr(0,14) == "--SparseDomain") {
      string type = args[i].substr(14);
      ++i;
      UTIL_THROW_IF(i == args.size(), ScoreFeatureArgumentException, "Missing domain file");
      string domainFile = args[i];
      UTIL_THROW_IF(sparseDomainAdded, ScoreFeatureArgumentException,
                    "Only allowed one sparse domain feature");
      if (type == "Subset") {
        m_features.push_back(ScoreFeaturePtr(new SparseSubsetDomainFeature(domainFile)));
      } else if (type == "Ratio") {
        m_features.push_back(ScoreFeaturePtr(new SparseRatioDomainFeature(domainFile)));
      } else if (type == "Indicator") {
        m_features.push_back(ScoreFeaturePtr(new SparseIndicatorDomainFeature(domainFile)));
      } else {
        UTIL_THROW(ScoreFeatureArgumentException, "Unknown domain feature type " << type);
      }
      sparseDomainAdded = true;
      m_includeSentenceId = true;
    } else if (args[i] == "--GHKMFeatureSparse") { //MARIA
      m_features.push_back(ScoreFeaturePtr(new InternalStructFeatureSparse()));
    } else if (args[i] == "--GHKMFeatureDense") { //MARIA
      m_features.push_back(ScoreFeaturePtr(new InternalStructFeatureDense()));
    } else {
      UTIL_THROW(ScoreFeatureArgumentException, "Unknown score argument " << args[i]);
    }
  }
}
void WordTranslationFeature::Load()
{
  // load word list for restricted feature set
  if (m_filePathSource.empty()) {
    return;
  }
  cerr << "loading word translation word lists from " << m_filePathSource
       << " and " << m_filePathTarget << endl;
  if (m_domainTrigger) {
    // domain trigger terms for each input document
    ifstream inFileSource(m_filePathSource.c_str());
    UTIL_THROW_IF(!inFileSource, util::Exception, "could not open file " << m_filePathSource);

    std::string line;
    while (getline(inFileSource, line)) {
      m_vocabDomain.resize(m_vocabDomain.size() + 1);
      vector<string> termVector;
      boost::split(termVector, line, boost::is_any_of("\t "));
      for (size_t i = 0; i < termVector.size(); ++i)
        m_vocabDomain.back().insert(termVector[i]);
    }
    inFileSource.close();
  } else {
    // restricted source word vocabulary
    ifstream inFileSource(m_filePathSource.c_str());
    UTIL_THROW_IF(!inFileSource, util::Exception, "could not open file " << m_filePathSource);

    std::string line;
    while (getline(inFileSource, line)) {
      m_vocabSource.insert(line);
    }
    inFileSource.close();

    // restricted target word vocabulary
    ifstream inFileTarget(m_filePathTarget.c_str());
    UTIL_THROW_IF(!inFileTarget, util::Exception, "could not open file " << m_filePathTarget);

    while (getline(inFileTarget, line)) {
      m_vocabTarget.insert(line);
    }
    inFileTarget.close();

    m_unrestricted = false;
  }
}
/**
 * Pre-calculate the n-gram probabilities for the words in the specified phrase.
 *
 * Note that when this method is called, we do not have access to the context
 * in which this phrase will eventually be applied.
 *
 * In other words, we know what words are in this phrase,
 * but we do not know what words will come before or after this phrase.
 *
 * The parameters fullScore, ngramScore, and oovCount are all output parameters.
 *
 * The value stored in oovCount is the number of words in the phrase
 * that are not in the language model's vocabulary.
 *
 * The sum of the ngram scores for all words in this phrase is stored in fullScore.
 *
 * The value stored in ngramScore is similar, but only full-order ngram scores
 * are included. More precisely, ngramScore contains only the scores accumulated
 * after the boundary word (at position order-1 from the start of the phrase)
 * has been scored.
 *
 * This is best shown by example:
 *
 * Assume a trigram backward language model and a phrase "a b c d e f g"
 *
 * fullScore would represent the sum of the logprob scores for the following values:
 *
 * p(g)
 * p(f | g)
 * p(e | g f)
 * p(d | f e)
 * p(c | e d)
 * p(b | d c)
 * p(a | c b)
 *
 * ngramScore would represent the sum of the logprob scores for the following values:
 *
 * p(b | d c)
 * p(a | c b)
 */
template <class Model>
void BackwardLanguageModel<Model>::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const
{
  fullScore = 0;
  ngramScore = 0;
  oovCount = 0;

  if (!phrase.GetSize()) return;

  lm::ngram::ChartState discarded_sadly;
  lm::ngram::RuleScore<Model> scorer(*m_ngram, discarded_sadly);

  UTIL_THROW_IF(
    (m_beginSentenceFactor == phrase.GetWord(0).GetFactor(m_factorType)),
    util::Exception,
    "BackwardLanguageModel does not currently support rules that include <s>"
  );

  float before_boundary = 0.0f;

  int lastWord = phrase.GetSize() - 1;
  int ngramBoundary = m_ngram->Order() - 1;
  int boundary = (lastWord < ngramBoundary) ? 0 : ngramBoundary;

  int position;
  for (position = lastWord; position >= 0; position -= 1) {
    const Word &word = phrase.GetWord(position);
    UTIL_THROW_IF(
      (word.IsNonTerminal()),
      util::Exception,
      "BackwardLanguageModel does not currently support rules that include non-terminals "
    );

    lm::WordIndex index = TranslateID(word);
    scorer.Terminal(index);
    if (!index) ++oovCount;

    if (position == boundary) {
      before_boundary = scorer.Finish();
    }
  }

  fullScore = scorer.Finish();

  ngramScore = TransformLMScore(fullScore - before_boundary);
  fullScore = TransformLMScore(fullScore);
}
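// Numeric sketch (illustrative; all probability values are invented and
// TransformLMScore is omitted): how ngramScore falls out of the running total
// in CalcScore above. before_boundary is the scorer's total when the loop
// reaches the boundary word; subtracting it from the full total leaves only
// the contributions listed for ngramScore in the comment.
#include <cstdio>

int main() {
  // Invented log probabilities for "a b c d e f g" under a backward trigram,
  // in the order the loop feeds them (last word first).
  float scores[] = { /* p(g) */ -2.0f, /* p(f|g) */ -1.5f, /* p(e|gf) */ -1.0f,
                     /* p(d|fe) */ -0.9f, /* p(c|ed) */ -0.8f,
                     /* p(b|dc) */ -0.7f, /* p(a|cb) */ -0.6f };
  float full = 0.0f, before_boundary = 0.0f;
  for (int i = 0; i < 7; ++i) {
    full += scores[i];
    if (i == 4) before_boundary = full; // boundary word (c) has been scored
  }
  // Prints fullScore=-7.50 ngramScore=-1.30, i.e. p(b|dc) + p(a|cb).
  std::printf("fullScore=%.2f ngramScore=%.2f\n", full, full - before_boundary);
  return 0;
}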
size_t ThrowingFwrite(const void *ptr, size_t size, size_t count, FILE *stream)
{
  assert(size);
  size_t returnValue = std::fwrite(ptr, size, count, stream);
  UTIL_THROW_IF(count != returnValue, util::ErrnoException,
                "Short fwrite; requested size " << size);
  return returnValue;
}
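// Standalone analogue (illustrative, not the Moses/KenLM utility): the same
// short-write check as ThrowingFwrite above, but using std::runtime_error
// instead of util::ErrnoException so the sketch compiles on its own.
#include <cassert>
#include <cstdio>
#include <stdexcept>

size_t CheckedFwrite(const void *ptr, size_t size, size_t count, FILE *stream) {
  assert(size);
  size_t written = std::fwrite(ptr, size, count, stream);
  if (written != count)                    // short write: treat as an error
    throw std::runtime_error("Short fwrite");
  return written;
}

int main() {
  FILE *f = std::fopen("example.bin", "wb");
  if (!f) return 1;
  const int data[4] = {1, 2, 3, 4};
  CheckedFwrite(data, sizeof(int), 4, f);  // throws on short write
  std::fclose(f);
  return 0;
}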
void Word::ConvertToMoses(
  const std::vector<Moses::FactorType> &outputFactorsVec,
  const Vocab &vocab,
  Moses::Word &overwrite) const
{
  Moses::FactorCollection &factorColl = Moses::FactorCollection::Instance();
  overwrite = Moses::Word(m_isNonTerminal);

  // TODO: this conversion should have been done at load time.
  util::TokenIter<util::SingleCharacter> tok(vocab.GetString(m_vocabId), '|');

  for (std::vector<Moses::FactorType>::const_iterator t = outputFactorsVec.begin();
       t != outputFactorsVec.end(); ++t, ++tok) {
    UTIL_THROW_IF(!tok, util::Exception,
                  "Too few factors in \"" << vocab.GetString(m_vocabId)
                  << "\"; was expecting " << outputFactorsVec.size());
    overwrite.SetFactor(*t, factorColl.AddFactor(*tok));
  }

  UTIL_THROW_IF(tok, util::Exception,
                "Too many factors in \"" << vocab.GetString(m_vocabId)
                << "\"; was expecting " << outputFactorsVec.size());
}
void LanguageModelDALM::Load()
{
  /////////////////////
  // READING INIFILE //
  /////////////////////
  string model;    // Path to the double-array file.
  string words;    // Path to the vocabulary file.
  string wordstxt; // Path to the vocabulary file in text format.
  read_ini(m_filePath.c_str(), model, words, wordstxt);

  UTIL_THROW_IF(model.empty() || words.empty() || wordstxt.empty(),
                util::FileOpenException,
                "Failed to read DALM ini file " << m_filePath << ". Probably doesn't exist");

  ////////////////
  // LOADING LM //
  ////////////////

  // Preparing a logger object.
  m_logger = new DALM::Logger(stderr);
  m_logger->setLevel(DALM::LOGGER_INFO);

  // Load the vocabulary file.
  m_vocab = new DALM::Vocabulary(words, *m_logger);

  // Load the language model.
  m_lm = new DALM::LM(model, *m_vocab, *m_logger);

  wid_start = m_vocab->lookup(BOS_);
  wid_end = m_vocab->lookup(EOS_);

  // vocab mapping
  CreateVocabMapping(wordstxt);
}
void Data::createShards(size_t shard_count, float shard_size, const string& scorerconfig,
                        vector<Data>& shards)
{
  UTIL_THROW_IF(shard_count == 0, util::Exception, "Must have at least 1 shard");
  UTIL_THROW_IF(shard_size < 0 || shard_size > 1, util::Exception,
                "Shard size must be between 0 and 1, inclusive. Currently " << shard_size);

  size_t data_size = m_score_data->size();
  UTIL_THROW_IF(data_size != m_feature_data->size(), util::Exception,
                "Score data and feature data have different sizes");

  shard_size *= data_size;
  const float coeff = static_cast<float>(data_size) / shard_count;

  for (size_t shard_id = 0; shard_id < shard_count; ++shard_id) {
    vector<size_t> shard_contents;
    if (shard_size == 0) {
      // split into roughly equal size shards
      const size_t shard_start = floor(0.5 + shard_id * coeff);
      const size_t shard_end = floor(0.5 + (shard_id + 1) * coeff);
      for (size_t i = shard_start; i < shard_end; ++i) {
        shard_contents.push_back(i);
      }
    } else {
      // create shards by randomly sampling
      for (size_t i = 0; i < floor(shard_size + 0.5); ++i) {
        shard_contents.push_back(rand() % data_size);
      }
    }

    Scorer* scorer = ScorerFactory::getScorer(m_score_type, scorerconfig);

    shards.push_back(Data(scorer));
    shards.back().m_score_type = m_score_type;
    shards.back().m_num_scores = m_num_scores;
    for (size_t i = 0; i < shard_contents.size(); ++i) {
      shards.back().m_feature_data->add(m_feature_data->get(shard_contents[i]));
      shards.back().m_score_data->add(m_score_data->get(shard_contents[i]));
    }
    //cerr << endl;
  }
}
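// Boundary arithmetic sketch (illustrative, not Moses code): how the
// floor(0.5 + i * coeff) expression in createShards above splits data_size
// items into shard_count near-equal contiguous shards.
#include <cmath>
#include <cstddef>
#include <cstdio>

int main() {
  const size_t data_size = 10, shard_count = 3;
  const float coeff = static_cast<float>(data_size) / shard_count;
  // Prints shards [0,3), [3,7), [7,10): rounding the scaled boundaries
  // distributes the remainder across shards.
  for (size_t shard_id = 0; shard_id < shard_count; ++shard_id) {
    const size_t start = static_cast<size_t>(std::floor(0.5 + shard_id * coeff));
    const size_t end = static_cast<size_t>(std::floor(0.5 + (shard_id + 1) * coeff));
    std::printf("shard %zu: [%zu, %zu)\n", shard_id, start, end);
  }
  return 0;
}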
void Word::CreateFromString(FactorDirection direction
                            , const std::vector<FactorType> &factorOrder
                            , const StringPiece &str
                            , bool isNonTerminal
                            , bool strict)
{
  FactorCollection &factorCollection = FactorCollection::Instance();

  vector<StringPiece> bits(MAX_NUM_FACTORS);
  string factorDelimiter = StaticData::Instance().GetFactorDelimiter();
  if (factorDelimiter.size()) {
    util::TokenIter<util::MultiCharacter> fit(str, factorDelimiter);
    size_t i = 0;
    for (; i < MAX_NUM_FACTORS && fit; ++i, ++fit)
      bits[i] = *fit;
    if (i == MAX_NUM_FACTORS)
      UTIL_THROW_IF(fit, StrayFactorException,
                    "The hard limit for factors is " << MAX_NUM_FACTORS
                    << ". The word " << str << " contains factor delimiter "
                    << StaticData::Instance().GetFactorDelimiter() << " too many times.");
    if (strict)
      UTIL_THROW_IF(fit, StrayFactorException,
                    "You have configured " << factorOrder.size()
                    << " factors but the word " << str << " contains factor delimiter "
                    << StaticData::Instance().GetFactorDelimiter() << " too many times.");
    UTIL_THROW_IF(!isNonTerminal && i < factorOrder.size(), util::Exception,
                  "Too few factors in string '" << str << "'.");
  } else {
    bits[0] = str;
  }

  for (size_t k = 0; k < factorOrder.size(); ++k) {
    UTIL_THROW_IF(factorOrder[k] >= MAX_NUM_FACTORS, util::Exception,
                  "Factor order out of bounds.");
    m_factorArray[factorOrder[k]] = factorCollection.AddFactor(bits[k], isNonTerminal);
  }

  // assume term/non-term same for all factors
  m_isNonTerminal = isNonTerminal;
}
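// Standalone sketch (illustrative, not Moses code): splitting a factored word
// like "houses|NNS|house" on the factor delimiter, the same job TokenIter does
// in CreateFromString above. The "|" delimiter assumed here is the usual
// Moses default.
#include <cstdio>
#include <string>
#include <vector>

std::vector<std::string> SplitFactors(const std::string &word,
                                      const std::string &delim = "|") {
  std::vector<std::string> factors;
  std::string::size_type start = 0, pos;
  while ((pos = word.find(delim, start)) != std::string::npos) {
    factors.push_back(word.substr(start, pos - start));
    start = pos + delim.size();
  }
  factors.push_back(word.substr(start)); // trailing factor
  return factors;
}

int main() {
  std::vector<std::string> f = SplitFactors("houses|NNS|house");
  for (size_t i = 0; i < f.size(); ++i)
    std::printf("factor %zu: %s\n", i, f[i].c_str());
  return 0;
}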
void SparseHieroReorderingFeature::LoadVocabulary(const std::string& filename, Vocab& vocab)
{
  if (filename.empty()) return;
  ifstream in(filename.c_str());
  UTIL_THROW_IF(!in, util::Exception, "Unable to open vocab file: " << filename);
  string line;
  while (getline(in, line)) {
    vocab.insert(FactorCollection::Instance().AddFactor(line));
  }
  in.close();
}
UINT64 OnDiskWrapper::GetMisc(const std::string &key) const
{
  std::map<std::string, UINT64>::const_iterator iter = m_miscInfo.find(key);
  UTIL_THROW_IF(iter == m_miscInfo.end(), util::Exception,
                "Couldn't find value for key " << key);
  return iter->second;
}
ChartTrellisPath::ChartTrellisPath(const ChartTrellisDetour &detour)
  : m_finalNode(new ChartTrellisNode(detour, m_deviationPoint))
  , m_scoreBreakdown(detour.GetBasePath().m_scoreBreakdown)
  , m_totalScore(0)
{
  UTIL_THROW_IF(m_deviationPoint == NULL, util::Exception, "No deviation point");
  ScoreComponentCollection scoreChange;
  scoreChange = detour.GetReplacementHypo().GetScoreBreakdown();
  scoreChange.MinusEquals(detour.GetSubstitutedNode().GetHypothesis().GetScoreBreakdown());
  m_scoreBreakdown.PlusEquals(scoreChange);
  m_totalScore = m_scoreBreakdown.GetWeightedScore();
}
SparseReordering::SparseReordering(const map<string,string>& config, const LexicalReordering* producer)
  : m_producer(producer)
{
  static const string kSource = "source";
  static const string kTarget = "target";
  for (map<string,string>::const_iterator i = config.begin(); i != config.end(); ++i) {
    vector<string> fields = Tokenize(i->first, "-");
    if (fields[0] == "words") {
      UTIL_THROW_IF(fields.size() != 3, util::Exception,
                    "Sparse reordering word list name should be sparse-words-(source|target)-<id>");
      if (fields[1] == kSource) {
        ReadWordList(i->second, fields[2], SparseReorderingFeatureKey::Source, &m_sourceWordLists);
      } else if (fields[1] == kTarget) {
        ReadWordList(i->second, fields[2], SparseReorderingFeatureKey::Target, &m_targetWordLists);
      } else {
        UTIL_THROW(util::Exception, "Sparse reordering requires source or target, not " << fields[1]);
      }
    } else if (fields[0] == "clusters") {
      UTIL_THROW_IF(fields.size() != 3, util::Exception,
                    "Sparse reordering cluster name should be sparse-clusters-(source|target)-<id>");
      if (fields[1] == kSource) {
        ReadClusterMap(i->second, fields[2], SparseReorderingFeatureKey::Source, &m_sourceClusterMaps);
      } else if (fields[1] == kTarget) {
        ReadClusterMap(i->second, fields[2], SparseReorderingFeatureKey::Target, &m_targetClusterMaps);
      } else {
        UTIL_THROW(util::Exception, "Sparse reordering requires source or target, not " << fields[1]);
      }
    } else if (fields[0] == "phrase") {
      m_usePhrase = true;
    } else if (fields[0] == "stack") {
      m_useStack = true;
    } else if (fields[0] == "between") {
      m_useBetween = true;
    } else {
      UTIL_THROW(util::Exception, "Unable to parse sparse reordering option: " << i->first);
    }
  }
}
void SparseReordering::ReadWordList(const string& filename, const string& id,
                                    SparseReorderingFeatureKey::Side side,
                                    vector<WordList>* pWordLists)
{
  ifstream fh(filename.c_str());
  UTIL_THROW_IF(!fh, util::Exception, "Unable to open: " << filename);
  string line;
  pWordLists->push_back(WordList());
  pWordLists->back().first = id;
  while (getline(fh, line)) {
    // TODO: StringPiece
    const Factor* factor = FactorCollection::Instance().AddFactor(line);
    pWordLists->back().second.insert(factor);
    PreCalculateFeatureNames(pWordLists->size() - 1, id, side, factor, false);
  }
}
Word *OnDiskWrapper::ConvertFromMoses(const std::vector<Moses::FactorType> &factorsVec
                                      , const Moses::Word &origWord) const
{
  bool isNonTerminal = origWord.IsNonTerminal();
  Word *newWord = new Word(isNonTerminal);

  stringstream strme;

  size_t factorType = factorsVec[0];
  const Moses::Factor *factor = origWord.GetFactor(factorType);
  UTIL_THROW_IF(factor == NULL, util::Exception, "Expecting factor " << factorType);
  strme << factor->GetString();

  for (size_t ind = 1; ind < factorsVec.size(); ++ind) {
    size_t factorType = factorsVec[ind];
    const Moses::Factor *factor = origWord.GetFactor(factorType);
    if (factor == NULL) {
      // can have fewer factors than factorsVec.size()
      break;
    }
    strme << "|" << factor->GetString();
  } // for each factor

  bool found;
  UINT64 vocabId = m_vocab.GetVocabId(strme.str(), found);
  if (!found) {
    // factor not in phrase table -> phrase definitely not in. exit
    delete newWord;
    return NULL;
  } else {
    newWord->SetVocabId(vocabId);
    return newWord;
  }
}
void Word::CreateFromString(FactorDirection direction
                            , const std::vector<FactorType> &factorOrder
                            , const StringPiece &str
                            , bool isNonTerminal)
{
  FactorCollection &factorCollection = FactorCollection::Instance();

  util::TokenIter<util::MultiCharacter> fit(str, StaticData::Instance().GetFactorDelimiter());
  for (size_t ind = 0; ind < factorOrder.size() && fit; ++ind, ++fit) {
    m_factorArray[factorOrder[ind]] = factorCollection.AddFactor(*fit);
  }
  UTIL_THROW_IF(fit, StrayFactorException,
                "You have configured " << factorOrder.size()
                << " factors but the word " << str << " contains factor delimiter "
                << StaticData::Instance().GetFactorDelimiter() << " too many times.");

  // assume term/non-term same for all factors
  m_isNonTerminal = isNonTerminal;
}
float unsmoothedBleu(const std::vector<float>& stats)
{
  UTIL_THROW_IF(stats.size() != kBleuNgramOrder * 2 + 1, util::Exception,
                "BLEU stats vector must have length kBleuNgramOrder * 2 + 1");

  float logbleu = 0.0;
  for (int j = 0; j < kBleuNgramOrder; j++) {
    logbleu += log(stats[2 * j]) - log(stats[2 * j + 1]);
  }
  logbleu /= kBleuNgramOrder;
  const float brevity = 1.0 - stats[kBleuNgramOrder * 2] / stats[1];
  if (brevity < 0.0) {
    logbleu += brevity;
  }
  return exp(logbleu);
}
void OnDiskWrapper::EndSave()
{
  bool ret = m_rootSourceNode->Saved();
  UTIL_THROW_IF(!ret, util::Exception, "Root node not saved");

  GetVocab().Save(*this);
  SaveMisc();

  m_fileMisc.close();
  m_fileVocab.close();
  m_fileSource.close();
  m_fileTarget.close();
  m_fileTargetInd.close();
  m_fileTargetColl.close();
}
template <class Model>
FFState *BackwardLanguageModel<Model>::Evaluate(const Phrase &phrase, const FFState *ps, float &returnedScore) const
{
  returnedScore = 0.0f;

  const lm::ngram::ChartState &previous = static_cast<const BackwardLMState&>(*ps).state;

  std::auto_ptr<BackwardLMState> ret(new BackwardLMState());

  lm::ngram::RuleScore<Model> scorer(*m_ngram, ret->state);

  int ngramBoundary = m_ngram->Order() - 1;
  int lastWord = phrase.GetSize() - 1;

  // Get scores for words at the end of the previous phrase
  // that are now adjacent to words at the beginning of this phrase
  for (int position = std::min(lastWord, ngramBoundary - 1); position >= 0; position -= 1) {
    const Word &word = phrase.GetWord(position);
    UTIL_THROW_IF(
      (word.IsNonTerminal()),
      util::Exception,
      "BackwardLanguageModel does not currently support rules that include non-terminals "
    );

    lm::WordIndex index = TranslateID(word);
    scorer.Terminal(index);
  }
  scorer.NonTerminal(previous);
  returnedScore = scorer.Finish();
  /*
  out->PlusEquals(this, score);

  UTIL_THROW_IF(
    (1==1),
    util::Exception,
    "This method (BackwardLanguageModel<Model>::Evaluate) is not yet fully implemented"
  );
  */
  return ret.release();
}
template <class Model>
void Fill<Model>::AddPhraseOOV(TargetPhrase &phrase, std::list<TargetPhraseCollection*> &, const WordsRange &)
{
  std::vector<lm::WordIndex> words;
  UTIL_THROW_IF(phrase.GetSize() > 1, util::Exception,
                "OOV target phrase should be 0 or 1 word in length");
  if (phrase.GetSize())
    words.push_back(Convert(phrase.GetWord(0)));

  search::PartialEdge edge(edges_.AllocateEdge(0));
  // Appears to be a bug that FutureScore does not already include language model.
  search::ScoreRuleRet scored(search::ScoreRule(context_.LanguageModel(), words, edge.Between()));
  edge.SetScore(phrase.GetFutureScore() + scored.prob * context_.LMWeight()
                + static_cast<search::Score>(scored.oov) * oov_weight_);

  search::Note note;
  note.vp = &phrase;
  edge.SetNote(note);

  edges_.AddEdge(edge);
}
float smoothedSentenceBleu(const std::vector<float>& stats, float smoothing, bool smoothBP)
{
  UTIL_THROW_IF(stats.size() != kBleuNgramOrder * 2 + 1, util::Exception,
                "BLEU stats vector must have length kBleuNgramOrder * 2 + 1");

  float logbleu = 0.0;
  for (int j = 0; j < kBleuNgramOrder; j++) {
    logbleu += log(stats[2 * j] + smoothing) - log(stats[2 * j + 1] + smoothing);
  }
  logbleu /= kBleuNgramOrder;
  const float reflength = stats[kBleuNgramOrder * 2] + (smoothBP ? smoothing : 0.0f);
  const float brevity = 1.0 - reflength / stats[1];
  if (brevity < 0.0) {
    logbleu += brevity;
  }
  return exp(logbleu);
}
void SubsetDomainFeature::add(const map<string,float>& domainCount, float count,
                              const MaybeLog& maybeLog,
                              std::vector<float>& denseValues,
                              std::map<std::string,float>& sparseValues) const
{
  UTIL_THROW_IF(m_domain.list.size() > 6, ScoreFeatureArgumentException,
                "too many domains for core domain subset features");
  size_t bitmap = 0;
  for (size_t bit = 0; bit < m_domain.list.size(); bit++) {
    if (domainCount.find(m_domain.list[bit]) != domainCount.end()) {
      bitmap += 1 << bit;
    }
  }
  // one dense feature per non-empty subset of domains; 2.718 approximates e,
  // so the firing indicator becomes 1 after maybeLog
  for (size_t i = 1; i < (1 << m_domain.list.size()); i++) {
    denseValues.push_back(maybeLog((bitmap == i) ? 2.718 : 1));
  }
}
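// Bitmap sketch (illustrative, not Moses code; domain names invented): the
// subset encoding used above. Each domain gets one bit, so a phrase pair
// observed in {news, web} out of {news, web, speech} maps to bitmap 0b011 = 3,
// and exactly one indicator fires among the 2^n - 1 non-empty subsets.
#include <cstdio>
#include <set>
#include <string>
#include <vector>

int main() {
  const char *domainNames[] = {"news", "web", "speech"};
  std::vector<std::string> domains(domainNames, domainNames + 3);
  std::set<std::string> seen; // domains this phrase pair occurred in
  seen.insert("news");
  seen.insert("web");

  size_t bitmap = 0;
  for (size_t bit = 0; bit < domains.size(); ++bit)
    if (seen.count(domains[bit])) bitmap |= 1u << bit;

  // One dense feature per non-empty subset; only the matching one fires.
  for (size_t i = 1; i < (1u << domains.size()); ++i)
    std::printf("subset %zu: %s\n", i, bitmap == i ? "fires" : "0");
  return 0;
}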
statscore_t BleuScorer::calculateScore(const vector<int>& comps) const
{
  UTIL_THROW_IF(comps.size() != kBleuNgramOrder * 2 + 1, util::Exception,
                "BLEU stats vector must have length kBleuNgramOrder * 2 + 1");

  float logbleu = 0.0;
  for (int i = 0; i < kBleuNgramOrder; ++i) {
    if (comps[2 * i] == 0) {
      return 0.0;
    }
    logbleu += log(comps[2 * i]) - log(comps[2 * i + 1]);
  }
  logbleu /= kBleuNgramOrder;
  // reflength divided by test length
  const float brevity = 1.0 - static_cast<float>(comps[kBleuNgramOrder * 2]) / comps[1];
  if (brevity < 0.0) {
    logbleu += brevity;
  }
  return exp(logbleu);
}
void DynSuffixArray::Insert(vuint_t* newSent, unsigned newIndex)
{
  // for sentences
  // stages 1, 2, 4 stay the same as in the single-character case
  // (use last word of new text in step 2 and save Ltmp until last insert?)
  // stage 3: all words of the new sentence are inserted backwards
  // stage 2: k = ISA[newIndex], tmp = L[k], L[k] = newChar
  //PrintAuxArrays();
  UTIL_THROW_IF(newIndex > m_SA->size(), util::Exception,
                "Insert index out of bounds");
  int k(-1), kprime(-1);
  k = (newIndex < m_SA->size() ? m_ISA->at(newIndex) : m_ISA->at(0));
  // k is now the index of the cycle that starts at newIndex
  int true_pos = LastFirstFunc(k); // track cycle shift (newIndex - 1)
  int Ltmp = m_L->at(k);
  m_L->at(k) = newSent->at(newSent->size() - 1); // cycle k now ends with correct word
  for (int j = newSent->size() - 1; j > -1; --j) {
    kprime = LastFirstFunc(k); // find cycle that starts with (newIndex - 1)
    //kprime += ((m_L[k] == Ltmp) && (k > isa[k]) ? 1 : 0); // yada yada
    // only the terminal char can be 0, so add new vocab at the end
    kprime = (kprime > 0 ? kprime : m_SA->size());
    true_pos += (kprime <= true_pos ? 1 : 0); // track changes
    // insert everything
    m_F->insert(m_F->begin() + kprime, newSent->at(j));
    int theLWord = (j == 0 ? Ltmp : newSent->at(j - 1));
    m_L->insert(m_L->begin() + kprime, theLWord);
    for (vuint_t::iterator itr = m_SA->begin(); itr != m_SA->end(); ++itr) {
      if (*itr >= newIndex) ++(*itr);
    }
    m_SA->insert(m_SA->begin() + kprime, newIndex);
    for (vuint_t::iterator itr = m_ISA->begin(); itr != m_ISA->end(); ++itr) {
      if ((int)*itr >= kprime) ++(*itr);
    }
    m_ISA->insert(m_ISA->begin() + newIndex, kprime);
    k = kprime;
    //PrintAuxArrays();
  }
  // Begin stage 4:
  // actual position vs computed position of cycle (newIndex - 1)
  Reorder(true_pos, LastFirstFunc(kprime));
}
void DynSuffixArray::Reorder(unsigned j, unsigned jprime)
{
  set<pair<unsigned, unsigned> > seen;
  while (j != jprime) {
    // this 'seenit' check added for data with many loops. will remove after
    // double checking.
    // insert() reports an already-present pair in .second == false, i.e. we
    // are revisiting a state; verify the suffix array is ordered and bail out.
    bool seenit = !seen.insert(std::make_pair(j, jprime)).second;
    if (seenit) {
      for (size_t i = 1; i < m_SA->size(); ++i) {
        if (m_corpus->at(m_SA->at(i)) < m_corpus->at(m_SA->at(i - 1))) {
          cerr << "PROBLEM WITH SUFFIX ARRAY REORDERING. EXITING...\n";
          exit(1);
        }
      }
      return;
    }
    //cerr << "j=" << j << "\tj'=" << jprime << endl;
    int isaIdx(-1);
    int new_j = LastFirstFunc(j);
    UTIL_THROW_IF(j > jprime, util::Exception, "Reorder expects j <= j'");
    // for SA and L, the element at pos j is moved to pos j'
    m_L->insert(m_L->begin() + jprime + 1, m_L->at(j));
    m_L->erase(m_L->begin() + j);
    m_SA->insert(m_SA->begin() + jprime + 1, m_SA->at(j));
    m_SA->erase(m_SA->begin() + j);
    // all ISA values in (j...j'] are decremented
    for (size_t i = 0; i < m_ISA->size(); ++i) {
      if ((m_ISA->at(i) == j) && (isaIdx == -1))
        isaIdx = i; // store index of ISA[i] = j
      if ((m_ISA->at(i) > j) && (m_ISA->at(i) <= jprime))
        --(*m_ISA)[i];
    }
    // replace j with j' in ISA
    m_ISA->at(isaIdx) = jprime;
    j = new_j;
    jprime = LastFirstFunc(jprime);
  }
  //cerr << "j=" << j << "\tj'=" << jprime << endl;
}
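// Background sketch (illustrative, not Moses code): the last-first (LF)
// relationship that LastFirstFunc above is built on. For a small string we
// build the suffix array by brute force and read off F (the first column,
// i.e. the sorted text) and L (the last column, the BWT): the i-th occurrence
// of a symbol in L corresponds to the i-th occurrence of that symbol in F,
// which is what lets the dynamic structure follow "cycles" during insertion.
#include <algorithm>
#include <cstdio>
#include <string>
#include <utility>
#include <vector>

int main() {
  const std::string s = "banana$"; // '$' is a unique terminator
  const size_t n = s.size();

  // Brute-force suffix array: sort (suffix, start position) pairs.
  std::vector<std::pair<std::string, size_t> > suf;
  for (size_t i = 0; i < n; ++i)
    suf.push_back(std::make_pair(s.substr(i), i));
  std::sort(suf.begin(), suf.end());

  // F is the sorted text; L[i] is the character preceding suffix i (the BWT).
  std::string F = s, L(n, ' ');
  std::sort(F.begin(), F.end());
  for (size_t i = 0; i < n; ++i)
    L[i] = s[(suf[i].second + n - 1) % n];

  // Prints SA: 6 5 3 1 0 4 2, F: $aaabnn, L: annb$aa
  std::printf("SA:");
  for (size_t i = 0; i < n; ++i) std::printf(" %zu", suf[i].second);
  std::printf("\nF: %s\nL: %s\n", F.c_str(), L.c_str());
  return 0;
}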
void FeatureFunction::ParseLine(const std::string &line)
{
  vector<string> toks = Tokenize(line);
  CHECK(toks.size());

  string nameStub = toks[0];

  set<string> keys;
  for (size_t i = 1; i < toks.size(); ++i) {
    vector<string> args = TokenizeFirstOnly(toks[i], "=");
    CHECK(args.size() == 2);

    pair<set<string>::iterator,bool> ret = keys.insert(args[0]);
    UTIL_THROW_IF(!ret.second, util::Exception, "Duplicate key in line " << line);

    if (args[0] == "num-features") {
      m_numScoreComponents = Scan<size_t>(args[1]);
    } else if (args[0] == "name") {
      m_description = args[1];
    } else {
      m_args.push_back(args);
    }
  }

  // name
  if (m_description == "") {
    size_t index = description_counts.count(nameStub);

    ostringstream dstream;
    dstream << nameStub;
    dstream << index;

    description_counts.insert(nameStub);
    m_description = dstream.str();
  }
}
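// Standalone sketch (illustrative, not Moses code; the example line is made
// up): parsing a feature line of the form "Name key1=value1 key2=value2",
// splitting each token on the first '=' only, as ParseLine above does with
// TokenizeFirstOnly.
#include <cstdio>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

int main() {
  const std::string line = "KENLM name=LM0 factor=0 path=file.arpa order=5";
  std::istringstream in(line);
  std::string tok, nameStub;
  in >> nameStub; // first token: the feature name stub
  std::vector<std::pair<std::string, std::string> > args;
  while (in >> tok) {
    std::string::size_type eq = tok.find('='); // split on first '=' only
    if (eq == std::string::npos) return 1;     // malformed token
    args.push_back(std::make_pair(tok.substr(0, eq), tok.substr(eq + 1)));
  }
  for (size_t i = 0; i < args.size(); ++i)
    std::printf("%s -> %s\n", args[i].first.c_str(), args[i].second.c_str());
  return 0;
}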
bool RuleTableLoaderStandard::Load(FormatType format
                                   , const std::vector<FactorType> &input
                                   , const std::vector<FactorType> &output
                                   , const std::string &inFile
                                   , const std::vector<float> &weight
                                   , size_t /* tableLimit */
                                   , const LMList &languageModels
                                   , const WordPenaltyProducer* wpProducer
                                   , RuleTableTrie &ruleTable)
{
  PrintUserTime(string("Start loading text SCFG phrase table. ")
                + (format == MosesFormat ? "Moses " : "Hiero ") + " format");

  const StaticData &staticData = StaticData::Instance();
  const std::string& factorDelimiter = staticData.GetFactorDelimiter();

  string lineOrig;
  size_t count = 0;

  std::ostream *progress = NULL;
  IFVERBOSE(1) progress = &std::cerr;
  util::FilePiece in(inFile.c_str(), progress);

  // reused variables
  vector<float> scoreVector;
  StringPiece line;
  std::string hiero_before, hiero_after;

  while (true) {
    try {
      line = in.ReadLine();
    } catch (const util::EndOfFileException &e) {
      break;
    }

    if (format == HieroFormat) {
      // inefficiently reformat line
      hiero_before.assign(line.data(), line.size());
      ReformatHieroRule(hiero_before, hiero_after);
      line = hiero_after;
    }

    util::TokenIter<util::MultiCharacter> pipes(line, "|||");
    StringPiece sourcePhraseString(*pipes);
    StringPiece targetPhraseString(*++pipes);
    StringPiece scoreString(*++pipes);
    StringPiece alignString(*++pipes);
    // TODO(bhaddow) efficiently handle default instead of parsing this string every time.
    StringPiece ruleCountString = ++pipes ? *pipes : StringPiece("1 1");

    if (++pipes) {
      stringstream strme;
      strme << "Syntax error at " << ruleTable.GetFilePath() << ":" << count;
      UserMessage::Add(strme.str());
      abort();
    }

    bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
    if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
      TRACE_ERR(ruleTable.GetFilePath() << ":" << count
                << ": pt entry contains empty target, skipping\n");
      continue;
    }

    scoreVector.clear();
    for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
      char *err_ind;
      scoreVector.push_back(strtod(s->data(), &err_ind));
      UTIL_THROW_IF(err_ind == s->data(), util::Exception,
                    "Bad score " << *s << " on line " << count);
    }
    const size_t numScoreComponents = ruleTable.GetFeature()->GetNumScoreComponents();
    if (scoreVector.size() != numScoreComponents) {
      stringstream strme;
      strme << "Size of scoreVector != number (" << scoreVector.size() << "!="
            << numScoreComponents << ") of score components on line " << count;
      UserMessage::Add(strme.str());
      abort();
    }

    // parse source & find pt node

    // constituent labels
    Word sourceLHS, targetLHS;

    // source
    Phrase sourcePhrase(0);
    sourcePhrase.CreateFromStringNewFormat(Input, input, sourcePhraseString, factorDelimiter, sourceLHS);

    // create target phrase obj
    TargetPhrase *targetPhrase = new TargetPhrase(Output);
    targetPhrase->CreateFromStringNewFormat(Output, output, targetPhraseString, factorDelimiter, targetLHS);

    targetPhrase->SetSourcePhrase(sourcePhrase);

    // rest of target phrase
    targetPhrase->SetAlignmentInfo(alignString, sourcePhrase);
    targetPhrase->SetTargetLHS(targetLHS);
    targetPhrase->SetRuleCount(ruleCountString, scoreVector[0]);
    //targetPhrase->SetDebugOutput(string("New Format pt ") + line);

    // component score, for n-best output
    std::transform(scoreVector.begin(), scoreVector.end(), scoreVector.begin(), TransformScore);
    std::transform(scoreVector.begin(), scoreVector.end(), scoreVector.begin(), FloorScore);

    targetPhrase->SetScoreChart(ruleTable.GetFeature(), scoreVector, weight, languageModels, wpProducer);

    TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(ruleTable, sourcePhrase, *targetPhrase, sourceLHS);
    phraseColl.Add(targetPhrase);

    count++;
  }

  // sort and prune each target phrase collection
  SortAndPrune(ruleTable);

  return true;
}
bool RuleTableLoaderStandard::Load(FormatType format
                                   , const std::vector<FactorType> &input
                                   , const std::vector<FactorType> &output
                                   , const std::string &inFile
                                   , size_t /* tableLimit */
                                   , RuleTableTrie &ruleTable)
{
  PrintUserTime(string("Start loading text SCFG phrase table. ")
                + (format == MosesFormat ? "Moses " : "Hiero ") + " format");

  const StaticData &staticData = StaticData::Instance();
  const std::string& factorDelimiter = staticData.GetFactorDelimiter();

  string lineOrig;
  size_t count = 0;

  std::ostream *progress = NULL;
  IFVERBOSE(1) progress = &std::cerr;
  util::FilePiece in(inFile.c_str(), progress);

  // reused variables
  vector<float> scoreVector;
  StringPiece line;
  std::string hiero_before, hiero_after;

  double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan");

  while (true) {
    try {
      line = in.ReadLine();
    } catch (const util::EndOfFileException &e) {
      break;
    }

    if (format == HieroFormat) {
      // inefficiently reformat line
      hiero_before.assign(line.data(), line.size());
      ReformatHieroRule(hiero_before, hiero_after);
      line = hiero_after;
    }

    util::TokenIter<util::MultiCharacter> pipes(line, "|||");
    StringPiece sourcePhraseString(*pipes);
    StringPiece targetPhraseString(*++pipes);
    StringPiece scoreString(*++pipes);

    StringPiece alignString;
    if (++pipes) {
      StringPiece temp(*pipes);
      alignString = temp;
    }

    if (++pipes) {
      StringPiece str(*pipes); // counts
    }

    bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
    if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
      TRACE_ERR(ruleTable.GetFilePath() << ":" << count
                << ": pt entry contains empty target, skipping\n");
      continue;
    }

    scoreVector.clear();
    for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
      int processed;
      float score = converter.StringToFloat(s->data(), s->length(), &processed);
      UTIL_THROW_IF(isnan(score), util::Exception,
                    "Bad score " << *s << " on line " << count);
      scoreVector.push_back(FloorScore(TransformScore(score)));
    }
    const size_t numScoreComponents = ruleTable.GetNumScoreComponents();
    if (scoreVector.size() != numScoreComponents) {
      stringstream strme;
      strme << "Size of scoreVector != number (" << scoreVector.size() << "!="
            << numScoreComponents << ") of score components on line " << count;
      UserMessage::Add(strme.str());
      abort();
    }

    // parse source & find pt node

    // constituent labels
    Word *sourceLHS;
    Word *targetLHS;

    // create target phrase obj
    TargetPhrase *targetPhrase = new TargetPhrase();
    targetPhrase->CreateFromString(Output, output, targetPhraseString, factorDelimiter, &targetLHS);

    // source
    Phrase sourcePhrase;
    sourcePhrase.CreateFromString(Input, input, sourcePhraseString, factorDelimiter, &sourceLHS);

    // rest of target phrase
    targetPhrase->SetAlignmentInfo(alignString);
    targetPhrase->SetTargetLHS(targetLHS);
    //targetPhrase->SetDebugOutput(string("New Format pt ") + line);

    if (++pipes) {
      StringPiece sparseString(*pipes);
      targetPhrase->SetSparseScore(&ruleTable, sparseString);
    }

    if (++pipes) {
      StringPiece propertiesString(*pipes);
      targetPhrase->SetProperties(propertiesString);
    }

    targetPhrase->GetScoreBreakdown().Assign(&ruleTable, scoreVector);
    targetPhrase->Evaluate(sourcePhrase, ruleTable.GetFeaturesToApply());

    TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(ruleTable, sourcePhrase, *targetPhrase, sourceLHS);
    phraseColl.Add(targetPhrase);

    count++;
  }

  // sort and prune each target phrase collection
  SortAndPrune(ruleTable);

  return true;
}
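// Standalone sketch (illustrative, not Moses code; the sample rule is made
// up): splitting one rule-table line on the "|||" field separator, the layout
// both loaders above consume:
//   source ||| target ||| scores ||| alignment ||| counts [||| sparse ||| properties]
#include <cstdio>
#include <string>
#include <vector>

std::vector<std::string> SplitFields(const std::string &line) {
  std::vector<std::string> fields;
  const std::string sep = "|||";
  std::string::size_type start = 0, pos;
  while ((pos = line.find(sep, start)) != std::string::npos) {
    fields.push_back(line.substr(start, pos - start));
    start = pos + sep.size();
  }
  fields.push_back(line.substr(start)); // trailing field
  return fields;
}

int main() {
  std::vector<std::string> f =
      SplitFields("[X] a b [X][X] ||| [X] c [X][X] d ||| 0.4 0.6 ||| 0-0 2-2 ||| 5 3");
  for (size_t i = 0; i < f.size(); ++i)
    std::printf("field %zu:%s\n", i, f[i].c_str());
  return 0;
}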