// Load the KenLM model from m_path. The BOS/EOS markers are registered in
// the system's factor collection, and KenLM's vocabulary enumeration is
// hooked up (via MappingBuilder) so that m_lmIdLookup maps decoder factors
// to KenLM word ids as the model file is read.
void KENLM<Model>::Load(System &system) {
  FactorCollection &fc = system.GetVocab();
  m_bos = fc.AddFactor(BOS_, system, false);
  m_eos = fc.AddFactor(EOS_, system, false);

  lm::ngram::Config config;
  config.messages = nullptr;  // suppress KenLM's loading chatter

  // KenLM calls back into the builder for every vocab entry during load,
  // which populates m_lmIdLookup. (Reuses fc — GetVocab() was previously
  // called a second time for an identical reference.)
  MappingBuilder builder(fc, system, m_lmIdLookup);
  config.enumerate_vocab = &builder;
  config.load_method = m_load_method;

  m_ngram.reset(new Model(m_path.c_str(), config));
}
// Load a plain-text LM file from m_path. Each data line has the form
// "logprob<TAB>ngram[<TAB>backoff]"; scores pass through TransformLMScore.
// The "<unk>" entry sets m_oov instead of being inserted. Each n-gram's
// factor sequence is stored reversed (last word first) in m_root.
void LanguageModel::Load(System &system) {
  FactorCollection &fc = system.GetVocab();
  m_bos = fc.AddFactor(BOS_, system, false);
  m_eos = fc.AddFactor(EOS_, system, false);

  InputFileStream infile(m_path);
  string line;
  size_t numLines = 0;

  while (getline(infile, line)) {
    ++numLines;
    if (numLines % 100000 == 0) {
      cerr << numLines << " ";  // progress indicator on stderr
    }

    vector<string> fields = Tokenize(line, "\t");
    if (fields.size() < 2) {
      continue;  // header / separator / blank line
    }
    assert(fields.size() == 2 || fields.size() == 3);

    SCORE prob = TransformLMScore(Scan<SCORE>(fields[0]));

    if (fields[1] == "<unk>") {
      m_oov = prob;  // OOV score kept separately, not inserted into m_root
      continue;
    }

    SCORE backoff = 0.f;
    if (fields.size() == 3) {
      backoff = TransformLMScore(Scan<SCORE>(fields[2]));
    }

    // Build the factor key back-to-front while registering factors in
    // left-to-right order (preserves factor creation order).
    vector<string> words = Tokenize(fields[1], " ");
    vector<const Factor*> ngramKey(words.size());
    size_t pos = words.size();
    for (const string &w : words) {
      ngramKey[--pos] = fc.AddFactor(w, system, false);
    }

    m_root.insert(ngramKey, LMScores(prob, backoff));
  }
}