/**
 * Register every word of the RandLM vocabulary as an output factor and build
 * m_randlm_ids_vec, a table indexed by factor id that yields the matching
 * RandLM word id. Slots with no vocabulary entry are set to m_oov_id.
 *
 * Also creates the sentence-start and sentence-end factors and stores them in
 * m_sentenceStart / m_sentenceEnd and in the per-factor-type word arrays.
 *
 * (Original author's note: copied from the SRI LM class; a shared template
 * would be preferable.)
 *
 * @param factorCollection  collection that receives the new factors.
 */
void LanguageModelRandLM::CreateFactors(FactorCollection &factorCollection)
{
  typedef std::map<randlm::Word, randlm::WordID>::const_iterator VocabIter;
  typedef std::map<size_t, randlm::WordID>::const_iterator IdIter;

  // factor id -> randlm id, gathered first because the largest factor id
  // (and therefore the table size) is unknown until all words are added.
  std::map<size_t, randlm::WordID> factorToRandlm;
  size_t highestId = 0;

  for (VocabIter word = m_lm->vocabStart(); word != m_lm->vocabEnd(); ++word) {
    const size_t id =
      factorCollection.AddFactor(Output, m_factorType, word->first)->GetId();
    factorToRandlm[id] = word->second;
    highestId = std::max(highestId, id);
  }

  // Sentence-start marker.
  // NOTE(review): the BOS/EOS factor ids only enlarge the table; they are not
  // inserted into the id map here, so their slots keep m_oov_id unless the
  // vocabulary loop above already produced the same factor ids - confirm.
  m_sentenceStart = factorCollection.AddFactor(Output, m_factorType, m_lm->getBOS());
  const size_t bosId = m_sentenceStart->GetId();
  highestId = std::max(highestId, bosId);
  m_sentenceStartWord[m_factorType] = m_sentenceStart;

  // Sentence-end marker.
  m_sentenceEnd = factorCollection.AddFactor(Output, m_factorType, m_lm->getEOS());
  const size_t eosId = m_sentenceEnd->GetId();
  highestId = std::max(highestId, eosId);
  m_sentenceEndWord[m_factorType] = m_sentenceEnd;

  // Size the table, default every slot to the OOV code, then copy the
  // collected ids over it.
  m_randlm_ids_vec.resize(highestId + 1);
  std::fill(m_randlm_ids_vec.begin(), m_randlm_ids_vec.end(), m_oov_id);
  for (IdIter entry = factorToRandlm.begin(); entry != factorToRandlm.end(); ++entry) {
    m_randlm_ids_vec[entry->first] = entry->second;
  }
}
/**
 * Populate this word from its textual form.
 *
 * A string that starts with '[' and ends with ']' is treated as a
 * non-terminal (isNonTerminal is set to true). If a second '[' occurs after
 * the first character, only the text inside that later bracket pair is used;
 * otherwise the text between the outer brackets is used. Any other string is
 * a terminal and is used whole. The selected text is split on '|' and each
 * piece is stored, in order, as a factor in m_factors.
 *
 * NOTE(review): nothing here limits the number of '|'-separated pieces to the
 * capacity of m_factors, and unused slots are left untouched - confirm that
 * callers / the constructor guarantee both.
 *
 * @param vocab   factor collection used to look up or create each factor.
 * @param system  passed through to FactorCollection::AddFactor.
 * @param str     textual form of the word.
 */
void Word::CreateFromString(FactorCollection &vocab, const System &system,
                            const std::string &str)
{
  const size_t len = str.size();

  // The second test is only evaluated when the first character is '[', so an
  // empty string never reaches the len - 1 index.
  isNonTerminal = (str[0] == '[' && str[len - 1] == ']');

  vector<string> toks;
  if (!isNonTerminal) {
    toks = Tokenize(str, "|");
  } else {
    // Skip either the single leading '[' or everything up to and including
    // the second '['; the trailing ']' is always dropped.
    const size_t secondOpen = str.find("[", 1);
    const size_t labelStart = (secondOpen == string::npos) ? 1 : secondOpen + 1;
    const string label = str.substr(labelStart, len - labelStart - 1);
    toks = Tokenize(label, "|");
  }

  // One factor per token, in token order.
  for (size_t pos = 0; pos < toks.size(); ++pos) {
    m_factors[pos] = vocab.AddFactor(toks[pos], system, isNonTerminal);
  }
}
/**
 * Register every entry of the language model's dictionary as an output factor
 * and build m_lmIdLookup, a table indexed by factor id that yields the
 * dictionary code of that word. Slots with no dictionary entry are set to
 * m_unknownId.
 *
 * Also creates the BOS_ / EOS_ factors, records them in m_sentenceStart /
 * m_sentenceEnd and the per-factor-type arrays, and stores their LM ids
 * (from GetLmID) in m_lmtb_sentenceStart / m_lmtb_sentenceEnd and in the table.
 *
 * (Original author's note: copied from the SRI LM class; a shared template
 * would be preferable.)
 *
 * @param factorCollection  collection that receives the new factors.
 */
void LanguageModelIRST::CreateFactors(FactorCollection &factorCollection)
{
  typedef std::map<size_t, int> FactorToLmId;

  // factor id -> LM code, gathered first because the largest factor id (and
  // therefore the table size) is unknown until all words are added.
  FactorToLmId codeByFactor;
  size_t highestId = 0;

  // Walk the LM dictionary (original note: at the level of micro tags).
  dictionary_iter dictWalker(m_lmtb->getDict());
  for (dict_entry *item = dictWalker.next(); item != NULL; item = dictWalker.next()) {
    const size_t id =
      factorCollection.AddFactor(Output, m_factorType, item->word)->GetId();
    codeByFactor[id] = item->code;
    highestId = std::max(highestId, id);
  }

  // Sentence-start marker: the map entry is written first and the member is
  // then read back from it, so both hold the same (int-converted) value.
  m_sentenceStart = factorCollection.AddFactor(Output, m_factorType, BOS_);
  const size_t bosId = m_sentenceStart->GetId();
  codeByFactor[bosId] = GetLmID(BOS_);
  m_lmtb_sentenceStart = codeByFactor[bosId];
  highestId = std::max(highestId, bosId);
  m_sentenceStartArray[m_factorType] = m_sentenceStart;

  // Sentence-end marker, handled the same way.
  m_sentenceEnd = factorCollection.AddFactor(Output, m_factorType, EOS_);
  const size_t eosId = m_sentenceEnd->GetId();
  codeByFactor[eosId] = GetLmID(EOS_);
  m_lmtb_sentenceEnd = codeByFactor[eosId];
  highestId = std::max(highestId, eosId);
  m_sentenceEndArray[m_factorType] = m_sentenceEnd;

  // Size the table, default every slot to the unknown-word id, then copy the
  // collected codes over it.
  m_lmIdLookup.resize(highestId + 1);
  std::fill(m_lmIdLookup.begin(), m_lmIdLookup.end(), m_unknownId);
  for (FactorToLmId::iterator entry = codeByFactor.begin();
       entry != codeByFactor.end(); ++entry) {
    m_lmIdLookup[entry->first] = entry->second;
  }
}
/**
 * Populate this word's factors from a '|'-separated string.
 *
 * The string is split on '|'; each piece is looked up (or created) in
 * `vocab` and stored, in order, in m_factors. Remaining slots up to
 * MAX_NUM_FACTORS are set to NULL.
 *
 * If the string contains more than MAX_NUM_FACTORS pieces, only the first
 * MAX_NUM_FACTORS are stored and the surplus is ignored. Previously the
 * surplus was written past index MAX_NUM_FACTORS - 1, i.e. beyond the range
 * this function itself treats as the extent of m_factors.
 *
 * @param vocab   factor collection used to look up or create each factor.
 * @param system  passed through to FactorCollection::AddFactor.
 * @param str     '|'-separated factor strings for this word.
 */
void Word::CreateFromString(FactorCollection &vocab, const System &system,
                            const std::string &str)
{
  const vector<string> toks = Tokenize(str, "|");

  // Never store more factors than there are slots.
  size_t numFactors = toks.size();
  if (numFactors > MAX_NUM_FACTORS) {
    numFactors = MAX_NUM_FACTORS;
  }

  // Every factor created here is a terminal, hence the `false` flag.
  for (size_t i = 0; i < numFactors; ++i) {
    m_factors[i] = vocab.AddFactor(toks[i], system, false);
  }

  // null the rest
  for (size_t i = numFactors; i < MAX_NUM_FACTORS; ++i) {
    m_factors[i] = NULL;
  }
}