Beispiel #1
0
/**
 * Registers every word of the RandLM vocabulary as an output factor and
 * builds m_randlm_ids_vec, a lookup table indexed by factor id whose entries
 * are the corresponding RandLM word ids.
 *
 * The LM's BOS and EOS strings are registered as factors too and cached in
 * m_sentenceStart / m_sentenceEnd and in the matching word objects. Their
 * factor ids only enlarge the table; this function does not insert them into
 * the id map itself. Every table slot that the vocabulary walk did not
 * assign holds m_oov_id.
 *
 * NOTE: per the original author's remark this mirrors the SRI LM class and
 * is a candidate for a shared template function.
 *
 * @param factorCollection collection that receives the new factors
 */
void LanguageModelRandLM::CreateFactors(FactorCollection &factorCollection)
{
    typedef std::map<randlm::Word, randlm::WordID>::const_iterator VocabIter;
    typedef std::map<size_t, randlm::WordID> IdMap;

    // Pass 1: factor id -> RandLM id, tracking the largest factor id seen so
    // the lookup vector can be sized afterwards.
    IdMap factorToLmId;
    size_t highestId = 0;
    for (VocabIter entry = m_lm->vocabStart(); entry != m_lm->vocabEnd(); ++entry) {
        const size_t id =
            factorCollection.AddFactor(Output, m_factorType, entry->first)->GetId();
        factorToLmId[id] = entry->second;
        highestId = std::max(highestId, id);
    }

    // Sentence-boundary factors: remember them and make sure the table is
    // large enough to be indexed by their ids.
    m_sentenceStart = factorCollection.AddFactor(Output, m_factorType, m_lm->getBOS());
    highestId = std::max<size_t>(highestId, m_sentenceStart->GetId());
    m_sentenceStartWord[m_factorType] = m_sentenceStart;

    m_sentenceEnd = factorCollection.AddFactor(Output, m_factorType, m_lm->getEOS());
    highestId = std::max<size_t>(highestId, m_sentenceEnd->GetId());
    m_sentenceEndWord[m_factorType] = m_sentenceEnd;

    // Pass 2: materialise the table, defaulting every slot to the OOV code
    // before copying the known mappings in.
    m_randlm_ids_vec.resize(highestId + 1);
    std::fill(m_randlm_ids_vec.begin(), m_randlm_ids_vec.end(), m_oov_id);
    for (IdMap::const_iterator known = factorToLmId.begin();
            known != factorToLmId.end(); ++known) {
        m_randlm_ids_vec[known->first] = known->second;
    }
}
Beispiel #2
0
/**
 * Populates this word's factors from its textual form.
 *
 * A string that starts with '[' and ends with ']' is flagged as a
 * non-terminal. For the single-bracket form "[X]" the label is the text
 * between the brackets. When a second '[' occurs after the first character
 * (as in "[X][Y]") the label is the text after that second '[' up to, but
 * not including, the final ']'. Any other string is used as-is and flagged
 * as a terminal.
 *
 * The resulting label is split on '|' and each piece is registered through
 * vocab.AddFactor and stored in m_factors at the piece's index.
 *
 * @param vocab  factor collection used to create/look up the factors
 * @param system passed through to AddFactor
 * @param str    textual representation of the word
 */
void Word::CreateFromString(FactorCollection &vocab,
                            const System &system,
                            const std::string &str)
{
  isNonTerminal = (str[0] == '[' && str[str.size() - 1] == ']');

  std::vector<std::string> pieces;
  if (!isNonTerminal) {
    pieces = Tokenize(str, "|");
  } else {
    // Start right after the second '[' if there is one, otherwise right
    // after the leading '['; in both cases stop before the closing ']'.
    const size_t secondOpen = str.find("[", 1);
    const size_t labelStart =
      (secondOpen != std::string::npos) ? secondOpen + 1 : 1;
    const std::string label = str.substr(labelStart, str.size() - labelStart - 1);
    pieces = Tokenize(label, "|");
  }

  // One factor per '|'-separated piece, in order.
  for (size_t idx = 0; idx < pieces.size(); ++idx) {
    m_factors[idx] = vocab.AddFactor(pieces[idx], system, isNonTerminal);
  }
}
Beispiel #3
0
/**
 * Registers every entry of the IRST LM dictionary as an output factor and
 * builds m_lmIdLookup, a table indexed by factor id whose entries are the
 * LM's integer codes.
 *
 * BOS_ and EOS_ are registered as factors as well; their codes are obtained
 * through GetLmID, stored in m_lmtb_sentenceStart / m_lmtb_sentenceEnd, and
 * written into the map (replacing whatever the dictionary walk stored for
 * the same factor id). Table slots with no mapping hold m_unknownId.
 *
 * NOTE: per the original author's remark this mirrors the SRI LM class and
 * is a candidate for a shared template function.
 *
 * @param factorCollection collection that receives the new factors
 */
void LanguageModelIRST::CreateFactors(FactorCollection &factorCollection)
{
	typedef std::map<size_t, int> CodeMap;

	CodeMap codeByFactor;
	size_t highestId = 0; // needed to size the lookup vector

	// Walk the LM dictionary (original note: "at the level of micro tags").
	dictionary_iter dictIt(m_lmtb->getDict());
	for (dict_entry *cur = dictIt.next(); cur != NULL; cur = dictIt.next())
	{
		const size_t id =
			factorCollection.AddFactor(Output, m_factorType, cur->word)->GetId();
		codeByFactor[id] = cur->code;
		highestId = std::max(highestId, id);
	}

	// Sentence start marker.
	m_sentenceStart = factorCollection.AddFactor(Output, m_factorType, BOS_);
	const size_t bosId = m_sentenceStart->GetId();
	codeByFactor[bosId] = GetLmID(BOS_);
	m_lmtb_sentenceStart = codeByFactor[bosId];
	highestId = std::max(highestId, bosId);
	m_sentenceStartArray[m_factorType] = m_sentenceStart;

	// Sentence end marker.
	m_sentenceEnd = factorCollection.AddFactor(Output, m_factorType, EOS_);
	const size_t eosId = m_sentenceEnd->GetId();
	codeByFactor[eosId] = GetLmID(EOS_);
	m_lmtb_sentenceEnd = codeByFactor[eosId];
	highestId = std::max(highestId, eosId);
	m_sentenceEndArray[m_factorType] = m_sentenceEnd;

	// Build the lookup vector: default everything to the unknown id, then
	// copy the known mappings in.
	m_lmIdLookup.resize(highestId + 1);
	std::fill(m_lmIdLookup.begin(), m_lmIdLookup.end(), m_unknownId);
	for (CodeMap::iterator known = codeByFactor.begin();
			known != codeByFactor.end(); ++known)
	{
		m_lmIdLookup[known->first] = known->second;
	}
}
Beispiel #4
0
/**
 * Populates this word's factors from a '|'-separated string.
 *
 * Each token is registered through vocab.AddFactor (with the last argument
 * fixed to false) and stored in m_factors at the token's index; all
 * remaining slots up to MAX_NUM_FACTORS are set to NULL.
 *
 * If the string contains more tokens than MAX_NUM_FACTORS, the surplus
 * tokens are ignored. Previously they were written past the slots that the
 * null-fill loop treats as the end of m_factors.
 *
 * @param vocab  factor collection used to create/look up the factors
 * @param system passed through to AddFactor
 * @param str    textual representation of the word
 */
void Word::CreateFromString(FactorCollection &vocab, const System &system,
    const std::string &str)
{
  const std::vector<std::string> toks = Tokenize(str, "|");

  // Fill from the tokens, but never index m_factors at or beyond
  // MAX_NUM_FACTORS.
  size_t i = 0;
  for (; i < toks.size() && i < MAX_NUM_FACTORS; ++i) {
    m_factors[i] = vocab.AddFactor(toks[i], system, false);
  }

  // null the rest
  for (; i < MAX_NUM_FACTORS; ++i) {
    m_factors[i] = NULL;
  }
}