Example #1
0
void MonolingualModel::readVocab(const string& training_file) {
    ifstream infile(training_file);

    if (!infile.is_open()) {
        throw runtime_error("couldn't open file " + training_file);
    }

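    // rebuild the vocabulary from scratch; each whitespace-separated token updates its vocabulary entry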
    vocabulary.clear();

    string word;
    while (infile >> word) {
        addWordToVocab(word);
    }

    if (config.verbose)
        cout << "Vocabulary size: " << vocabulary.size() << endl;

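    // prune the vocabulary (reduceVocab typically drops words below a minimum count)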
    reduceVocab();

    if (config.verbose)
        cout << "Reduced vocabulary size: " << vocabulary.size() << endl;

    training_words = 0; // count number of words in the training file
    for (auto it = vocabulary.begin(); it != vocabulary.end(); ++it) {
        training_words += it->second.count;
    }

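    // build the Huffman tree (hierarchical softmax) and the unigram table (negative sampling)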
    createBinaryTree();
    initUnigramTable();
}
Example #2
0
void Vocabulary::loadFromTrainFile(const char * train_file)
{
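  // Build the vocabulary from the tagged corpus: document tags when m_doctag is set, document words otherwise.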
  char * word;
  TaggedBrownCorpus corpus(train_file);
  long long a, i, k;
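  // reset the word hash table and start with an empty vocabulary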
  for (a = 0; a < vocab_hash_size; a++) m_vocab_hash[a] = -1;
  m_vocab_size = 0;
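  // reserve the first vocabulary slot for the end-of-sentence token, as in word2vec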
  if(!m_doctag) addWordToVocab((char *)"</s>");
  TaggedDocument * doc = NULL;
  while ((doc = corpus.next()) != NULL) {
    if(m_doctag) {  //for doc tag
      word = doc->m_tag;
      m_train_words++;
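      // each tag is added to the vocabulary once; repeated tags are not re-counted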
      i = searchVocab(word);
      if (i == -1) {
        a = addWordToVocab(word);
        m_vocab[a].cn = 1;
      }
    } else { // for doc words
      for(k = 0; k < doc->m_word_num; k++){
        word = doc->m_words[k];
        m_train_words++;
        if (m_train_words % 100000 == 0)
        {
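          // progress report; character 13 is a carriage return, so the counter overwrites itself in place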
          printf("%lldK%c", m_train_words / 1000, 13);
          fflush(stdout);
        }
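        // add unseen words with count 1, otherwise increment the existing count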
        i = searchVocab(word);
        if (i == -1) {
          a = addWordToVocab(word);
          m_vocab[a].cn = 1;
        } else m_vocab[i].cn++;
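        // shrink the vocabulary whenever it exceeds 70% of the hash table size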
        if (m_vocab_size > vocab_hash_size * 0.7) reduceVocab();
      }
      m_train_words--;
    }
  }
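  // for a word vocabulary, sort by frequency and report final statistics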
  if(!m_doctag)
  {
    sortVocab();
    printf("Vocab size: %lld\n", m_vocab_size);
    printf("Words in train file: %lld\n", m_train_words);
  }
}