void MonolingualModel::readVocab(const string& training_file) {
    ifstream infile(training_file);

    if (!infile.is_open()) {
        throw runtime_error("couldn't open file " + training_file);
    }

    vocabulary.clear();

    string word;
    while (infile >> word) {
        addWordToVocab(word);
    }

    if (config.verbose)
        cout << "Vocabulary size: " << vocabulary.size() << endl;

    // drop words whose frequency is below the configured threshold
    reduceVocab();

    if (config.verbose)
        cout << "Reduced vocabulary size: " << vocabulary.size() << endl;

    // count number of words in the training file
    training_words = 0;
    for (auto it = vocabulary.begin(); it != vocabulary.end(); ++it) {
        training_words += it->second.count;
    }

    createBinaryTree();   // Huffman tree for hierarchical softmax
    initUnigramTable();   // unigram table for negative sampling
}
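// A minimal, self-contained sketch of the same pattern as readVocab above:
// stream whitespace-separated tokens into a count table, then prune rare
// entries the way reduceVocab does. The min_count threshold, the file name,
// and the buildVocab name are illustrative assumptions, not part of the
// code above.
#include <fstream>
#include <iostream>
#include <stdexcept>
#include <string>
#include <unordered_map>

std::unordered_map<std::string, long long>
buildVocab(const std::string& training_file, long long min_count) {
    std::ifstream infile(training_file);
    if (!infile.is_open()) {
        throw std::runtime_error("couldn't open file " + training_file);
    }

    std::unordered_map<std::string, long long> counts;
    std::string word;
    while (infile >> word) {
        ++counts[word];   // operator>> skips runs of whitespace between tokens
    }

    // prune rare words (the role reduceVocab plays above)
    for (auto it = counts.begin(); it != counts.end(); ) {
        if (it->second < min_count) it = counts.erase(it);
        else ++it;
    }
    return counts;
}

int main() {
    auto vocab = buildVocab("corpus.txt", 5);  // hypothetical file and cutoff
    std::cout << "Vocabulary size: " << vocab.size() << std::endl;
}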
void Vocabulary::loadFromTrainFile(const char * train_file) {
    char * word;
    TaggedBrownCorpus corpus(train_file);
    long long a, i, k;

    // reset the hash table (-1 marks an empty slot) and the vocab
    for (a = 0; a < vocab_hash_size; a++) m_vocab_hash[a] = -1;
    m_vocab_size = 0;
    if (!m_doctag) addWordToVocab((char *)"</s>");

    TaggedDocument * doc = NULL;
    while ((doc = corpus.next()) != NULL) {
        if (m_doctag) { // for doc tag
            word = doc->m_tag;
            m_train_words++;
            i = searchVocab(word);
            if (i == -1) {
                a = addWordToVocab(word);
                m_vocab[a].cn = 1;
            }
        } else { // for doc words
            for (k = 0; k < doc->m_word_num; k++) {
                word = doc->m_words[k];
                m_train_words++;
                if (!m_doctag && m_train_words % 100000 == 0) {
                    // progress report; character 13 is '\r'
                    printf("%lldK%c", m_train_words / 1000, 13);
                    fflush(stdout);
                }
                i = searchVocab(word);
                if (i == -1) {
                    a = addWordToVocab(word);
                    m_vocab[a].cn = 1;
                } else {
                    m_vocab[i].cn++;
                }
                // keep the hash table below a 0.7 load factor
                if (m_vocab_size > vocab_hash_size * 0.7) reduceVocab();
            }
            m_train_words--;  // exclude one token per document from the count
        }
    }

    if (!m_doctag) {
        sortVocab();
        printf("Vocab size: %lld\n", m_vocab_size);
        printf("Words in train file: %lld\n", m_train_words);
    }
}
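// A minimal, self-contained sketch of the word2vec-style open-addressing
// hash that searchVocab/addWordToVocab above appear to rely on (m_vocab_hash,
// vocab_hash_size, and the 0.7 load-factor check suggest it). The names,
// the table size, and the 257 multiplier follow the original word2vec
// convention; treat them as assumptions about this codebase, not facts.
#include <cstring>
#include <string>
#include <vector>

const long long VOCAB_HASH_SIZE = 30000000;  // slots in the hash table

// polynomial rolling hash over the word's bytes
long long getWordHash(const char *word) {
    unsigned long long hash = 0;
    for (size_t i = 0; i < strlen(word); i++)
        hash = hash * 257 + word[i];
    return (long long)(hash % VOCAB_HASH_SIZE);
}

// linear probing: -1 marks an empty slot; otherwise the slot stores the
// word's index into the vocab array. reduceVocab above fires once the
// table passes a 0.7 load factor, which keeps these probe chains short.
long long searchVocabSketch(const char *word,
                            const std::vector<long long> &vocab_hash,
                            const std::vector<std::string> &vocab) {
    long long hash = getWordHash(word);
    while (true) {
        if (vocab_hash[hash] == -1) return -1;               // not present
        if (vocab[vocab_hash[hash]] == word) return vocab_hash[hash];
        hash = (hash + 1) % VOCAB_HASH_SIZE;                 // probe next slot
    }
}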