// Sorts the vocabulary by frequency using word counts, frequent->infrequent void Vocabulary::sortVocab() { int a, size; unsigned int hash; // Sort the vocabulary and keep </s> at the first position qsort(&m_vocab[1], m_vocab_size - 1, sizeof(struct vocab_word_t), vocabCompare); //reduce words and re-hash for (a = 0; a < vocab_hash_size; a++) m_vocab_hash[a] = -1; size = m_vocab_size; m_train_words = 0; for (a = 0; a < size; a++) { // Words occuring less than min_count times will be discarded from the vocab if (m_vocab[a].cn < m_min_count) { m_vocab_size--; free(m_vocab[m_vocab_size].word); m_vocab[m_vocab_size].word = NULL; } else { // Hash will be re-computed, as after the sorting it is not actual hash = getWordHash(m_vocab[a].word); while (m_vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; m_vocab_hash[hash] = a; m_train_words += m_vocab[a].cn; } } m_train_words -= m_vocab[0].cn; //exclude <s> m_vocab = (struct vocab_word_t *)realloc(m_vocab, (m_vocab_size + 1) * sizeof(struct vocab_word_t)); }
void sortVocab(){ int a, size; unsigned int hash; //doesn't use </s> qsort(vocab, vocab_size, sizeof(vocabWord), VocabCompare); for(a=0;a<hash_size;a++){ vocab_hash[a] = -1; } size = vocab_size; train_words=0; for(a=0;a<size;a++){ if(vocab[a].cn < min_count){ vocab_size--; free(vocab[a].word); vocab[a].word = (char*)0; } else{ hash = getWordHash(vocab[a].word); while(vocab_hash[hash] != -1){ vocab_hash[hash] = a; train_words += vocab[a].cn; } } } for(a=0;a<vocab_size;a++){ vocab[a].code = (char*) calloc(MAX_CODE_LENGTH, sizeof(char)); vocab[a].point = (int*) calloc(MAX_CODE_LENGTH, sizeof(int)); } }
int addtoCurrentVocab(char word[]){ unsigned int hash; unsigned int length = strlen(word) + 1; vocab[vocab_size].word = (char*) calloc(length, sizeof(char)); strcpy(vocab[vocab_size].word, word); vocab[vocab_size].cn = 0; vocab_size++; hash = getWordHash(word); vocab_hash[hash] = vocab_size - 1; return vocab_size - 1; }
// Returns position of a word in the vocabulary; if the word is not found, returns -1 long long Vocabulary::searchVocab(const char *word) { unsigned int hash = getWordHash(word); while (1) { if (m_vocab_hash[hash] == -1 || m_vocab[m_vocab_hash[hash]].word == NULL) return -1; if (!strcmp(word, m_vocab[m_vocab_hash[hash]].word)) return m_vocab_hash[hash]; hash = (hash + 1) % vocab_hash_size; } return -1; }
/* Returns position of a word in the vocabulary; if the word is not found, * returns -1 */ int distance::searchVocab(char *word) { unsigned int hash = getWordHash(word); int l_b; while (1) { if (vocab_hash[hash] == -1) return -1; // if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash]; l_b = vocab_hash[hash]; if (!strcmp(&vocab[l_b * max_w], word)) return vocab_hash[hash]; hash = (hash + 1) % vocab_hash_size; } return -1; }
int searchCurrentVocab(char word[]){ //uses linear probing approach for hash table unsigned int i; unsigned int hash = getWordHash(word); for(i = hash; 1; i = (i+1) % hash_size){ if(vocab_hash[i] == -1){ return -1; } if(!strcmp(word, vocab[vocab_hash[i]].word)){ return vocab_hash[i]; } } //shouldn't reach here ever return -1; }
long long Vocabulary::addWordToVocab(const char *word) { unsigned int hash, length = strlen(word) + 1; if (length > MAX_STRING) length = MAX_STRING; m_vocab[m_vocab_size].word = (char *)calloc(length, sizeof(char)); strcpy(m_vocab[m_vocab_size].word, word); m_vocab[m_vocab_size].cn = 0; m_vocab_size++; // Reallocate memory if needed if (m_vocab_size + 2 >= m_vocab_capacity) { m_vocab_capacity += 1000; m_vocab = (struct vocab_word_t *)realloc(m_vocab, m_vocab_capacity * sizeof(struct vocab_word_t)); } hash = getWordHash(word); while (m_vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; m_vocab_hash[hash] = m_vocab_size - 1; return m_vocab_size - 1; }
// Reduces the vocabulary by removing infrequent tokens void Vocabulary::reduceVocab() { int a, b = 0; unsigned int hash; for (a = 0; a < m_vocab_size; a++) if (m_vocab[a].cn > m_min_reduce) { m_vocab[b].cn = m_vocab[a].cn; m_vocab[b].word = m_vocab[a].word; b++; } else free(m_vocab[a].word); m_vocab_size = b; for (a = 0; a < vocab_hash_size; a++) m_vocab_hash[a] = -1; for (a = 0; a < m_vocab_size; a++) { // Hash will be re-computed, as it is not actual hash = getWordHash(m_vocab[a].word); while (m_vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; m_vocab_hash[hash] = a; } fflush(stdout); m_min_reduce++; }
/* Records position l_pos for `word` in the hash table. Starting from the
 * word's hash, slots are probed linearly (wrapping at vocab_hash_size) until
 * an empty one (-1) is found, and l_pos is stored there. */
void distance::addWordToHash(char *word, int l_pos) {
  unsigned int slot = getWordHash(word);
  for (; vocab_hash[slot] != -1; slot = (slot + 1) % vocab_hash_size) {
    /* occupied slot: keep probing */
  }
  vocab_hash[slot] = l_pos;
}