Example #1
0
// Sorts the vocabulary by word count (frequent -> infrequent), discards words
// occurring fewer than m_min_count times, and rebuilds the hash table.
//
// Entry 0 (</s>) is excluded from the sort and is never discarded, whatever its
// count. m_train_words is recomputed as the sum of the counts of the surviving
// words, not counting entry 0.
void Vocabulary::sortVocab()
{
  int a, size;
  unsigned int hash;
  // Sort the vocabulary and keep </s> at the first position. With fewer than
  // two entries there is nothing to sort, and skipping the call avoids handing
  // qsort a non-positive element count.
  if (m_vocab_size > 1)
    qsort(&m_vocab[1], m_vocab_size - 1, sizeof(struct vocab_word_t), vocabCompare);
  // Clear the hash table; the surviving words are re-inserted below
  for (a = 0; a < vocab_hash_size; a++) m_vocab_hash[a] = -1;
  size = m_vocab_size;
  m_train_words = 0;
  for (a = 0; a < size; a++)
  {
    // Words occurring less than min_count times are discarded from the vocab.
    // Entry 0 is exempt: it was not part of the sort, so dropping it would
    // shrink the vocabulary by one and cut off a word that should be kept.
    if (a != 0 && m_vocab[a].cn < m_min_count)
    {
      // After the descending sort the discarded words form the tail of the
      // array, so decrementing the size is enough to drop them.
      m_vocab_size--;
      free(m_vocab[a].word);
      m_vocab[a].word = NULL;
    }
    else
    {
      // The hash is recomputed because the sort changed the word positions
      hash = getWordHash(m_vocab[a].word);
      while (m_vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
      m_vocab_hash[hash] = a;
      m_train_words += m_vocab[a].cn;
    }
  }
  // Exclude </s> from the training word count (only if entry 0 exists)
  if (size > 0) m_train_words -= m_vocab[0].cn;
  // Shrink the array to the surviving entries plus one spare slot. If realloc
  // fails the old block is still valid, so keep it instead of losing the pointer.
  struct vocab_word_t *shrunk =
      (struct vocab_word_t *)realloc(m_vocab, (m_vocab_size + 1) * sizeof(struct vocab_word_t));
  if (shrunk != NULL) m_vocab = shrunk;
}
Example #2
0
/*
 * Sorts the vocabulary with VocabCompare, discards every word whose count is
 * below min_count, rebuilds the hash table for the surviving words, recomputes
 * train_words as the sum of their counts, and finally allocates the code and
 * point buffers for each surviving entry.
 */
void sortVocab(){
    int a, size;
    unsigned int hash;
    //doesn't use </s>
    qsort(vocab, vocab_size, sizeof(vocabWord), VocabCompare);
    /* empty the hash table; kept words are re-inserted below */
    for(a=0;a<hash_size;a++){
        vocab_hash[a] = -1;
    }
    size = vocab_size;
    train_words=0;
    for(a=0;a<size;a++){
        if(vocab[a].cn < min_count){
            /* word is too rare: release it and shrink the vocabulary */
            vocab_size--;
            free(vocab[a].word);
            vocab[a].word = (char*)0;
        }
        else{
            /* linear probing: advance to the next free slot, then store the
             * word's new index there */
            hash = getWordHash(vocab[a].word);
            while(vocab_hash[hash] != -1){
                hash = (hash + 1) % hash_size;
            }
            vocab_hash[hash] = a;
            train_words += vocab[a].cn;
        }
    }
    /* NOTE(review): this assumes VocabCompare orders by descending count, so
     * the discarded entries are exactly the tail [vocab_size, size) - confirm */
    for(a=0;a<vocab_size;a++){
        vocab[a].code = (char*) calloc(MAX_CODE_LENGTH, sizeof(char));
        vocab[a].point = (int*) calloc(MAX_CODE_LENGTH, sizeof(int));
    }
}
Example #3
0
/*
 * Appends a copy of `word` to the vocabulary with a count of 0, registers it in
 * the hash table, and returns its index in `vocab`.
 *
 * The caller must make sure `vocab` has room for one more entry; no capacity
 * check is done here.
 */
int addtoCurrentVocab(char word[]){
    unsigned int hash;
    unsigned int length = strlen(word) + 1;
    vocab[vocab_size].word = (char*) calloc(length, sizeof(char));
    strcpy(vocab[vocab_size].word, word);
    vocab[vocab_size].cn = 0;
    vocab_size++;
    hash = getWordHash(word);
    /* linear probing: step past occupied slots so that a colliding word does
     * not overwrite an existing entry (matches the probing done on lookup) */
    while(vocab_hash[hash] != -1){
        hash = (hash + 1) % hash_size;
    }
    vocab_hash[hash] = vocab_size - 1;
    return vocab_size - 1;
}
Example #4
0
// Looks up a word in the vocabulary hash table using linear probing.
// Returns the word's index in m_vocab, or -1 when the probe reaches an empty
// slot or a slot whose vocabulary entry has no word (NULL).
long long Vocabulary::searchVocab(const char *word)
{
  for (unsigned int slot = getWordHash(word); ; slot = (slot + 1) % vocab_hash_size)
  {
    // An empty slot ends the probe sequence: the word is not present
    if (m_vocab_hash[slot] == -1) return -1;

    const char *candidate = m_vocab[m_vocab_hash[slot]].word;
    // An entry without a word also ends the search
    if (candidate == NULL) return -1;

    if (strcmp(word, candidate) == 0) return m_vocab_hash[slot];
  }
}
Example #5
0
  /* Looks up a word through the hash table with linear probing.
   * Words are read from the flat `vocab` buffer at offset index * max_w.
   * Returns the word's index, or -1 once an empty slot is reached. */
  int distance::searchVocab(char *word) 
  {
      unsigned int slot = getWordHash(word);
      for (;;)
      {
          /* an empty slot terminates the probe sequence */
          if (vocab_hash[slot] == -1) break;
          const int entry = vocab_hash[slot];
          if (strcmp(&vocab[entry * max_w], word) == 0) return entry;
          slot = (slot + 1) % vocab_hash_size;
      }
      return -1;
  }
Example #6
0
/*
 * Finds `word` in the vocabulary through the hash table (linear probing).
 * Returns the word's index in `vocab`, or -1 when an empty slot is reached
 * before a match is found.
 */
int searchCurrentVocab(char word[]){
    unsigned int slot = getWordHash(word);
    /* walk the probe sequence until an empty slot ends it */
    while(vocab_hash[slot] != -1){
        if(strcmp(word, vocab[vocab_hash[slot]].word) == 0){
            return vocab_hash[slot];
        }
        slot = (slot + 1) % hash_size;
    }
    return -1;
}
Example #7
0
// Appends a copy of `word` to the vocabulary with a count of 0, grows the
// vocabulary array when it is nearly full, registers the new entry in the hash
// table, and returns its index in m_vocab.
//
// Words longer than MAX_STRING - 1 characters are stored truncated to
// MAX_STRING - 1 characters.
long long Vocabulary::addWordToVocab(const char *word)
{
  unsigned int hash, length = strlen(word) + 1;
  if (length > MAX_STRING) length = MAX_STRING;
  char *stored = (char *)calloc(length, sizeof(char));
  // Copy only what fits in the (possibly clamped) buffer; copying the whole
  // word would write past the allocation for over-long words.
  memcpy(stored, word, length - 1);
  stored[length - 1] = '\0';
  m_vocab[m_vocab_size].word = stored;
  m_vocab[m_vocab_size].cn = 0;
  m_vocab_size++;
  // Reallocate memory if needed
  if (m_vocab_size + 2 >= m_vocab_capacity)
  {
    m_vocab_capacity += 1000;
    m_vocab = (struct vocab_word_t *)realloc(m_vocab, m_vocab_capacity * sizeof(struct vocab_word_t));
  }
  // Hash the stored string so the slot agrees with a later re-hash of the
  // stored words; this is the same string as `word` unless it was truncated.
  hash = getWordHash(stored);
  // Linear probing: take the first free slot
  while (m_vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
  m_vocab_hash[hash] = m_vocab_size - 1;
  return m_vocab_size - 1;
}
Example #8
0
// Shrinks the vocabulary: keeps only the words whose count is greater than
// m_min_reduce, compacts them to the front of m_vocab, rebuilds the hash table,
// and then raises m_min_reduce by one for the next call.
void Vocabulary::reduceVocab()
{
  int kept = 0;
  for (int src = 0; src < m_vocab_size; src++)
  {
    if (m_vocab[src].cn > m_min_reduce)
    {
      // Frequent enough: move the entry down into the next free position
      m_vocab[kept].cn = m_vocab[src].cn;
      m_vocab[kept].word = m_vocab[src].word;
      kept++;
    }
    else
    {
      // Too rare: release the word's storage
      free(m_vocab[src].word);
    }
  }
  m_vocab_size = kept;

  // Empty the hash table before re-inserting the survivors
  for (int slot = 0; slot < vocab_hash_size; slot++)
  {
    m_vocab_hash[slot] = -1;
  }

  // The positions changed, so every remaining word is hashed again
  for (int idx = 0; idx < m_vocab_size; idx++)
  {
    unsigned int probe = getWordHash(m_vocab[idx].word);
    while (m_vocab_hash[probe] != -1)
    {
      probe = (probe + 1) % vocab_hash_size;
    }
    m_vocab_hash[probe] = idx;
  }

  fflush(stdout);
  m_min_reduce++;
}
Example #9
0
 /* Registers position l_pos for `word` in the hash table: starting at the
  * word's hash, takes the first free slot found by linear probing. */
 void distance::addWordToHash(char *word, int l_pos) {
   unsigned int slot;
   for (slot = getWordHash(word); vocab_hash[slot] != -1; slot = (slot + 1) % vocab_hash_size) {
     /* occupied slot: keep probing */
   }
   vocab_hash[slot] = l_pos;
 }