Exemplo n.º 1
0
  // Returns the sum of element-wise products of the two rows of M selected
  // by the vocabulary positions of st1 and st2.
  //
  // Each string is looked up with searchVocab(); when a lookup yields -1
  // (presumably "not in the vocabulary" - confirm against searchVocab) the
  // function returns 0.0 immediately and `dist` is left unmodified.
  //
  // `a`, `b`, `dist`, `size` and `M` are not declared in this function, so
  // they are defined elsewhere (presumably as members of `distance`). They
  // are written here exactly as before, because other code may read them.
  //
  // st1, st2: strings passed to searchVocab().
  // Returns the accumulated sum, or 0.0 if either lookup fails.
  float distance::getSimilarity(char * st1, char * st2)
  {
      b = 0;

      const int pos1 = searchVocab(st1);
      if (pos1 == -1) return 0.0;

      // The second lookup only happens once the first one has succeeded.
      const int pos2 = searchVocab(st2);
      if (pos2 == -1) return 0.0;

      // Row r of M starts at offset r * size and holds `size` entries.
      dist = 0;
      a = 0;
      while (a < size)
      {
          dist += M[a + pos1 * size] * M[a + pos2 * size];
          a++;
      }
      return dist;
  }
Exemplo n.º 2
0
// Populates the vocabulary from the documents of a training corpus.
//
// The hash table is reset to -1 and m_vocab_size to 0 before scanning.
// What is collected depends on m_doctag:
//   - m_doctag set:   each document's m_tag is added if searchVocab() does
//                     not find it; nothing is sorted or printed at the end.
//   - m_doctag unset: "</s>" is added first, then every entry of m_words is
//                     counted; afterwards the vocabulary is sorted and a
//                     summary is printed.
//
// train_file: path handed to the TaggedBrownCorpus constructor.
void Vocabulary::loadFromTrainFile(const char * train_file)
{
  TaggedBrownCorpus corpus(train_file);

  // Mark every hash slot as empty.
  for (long long slot = 0; slot < vocab_hash_size; slot++) m_vocab_hash[slot] = -1;
  m_vocab_size = 0;

  if (!m_doctag) addWordToVocab(const_cast<char *>("</s>"));

  for (TaggedDocument * doc = corpus.next(); doc != NULL; doc = corpus.next()) {
    if (m_doctag) {
      // Tag mode: one vocabulary entry per document tag. A tag that is
      // already present is left as it is (its count is not incremented).
      char * tag = doc->m_tag;
      m_train_words++;
      const long long found = searchVocab(tag);
      if (found == -1) {
        const long long added = addWordToVocab(tag);
        m_vocab[added].cn = 1;
      }
      continue;
    }

    // Word mode: count every word of the document.
    for (long long w = 0; w < doc->m_word_num; w++) {
      char * token = doc->m_words[w];
      m_train_words++;

      // Progress report every 100000 words; character 13 is a carriage
      // return, so the next report starts at the beginning of the line.
      if (m_train_words % 100000 == 0 && !m_doctag)
      {
        printf("%lldK%c", m_train_words / 1000, 13);
        fflush(stdout);
      }

      const long long found = searchVocab(token);
      if (found != -1) {
        m_vocab[found].cn++;
      } else {
        const long long added = addWordToVocab(token);
        m_vocab[added].cn = 1;
      }

      // Shrink the vocabulary once it exceeds 70% of the hash table size.
      if (m_vocab_size > vocab_hash_size * 0.7) reduceVocab();
    }

    // NOTE(review): one word is subtracted per document - presumably to
    // discount a trailing end-of-sentence entry in m_words; confirm.
    m_train_words--;
  }

  if (m_doctag) return;

  sortVocab();
  printf("Vocab size: %lld\n", m_vocab_size);
  printf("Words in train file: %lld\n", m_train_words);
}
Exemplo n.º 3
0
// output features to files
void OUTProcedure() {

  // get the vocab
  map<string, int>* vocab = &(InvertedIndex::instance()->vocab);

  multimap<int, int>::const_iterator doc_ent_it = InvertedIndex::instance()->doc_entities.begin();
  // go through all the document_entities
  for(; doc_ent_it != InvertedIndex::instance()->doc_entities.end(); ++doc_ent_it) {
    // go through all the entities
    multimap<int, int>::const_iterator ent_it = InvertedIndex::instance()->entities.begin((*doc_ent_it).second);
    cout << "Entity: ";
    for(; ent_it != InvertedIndex::instance()->entities.end((*doc_ent_it).second); ++ent_it) {
      // map the entities into words using vocab
      cout << searchVocab((*ent_it).second, vocab) << " ";
    }
    // go through all the syns
    multimap<int, int>::const_iterator ent_syn_it = InvertedIndex::instance()->entity_syns.begin((*doc_ent_it).second);
    for(; ent_syn_it != InvertedIndex::instance()->entity_syns.end((*doc_ent_it).second); ++ent_syn_it) {
      cout << "Syn: ";
      multimap<int, int>::const_iterator syn_it = InvertedIndex::instance()->syns.begin((*ent_syn_it).second);
      for(; syn_it != InvertedIndex::instance()->syns.end((*ent_syn_it).second); ++syn_it) {
	// map the syns into words using vocab
	cout << searchVocab((*syn_it).second, vocab) << " ";
      }
    }
    // get the weight of that entity
    multimap<int, int>::const_iterator w_it = InvertedIndex::instance()->entity_weights.begin((*doc_ent_it).second);
    cout << "weight: ";
    for(; w_it != InvertedIndex::instance()->entity_weights.end((*doc_ent_it).second); ++w_it) {
      // output the weights
      cout << (*w_it).second << " ";
    }      
    cout << "\n"; 
  }
  
}