float distance::getSimilarity(char * st1, char * st2) { // float vec1[max_size]; // float vec2[max_size]; // float len1=0; // float len2=0; int pos1 = -1; int pos2 = -1; b = 0; pos1 = searchVocab(st1); if (pos1 == -1) { return 0.0; } pos2 = searchVocab(st2); if (pos2 == -1) { return 0.0; } dist = 0; for (a = 0; a < size; a++) { dist += M[a + pos1 * size] * M[a + pos2 * size] ; } return dist; }
void Vocabulary::loadFromTrainFile(const char * train_file) { char * word; TaggedBrownCorpus corpus(train_file); long long a, i, k; for (a = 0; a < vocab_hash_size; a++) m_vocab_hash[a] = -1; m_vocab_size = 0; if(!m_doctag) addWordToVocab((char *)"</s>"); TaggedDocument * doc = NULL; while ((doc = corpus.next()) != NULL) { if(m_doctag) { //for doc tag word = doc->m_tag; m_train_words++; i = searchVocab(word); if (i == -1) { a = addWordToVocab(word); m_vocab[a].cn = 1; } } else { // for doc words for(k = 0; k < doc->m_word_num; k++){ word = doc->m_words[k]; m_train_words++; if (!m_doctag && m_train_words % 100000 == 0) { printf("%lldK%c", m_train_words / 1000, 13); fflush(stdout); } i = searchVocab(word); if (i == -1) { a = addWordToVocab(word); m_vocab[a].cn = 1; } else m_vocab[i].cn++; if (m_vocab_size > vocab_hash_size * 0.7) reduceVocab(); } m_train_words--; } } if(!m_doctag) { sortVocab(); printf("Vocab size: %lld\n", m_vocab_size); printf("Words in train file: %lld\n", m_train_words); } }
// output features to files void OUTProcedure() { // get the vocab map<string, int>* vocab = &(InvertedIndex::instance()->vocab); multimap<int, int>::const_iterator doc_ent_it = InvertedIndex::instance()->doc_entities.begin(); // go through all the document_entities for(; doc_ent_it != InvertedIndex::instance()->doc_entities.end(); ++doc_ent_it) { // go through all the entities multimap<int, int>::const_iterator ent_it = InvertedIndex::instance()->entities.begin((*doc_ent_it).second); cout << "Entity: "; for(; ent_it != InvertedIndex::instance()->entities.end((*doc_ent_it).second); ++ent_it) { // map the entities into words using vocab cout << searchVocab((*ent_it).second, vocab) << " "; } // go through all the syns multimap<int, int>::const_iterator ent_syn_it = InvertedIndex::instance()->entity_syns.begin((*doc_ent_it).second); for(; ent_syn_it != InvertedIndex::instance()->entity_syns.end((*doc_ent_it).second); ++ent_syn_it) { cout << "Syn: "; multimap<int, int>::const_iterator syn_it = InvertedIndex::instance()->syns.begin((*ent_syn_it).second); for(; syn_it != InvertedIndex::instance()->syns.end((*ent_syn_it).second); ++syn_it) { // map the syns into words using vocab cout << searchVocab((*syn_it).second, vocab) << " "; } } // get the weight of that entity multimap<int, int>::const_iterator w_it = InvertedIndex::instance()->entity_weights.begin((*doc_ent_it).second); cout << "weight: "; for(; w_it != InvertedIndex::instance()->entity_weights.end((*doc_ent_it).second); ++w_it) { // output the weights cout << (*w_it).second << " "; } cout << "\n"; } }