void WordMatchSearchImpl::query(const QString &req, QVector<Service::Item *> *res) const { QSet<Service::Item*>* resSet = nullptr; QStringList words = req.split(' ', QString::SkipEmptyParts); // Quit if there are no words in query if (words.empty()) return; for (QString &w : words) { InvertedIndex::const_iterator lb, ub; lb = std::lower_bound (_invertedIndex.cbegin(), _invertedIndex.cend(), w, CaseInsensitiveCompare()); ub = std::upper_bound (_invertedIndex.cbegin(), _invertedIndex.cend(), w, CaseInsensitiveComparePrefix()); QSet<Service::Item*> tmpSet; while (lb!=ub) tmpSet.unite(lb++->second); if (resSet == nullptr) resSet = new QSet<Service::Item*>(tmpSet); else resSet->intersect(tmpSet); } if (resSet != nullptr) { for (Service::Item *s : *resSet) res->append(s); delete resSet; } }
// ____________________________________________________________________________ TEST(SearchMainTest, Main) { string query = "nice website"; InvertedIndex ii; ii.buildFromCsvFile("./simple.csv"); QueryProcessor qp(ii); vector<string> result = qp.answerQuery(query, 3); ASSERT_EQ(1, result.size()); query = "nice nice"; result = qp.answerQuery(query, 3); ASSERT_EQ(2, result.size()); }
void WordMatchSearchImpl::prepare() { // Build inverted index typedef QMap<QString, QSet<Service::Item*>> InvertedIndexMap; InvertedIndexMap invertedIndexMap; for (Service::Item *i : _indexRef) { QStringList words = i->title().split(QRegExp("\\W+"), QString::SkipEmptyParts); for (QString &w : words) invertedIndexMap[w].insert(i); } // Convert back to vector for fast random access search algorithms for (InvertedIndexMap::const_iterator i = invertedIndexMap.cbegin(); i != invertedIndexMap.cend(); ++i) _invertedIndex.push_back(QPair<QString, QSet<Service::Item*>>(i.key(), i.value())); std::sort(_invertedIndex.begin(), _invertedIndex.end(), CaseInsensitiveCompare()); _invertedIndex.squeeze(); }
// Read every word in this file. // Insert a pair into the hash table (word,fname) void processFile (const char *fname, InvertedIndex& inverted_index) { ifstream inputfile; // ifstream for reading from input file. inputfile.open (fname); string fnames(fname); // file name as a string object, not as a char * (c-style string, which is an array of characters with \0 at the end). // Tokenize the input. // Read one character at a time. // If the character is not in a-z or A-Z, terminate current string. char c; char curr_str[MAX_STRING_LEN]; int str_i = 0; // Index into curr_str. bool flush_it = false; // Whether we have a complete string to flush. while (inputfile.good()) { // Read one character, convert it to lowercase. inputfile.get(c); c = tolower(c); if (c >= 'a' && c <= 'z') { // c is a letter. curr_str[str_i] = c; str_i++; // Check over-length string. if (str_i >= MAX_STRING_LEN) { flush_it = true; } } else { // c is not a letter. // Create a new string if curr_str is non-empty. if (str_i>0) { flush_it = true; } } if (flush_it) { // Create the new string from curr_str. string the_str(curr_str,str_i); // cout << the_str << endl; // Insert the string-file_name tuple into the inverted index. inverted_index.add(the_str,fnames); // cout << "Add " << the_str << "," << fname << endl; // Reset state variables. str_i = 0; flush_it = false; } } }
//#include "runlength.cpp" int main (){ //for (int j = 10000 ; j < 2000000;j+=10000) { int j = 0; float factor = 0.0; //for (float factor = 0.1;factor<3;factor+=0.2) { cout << "executing with factor = " << factor << endl; InvertedIndex *ii = new InvertedIndex("dataset/gov500.invlist","dataset/gov500.invlistfreq","dataset/gov500.words",500000,j,0); ii->buildTreap(); size_t f_size = ii->estimateFreqSorted(); vector<size_t> t = ii->estimateDocSorted(); for (uint i = 0 ; i < t.size();i++) { cout << "Estimated doc_id " << t[i] << endl; } cout << "Estimated Frequency = " << f_size << endl; delete ii; //} //} //testRunLength(); }
void testOrder() { InvertedIndex index; const Term term1("foo"); index.insert(term1, 0); index.insert(term1, 2); index.insert(term1, 1); index.insert(term1, 5); index.insert(term1, 4); index.insert(term1, 3); index.insert(term1, 6); const Postings &postings = index.getPostings(term1); CPPUNIT_ASSERT(postings.size() == 7); Postings::const_iterator iter = postings.begin(); DocId prevId = *iter; ++iter; for(; iter != postings.end(); ++iter) { CPPUNIT_ASSERT(prevId < *iter); prevId = *iter; } CPPUNIT_ASSERT(prevId == 6); }
inline void predict(std::vector<int> &results, size_t k, const fv_t &query) const { InvertedIndex::result_t knn; m_inverted_index.knn(knn, k, query); results.clear(); for (auto i = knn.begin(); i != knn.end(); ++i) { results.push_back(m_centroid_labels[i->id]); } }
void train(const category_index_t &category_index, const std::vector<fv_t> &data) { for (auto l = category_index.begin(); l != category_index.end(); ++l) { fv_t centroid; vector_sum(centroid, l->second, data); vector_normalize_l2(centroid); m_centroids.push_back(centroid); m_centroid_labels.push_back(l->first); } m_inverted_index.build(&m_centroids); }
void FuzzySearchImpl::buildIndex() { _invertedIndex.clear(); _qGramIndex.clear(); // Build inverted index for (Service::Item *item : _indexRef) { QStringList words = item->title().split(QRegExp("\\W+"), QString::SkipEmptyParts); for (QString &w : words) _invertedIndex[w.toLower()].insert(item); } // Build qGramIndex for (InvertedIndex::const_iterator it = _invertedIndex.cbegin(); it != _invertedIndex.cend(); ++it) { //Split the word into lowercase qGrams QString spaced = QString(_q-1,' ').append(it.key().toLower()); for (unsigned int i = 0 ; i < static_cast<unsigned int>(it.key().size()); ++i) // Increment #occurences of this qGram in this word ++_qGramIndex[spaced.mid(i,_q)][it.key()]; } }
// Scores one inverted-list entry as 1 + log(f_dt), where f_dt is the
// stored within-document term frequency (the .second of the list pair).
float IDF_simple::operator() (const InvertedIndex &index, uint termId, uint listId, uint /*docId*/) const {
    const float withinDocFreq = index.invertedList()[termId][listId].second;
    return 1 + std::log(withinDocFreq);
}
// Scores a term as log(1 + N / f_t): N = total number of documents,
// f_t = number of documents containing the term (from index.ft()).
float TF_simple::operator() (const InvertedIndex &index, uint termId) const {
    const float docFreq = static_cast<float>(index.ft()[termId]);
    return std::log(1 + index.numOfDocuments() / docFreq);
}
// Scans the character range [it, endit), assembling tokens of up to six
// words. Delimiters terminate a token; stop words are stripped from its
// front; the longest vocabulary match (or, failing that, the first word) is
// inserted into the inverted index for document_id.
void Tokenizer::execute(wchar const * it , wchar const * endit , InvertedIndex & inverted_index , unsigned document_id)
{
    Token token;
    // Cached end() iterators: comparing find() against these answers
    // "is *it a letter?" / "is *it a delimiter?".
    auto not_character = character_.end();
    auto not_delimiter = delimiter_.end();
    while (it != endit) {
        // Read a sequence of 6 words
        size_t counter = 6 - token.size();
        bool separated = false;
        for (; counter > 0; --counter) {
            // Eliminate all delimiters
            while (it != endit && character_.find(*it) == not_character) {
                if (delimiter_.find(*it) != not_delimiter) {
                    // If it is a delimiter, eliminate all of 'em
                    do {
                        ++it;
                    } while (it != endit && delimiter_.find(*it) != not_delimiter);
                    // If it is a delimiter, it must not read more words
                    // => There is no delimiter can present in the middle of a token
                    separated = true;
                    break;
                }
                ++it;
            }
            if (it == endit) {
                // End of file
                separated = true;
                break;
            }
            if (separated)
                // Delimiters found => must not read more words
                if (!token.empty())
                    break;
                else {
                    // Delimiters found => Token is still empty => continue
                    // (++counter cancels the for-loop's --counter).
                    separated = false;
                    ++counter;
                    continue;
                }
            wchar const * begin = it;
            do {
                // Read characters of a word in the token
                ++it;
            } while (it != endit && character_.find(*it) != not_character);
            wstring origin = wstring(begin , it - begin);
            size_t length = origin.length();
            // Lowercase the word (character_ maps each char to its lowercase form)
            for (size_t i = 0; i < length; ++i)
                origin[i] = character_.at(origin[i]);
            token.push_back(origin);
        }
        do {
            // NOTE(review): this declaration shadows the outer `counter`.
            size_t counter;
            while (true) {
                // Eliminate stop words from the front of the token
                counter = stopword_.max_match(token);
                if (counter == 0)
                    break;
                else {
                    do {
                        token.pop_front();
                        --counter;
                    } while (counter > 0);
                    if (token.empty())
                        break;
                }
            }
            if (token.empty())
                break;
            counter = vocabulary.max_match(token);
            if (counter > 0) {
                // If there is a token in vocabulary, insert it
                inverted_index.insert(token.cbegin() , token.cbegin() + counter , document_id);
                //token = Token(token.cbegin() + counter , token.cend());
                do {
                    token.pop_front();
                    --counter;
                } while (counter > 0);
            } else {
                // Otherwise, only insert the first word
                inverted_index.insert(token.cbegin() , token.cbegin() + 1 , document_id);
                //token = Token(token.cbegin() + 1 , token.cend());
                token.pop_front();
            }
        } while (separated);   // keep draining the token if a delimiter closed it
    }
}
// ___________________________________________________________________________ TEST(ApproximateMatching, init) { ii.buildFromCsvFile(mockupFileName); approximateMatching.init(ii, 5, '+'); EXPECT_EQ('+', approximateMatching.dummyChar()); EXPECT_EQ(5, approximateMatching.k()); }
void testInsert() { InvertedIndex index; const Term term1("foo"); index.insert(term1, 0); CPPUNIT_ASSERT(index.getPostings(term1).size() == 1); CPPUNIT_ASSERT(isContain(index.getPostings(term1), 0)); index.insert(term1, 1); CPPUNIT_ASSERT(index.getPostings(term1).size() == 2); CPPUNIT_ASSERT(isContain(index.getPostings(term1), 0)); CPPUNIT_ASSERT(isContain(index.getPostings(term1), 1)); CPPUNIT_ASSERT(!isContain(index.getPostings(term1), 2)); const Term term2("bar"); index.insert(term2, 0); CPPUNIT_ASSERT(index.getPostings(term2).size() == 1); CPPUNIT_ASSERT(isContain(index.getPostings(term2), 0)); index.insert(term2, 1); CPPUNIT_ASSERT(index.getPostings(term2).size() == 2); CPPUNIT_ASSERT(isContain(index.getPostings(term2), 0)); CPPUNIT_ASSERT(isContain(index.getPostings(term2), 1)); CPPUNIT_ASSERT(!isContain(index.getPostings(term2), 2)); }
// Builds a tiny three-record index and checks the three core structures:
// the trie (keyword-id ordering, prefix min/max ids, post-bulk-load records
// not yet merged), the forward index (word ranges per record) and the
// inverted index (posting-list sizes).
void testIndexData() {
    /// Create Schema
    Schema *schema = Schema::create(srch2::instantsearch::DefaultIndex);
    schema->setPrimaryKey("article_id"); // integer, not searchable
    schema->setSearchableAttribute("article_id"); // convert id to searchable text
    schema->setSearchableAttribute("article_authors", 2); // searchable text
    schema->setSearchableAttribute("article_title", 7); // searchable text

    /// Create Analyzer
    SynonymContainer *syn = SynonymContainer::getInstance("", SYNONYM_DONOT_KEEP_ORIGIN);
    syn->init();
    Analyzer *analyzer = new Analyzer(NULL, NULL, NULL, syn, "");

    /// Create IndexData
    string INDEX_DIR = ".";
    IndexData *indexData = IndexData::create(INDEX_DIR, analyzer, schema, srch2::instantsearch::DISABLE_STEMMER_NORMALIZER);

    // Record 1001: added before bulk load finishes.
    Record *record = new Record(schema);
    record->setPrimaryKey(1001);
    record->setSearchableAttributeValue("article_authors", "Tom Smith and Jack Lennon");
    record->setSearchableAttributeValue("article_title", "come Yesterday Once More");
    record->setRecordBoost(10);
    indexData->_addRecord(record, analyzer);

    // Record 1008: also part of the bulk load.
    record->clear();
    record->setPrimaryKey(1008);
    record->setSearchableAttributeValue(0, "Jimi Hendrix");
    record->setSearchableAttributeValue(1, "Little wing");
    record->setRecordBoost(90);
    indexData->_addRecord(record, analyzer);

    indexData->finishBulkLoad();
    //index->print_Index();

    // Record 1007: added AFTER bulk load — its terms should not be merged
    // into the trie yet (checked below via the NULL lookups).
    record->clear();
    record->setPrimaryKey(1007);
    record->setSearchableAttributeValue(0, "Jimaai Hendaarix");
    record->setSearchableAttributeValue(1, "Littaale waaing");
    record->setRecordBoost(90);
    indexData->_addRecord(record, analyzer);
    //index->print_Index();

    /// test Trie
    Trie_Internal *trie = indexData->trie;
    typedef boost::shared_ptr<TrieRootNodeAndFreeList > TrieRootNodeSharedPtr;
    TrieRootNodeSharedPtr rootSharedPtr;
    trie->getTrieRootNode_ReadView(rootSharedPtr);
    TrieNode *root = rootSharedPtr->root;
    (void)(root);

    // Keyword ids must follow the lexicographic order of the indexed terms.
    ASSERT( trie->getTrieNodeFromUtf8String( root, "and")->getId() < trie->getTrieNodeFromUtf8String( root, "come")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "come")->getId() < trie->getTrieNodeFromUtf8String( root, "hendrix")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "hendrix")->getId() < trie->getTrieNodeFromUtf8String( root, "jack")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "jack")->getId() < trie->getTrieNodeFromUtf8String( root, "jimi")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "jimi")->getId() < trie->getTrieNodeFromUtf8String( root, "lennon")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "lennon")->getId() < trie->getTrieNodeFromUtf8String( root, "little")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "little")->getId() < trie->getTrieNodeFromUtf8String( root, "more")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "more")->getId() < trie->getTrieNodeFromUtf8String( root, "once")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "once")->getId() < trie->getTrieNodeFromUtf8String( root, "smith")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "smith")->getId() < trie->getTrieNodeFromUtf8String( root, "tom")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "tom")->getId() < trie->getTrieNodeFromUtf8String( root, "wing")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "wing")->getId() < trie->getTrieNodeFromUtf8String( root, "yesterday")->getId() );

    // we assume that there is no background thread does merge,
    // or even if there is such a background thread, it didn't have a chance to do the merge
    // => record 1007's terms are absent from the trie.
    ASSERT( trie->getTrieNodeFromUtf8String( root, "jimaai") == NULL );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "Hendaarix") == NULL );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "Littaale") == NULL );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "waaing") == NULL );

    // Prefix nodes carry the min/max keyword ids of their subtree.
    ASSERT( trie->getTrieNodeFromUtf8String( root, "j")->getMinId() == trie->getTrieNodeFromUtf8String( root, "jack")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "j")->getMaxId() == trie->getTrieNodeFromUtf8String( root, "jimi")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "ja")->getMinId() == trie->getTrieNodeFromUtf8String( root, "jack")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "ja")->getMaxId() == trie->getTrieNodeFromUtf8String( root, "jack")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "win")->getMinId() == trie->getTrieNodeFromUtf8String( root, "wing")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "win")->getMaxId() == trie->getTrieNodeFromUtf8String( root, "wing")->getId() );

    /// test ForwardIndex
    ForwardIndex *forwardIndex = indexData->forwardIndex;
    shared_ptr<vectorview<ForwardListPtr> > forwardListDirectoryReadView;
    forwardIndex->getForwardListDirectory_ReadView(forwardListDirectoryReadView);
    float score = 0;
    unsigned keywordId = 1;
    // define the attributeBitmap only in debug mode
#if ASSERT_LEVEL > 0
    vector<unsigned> attributeBitmap;
#endif
    // Record 0 spans [jack, lennon] but not the empty gap (smith+1, tom-1).
    ASSERT( forwardIndex->haveWordInRange(forwardListDirectoryReadView, 0, trie->getTrieNodeFromUtf8String( root, "jack")->getId(), trie->getTrieNodeFromUtf8String( root, "lennon")->getId(), vector<unsigned>(), ATTRIBUTES_OP_AND, keywordId, attributeBitmap, score) == true );
    ASSERT( forwardIndex->haveWordInRange(forwardListDirectoryReadView, 0, trie->getTrieNodeFromUtf8String( root, "smith")->getId() + 1, trie->getTrieNodeFromUtf8String( root, "tom")->getId() - 1, vector<unsigned>(), ATTRIBUTES_OP_AND, keywordId, attributeBitmap, score) == false );
    // Record 1 spans [hendrix, jimi] but nothing beyond wing.
    ASSERT( forwardIndex->haveWordInRange(forwardListDirectoryReadView, 1, trie->getTrieNodeFromUtf8String( root, "hendrix")->getId(), trie->getTrieNodeFromUtf8String( root, "jimi")->getId(), vector<unsigned>(), ATTRIBUTES_OP_AND, keywordId, attributeBitmap, score) == true );
    ASSERT( forwardIndex->haveWordInRange(forwardListDirectoryReadView, 1, trie->getTrieNodeFromUtf8String( root, "wing")->getId() + 1, trie->getTrieNodeFromUtf8String( root, "wing")->getId() + 2, vector<unsigned>(), ATTRIBUTES_OP_AND, keywordId, attributeBitmap, score) == false );

    /// test InvertedIndex
    InvertedIndex *invertedIndex = indexData->invertedIndex;
    (void)(forwardIndex);
    (void)(invertedIndex);
    (void)score;
    (void)keywordId;
    // Each bulk-loaded term occurs in exactly one record.
    ASSERT(invertedIndex->getInvertedListSize_ReadView( trie->getTrieNodeFromUtf8String( root, "and")->getInvertedListOffset() ) == 1);
    ASSERT(invertedIndex->getInvertedListSize_ReadView( trie->getTrieNodeFromUtf8String( root, "come")->getInvertedListOffset() ) == 1);
    ASSERT(invertedIndex->getInvertedListSize_ReadView( trie->getTrieNodeFromUtf8String( root, "hendrix")->getInvertedListOffset() ) == 1);
    ASSERT(invertedIndex->getInvertedListSize_ReadView( trie->getTrieNodeFromUtf8String( root, "jack")->getInvertedListOffset() ) == 1);
    ASSERT(invertedIndex->getInvertedListSize_ReadView( trie->getTrieNodeFromUtf8String( root, "jimi")->getInvertedListOffset() ) == 1);
    ASSERT(invertedIndex->getInvertedListSize_ReadView( trie->getTrieNodeFromUtf8String( root, "lennon")->getInvertedListOffset() ) == 1);
    ASSERT(invertedIndex->getInvertedListSize_ReadView( trie->getTrieNodeFromUtf8String( root, "little")->getInvertedListOffset() ) == 1);
    ASSERT(invertedIndex->getInvertedListSize_ReadView( trie->getTrieNodeFromUtf8String( root, "more")->getInvertedListOffset() ) == 1);
    ASSERT(invertedIndex->getInvertedListSize_ReadView( trie->getTrieNodeFromUtf8String( root, "once")->getInvertedListOffset() ) == 1);
    ASSERT(invertedIndex->getInvertedListSize_ReadView( trie->getTrieNodeFromUtf8String( root, "smith")->getInvertedListOffset() ) == 1);
    ASSERT(invertedIndex->getInvertedListSize_ReadView( trie->getTrieNodeFromUtf8String( root, "tom")->getInvertedListOffset() ) == 1);
    ASSERT(invertedIndex->getInvertedListSize_ReadView( trie->getTrieNodeFromUtf8String( root, "wing")->getInvertedListOffset() ) == 1);

    // Cleanup.
    delete schema;
    delete record;
    delete analyzer;
    delete indexData;
    syn->free();
}
bool load(const char *file) { FILE *fp = std::fopen(file, "rb"); if (fp == 0) { return false; } m_centroids.clear(); m_centroid_labels.clear(); m_inverted_index.clear(); size_t centroid_num = 0; size_t ret = std::fread(¢roid_num, sizeof(centroid_num), 1, fp); if (ret != 1) { std::fprintf(stderr, "%s: invalid format 1\n", file); fclose(fp); return false; } for (size_t i = 0; i < centroid_num; ++i) { fv_t centroid; size_t word_num = 0; ret = fread(&word_num, sizeof(word_num), 1, fp); if (ret != 1) { std::fprintf(stderr, "%s: invalid format 2\n", file); fclose(fp); return false; } for (size_t j = 0; j < word_num; ++j) { int word_id; float word_weight; ret = std::fread(&word_id, sizeof(word_id), 1, fp); if (ret != 1) { std::fprintf(stderr, "%s: invalid format 3\n", file); fclose(fp); return false; } ret = std::fread(&word_weight, sizeof(word_weight), 1, fp); if (ret != 1) { std::fprintf(stderr, "%s: invalid format 4\n", file); fclose(fp); return false; } centroid.insert(std::make_pair(word_id, word_weight)); } m_centroids.push_back(centroid); } ret = std::fread(¢roid_num, sizeof(centroid_num), 1, fp); if (ret != 1) { std::fprintf(stderr, "%s: invalid format 5\n", file); fclose(fp); return false; } int *buffer = new int[centroid_num]; ret = std::fread(buffer, sizeof(int), centroid_num, fp); if (ret != centroid_num) { std::fprintf(stderr, "%s: invalid format 6\n", file); delete buffer; fclose(fp); return false; } std::copy(buffer, buffer + centroid_num, std::back_inserter(m_centroid_labels)); delete buffer; fclose(fp); m_inverted_index.build(&m_centroids); return true; }