Example #1
0
void WordMatchSearchImpl::query(const QString &req, QVector<Service::Item *> *res) const
{
	QSet<Service::Item*>* resSet = nullptr;
	QStringList words = req.split(' ', QString::SkipEmptyParts);

	// Quit if there are no words in query
	if (words.empty())
		return;

	for (QString &w : words)
	{
		InvertedIndex::const_iterator lb, ub;
		lb =  std::lower_bound (_invertedIndex.cbegin(), _invertedIndex.cend(), w, CaseInsensitiveCompare());
		ub =  std::upper_bound (_invertedIndex.cbegin(), _invertedIndex.cend(), w, CaseInsensitiveComparePrefix());
		QSet<Service::Item*> tmpSet;
		while (lb!=ub)
			tmpSet.unite(lb++->second);
		if (resSet == nullptr)
			resSet = new QSet<Service::Item*>(tmpSet);
		else
			resSet->intersect(tmpSet);
	}
	if (resSet != nullptr) {
		for (Service::Item *s : *resSet)
			res->append(s);
		delete resSet;
	}
}
// ____________________________________________________________________________
// Builds an index from ./simple.csv and checks the number of hits returned
// for a two-keyword query and for a query repeating the same keyword.
TEST(SearchMainTest, Main) {
    InvertedIndex ii;
    ii.buildFromCsvFile("./simple.csv");
    QueryProcessor qp(ii);

    // Two different keywords.
    string distinctWords = "nice website";
    vector<string> distinctHits = qp.answerQuery(distinctWords, 3);
    ASSERT_EQ(1, distinctHits.size());

    // The same keyword twice.
    string repeatedWord = "nice nice";
    vector<string> repeatedHits = qp.answerQuery(repeatedWord, 3);
    ASSERT_EQ(2, repeatedHits.size());
}
Example #3
0
void WordMatchSearchImpl::prepare()
{
	// Build inverted index
	typedef QMap<QString, QSet<Service::Item*>> InvertedIndexMap;
	InvertedIndexMap invertedIndexMap;
	for (Service::Item *i : _indexRef)
	{
		QStringList words = i->title().split(QRegExp("\\W+"), QString::SkipEmptyParts);
		for (QString &w : words)
			invertedIndexMap[w].insert(i);
	}

	// Convert back to vector for fast random access search algorithms
	for (InvertedIndexMap::const_iterator i = invertedIndexMap.cbegin(); i != invertedIndexMap.cend(); ++i)
		_invertedIndex.push_back(QPair<QString, QSet<Service::Item*>>(i.key(), i.value()));
	std::sort(_invertedIndex.begin(), _invertedIndex.end(), CaseInsensitiveCompare());
	_invertedIndex.squeeze();
}
Example #4
0
// Read every word in this file.
// Insert a pair into the hash table (word,fname)
//
// A word is a maximal run of ASCII letters, lowercased. Runs that reach
// MAX_STRING_LEN characters are split at that length. If the file cannot
// be opened, nothing is added to the index.
void processFile (const char *fname, InvertedIndex& inverted_index) {

	ifstream inputfile;  // ifstream for reading from input file.
	inputfile.open (fname);
	string fnames(fname); // file name as a string object, not as a char * (c-style string).

	// Tokenize the input one character at a time.
	// Any character outside a-z / A-Z terminates the current word.
	char c;
	char curr_str[MAX_STRING_LEN];
	int str_i = 0;  // Index into curr_str (= length of the current word).

	// Loop on the result of get(): testing good() before reading would
	// process the stale previous character once more after the read that
	// hits end-of-file (and an uninitialised one if the open failed).
	while (inputfile.get(c)) {
		// tolower() requires a value representable as unsigned char (or EOF);
		// a plain char may be negative for non-ASCII bytes.
		c = static_cast<char>(tolower(static_cast<unsigned char>(c)));

		bool flush_it = false;  // Whether we have a complete string to flush.
		if (c >= 'a' && c <= 'z') {
			// c is a letter.
			curr_str[str_i] = c;
			str_i++;

			// Buffer is full: flush now so the next letter cannot overflow it.
			flush_it = (str_i >= MAX_STRING_LEN);
		} else {
			// c is not a letter: it ends the current word, if there is one.
			flush_it = (str_i > 0);
		}

		if (flush_it) {
			// Insert the string-file_name tuple into the inverted index.
			string the_str(curr_str, str_i);
			inverted_index.add(the_str, fnames);

			// Start collecting the next word.
			str_i = 0;
		}
	}

	// A word that runs up to end-of-file has no terminating non-letter;
	// flush it here so the last word of the file is not lost.
	if (str_i > 0) {
		string the_str(curr_str, str_i);
		inverted_index.add(the_str, fnames);
	}
}
Example #5
0
//#include "runlength.cpp"
int main (){
	//for (int j = 10000 ; j < 2000000;j+=10000) {
		int j = 0;
		float factor = 0.0;
		//for (float factor = 0.1;factor<3;factor+=0.2) {
			cout << "executing with factor = " << factor << endl;
			InvertedIndex *ii = new InvertedIndex("dataset/gov500.invlist","dataset/gov500.invlistfreq","dataset/gov500.words",500000,j,0);
			ii->buildTreap();
			size_t f_size = ii->estimateFreqSorted();
			vector<size_t> t = ii->estimateDocSorted();
			for (uint i = 0 ; i < t.size();i++) {
				cout << "Estimated doc_id " << t[i] << endl;
			}
			cout << "Estimated Frequency = " << f_size << endl;
			delete ii;
		//}
	//}
	//testRunLength();

}
Example #6
0
    void testOrder()
    {
        InvertedIndex index;

        const Term term1("foo");

        index.insert(term1, 0);
        index.insert(term1, 2);
        index.insert(term1, 1);
        index.insert(term1, 5);
        index.insert(term1, 4);
        index.insert(term1, 3);
        index.insert(term1, 6);

        const Postings &postings = index.getPostings(term1);

        CPPUNIT_ASSERT(postings.size() == 7);
        Postings::const_iterator iter = postings.begin();
        DocId prevId = *iter;
        ++iter;
        for(; iter != postings.end(); ++iter)
        {
            CPPUNIT_ASSERT(prevId < *iter);
            prevId = *iter;
        }
        CPPUNIT_ASSERT(prevId == 6);
    }
	inline void
	predict(std::vector<int> &results,
			size_t k,
			const fv_t &query) const
	{
		InvertedIndex::result_t knn;
		
		m_inverted_index.knn(knn, k, query);
		results.clear();
		for (auto i = knn.begin(); i != knn.end(); ++i) {
			results.push_back(m_centroid_labels[i->id]);
		}
	}
	void
	train(const category_index_t &category_index,
		  const std::vector<fv_t> &data)
	{
		for (auto l = category_index.begin(); l != category_index.end(); ++l) {
			fv_t centroid;
			vector_sum(centroid, l->second, data);
			vector_normalize_l2(centroid);
			m_centroids.push_back(centroid);
			m_centroid_labels.push_back(l->first);
		}
		m_inverted_index.build(&m_centroids);
	}
Example #9
0
void FuzzySearchImpl::buildIndex()
{
	_invertedIndex.clear();
	_qGramIndex.clear();

	// Build inverted index
	for (Service::Item *item : _indexRef) {
		QStringList words = item->title().split(QRegExp("\\W+"), QString::SkipEmptyParts);
		for (QString &w : words)
			_invertedIndex[w.toLower()].insert(item);
	}

	// Build qGramIndex
	for (InvertedIndex::const_iterator it = _invertedIndex.cbegin(); it != _invertedIndex.cend(); ++it)
	{
		//Split the word into lowercase qGrams
		QString spaced = QString(_q-1,' ').append(it.key().toLower());
		for (unsigned int i = 0 ; i < static_cast<unsigned int>(it.key().size()); ++i)
			// Increment #occurences of this qGram in this word
			++_qGramIndex[spaced.mid(i,_q)][it.key()];
	}
}
Example #10
0
// Returns 1 + ln(v), where v is the `second` member of the posting at
// invertedList()[termId][listId]. The document id argument is unused.
// NOTE(review): this has the shape of a log-scaled term frequency rather
// than an inverse document frequency — confirm the class naming.
float IDF_simple::operator() (const InvertedIndex &index, uint termId, uint listId, uint /*docId*/) const
{
    const float postingValue = index.invertedList()[termId][listId].second;
    return std::log(postingValue) + 1;
}
Example #11
0
// Returns ln(1 + N / ft), where N is index.numOfDocuments() and ft is
// index.ft()[termId]. No guard is applied for ft == 0.
// NOTE(review): this has the shape of an inverse document frequency rather
// than a term frequency — confirm the class naming.
float TF_simple::operator() (const InvertedIndex &index, uint termId) const
{
    const uint termCount = index.ft()[termId];
    // Force floating-point division
    const auto docsPerTerm = index.numOfDocuments() / static_cast<float>(termCount);
    return std::log(1 + docsPerTerm);
}
Example #12
0
//	Tokenizes the text in [it, endit) and inserts the resulting terms into
//	inverted_index under document_id.
//
//	The text is consumed in windows of up to 6 words held in `token`. Within
//	a window, leading stop words (as reported by stopword_.max_match) are
//	dropped; then either the longest leading vocabulary entry (as reported
//	by vocabulary.max_match) or, failing that, the single first word is
//	inserted and removed from the window.
//
//	A character counts as part of a word if it is a key of character_; a
//	character found in delimiter_ ends the current window, so an inserted
//	multi-word term never spans a delimiter.
void Tokenizer::execute(wchar const * it , wchar const * endit , InvertedIndex & inverted_index , unsigned document_id)
{
	Token token;

	//	Cached end() sentinels for the "not found" comparisons below
	auto not_character = character_.end();
	auto not_delimiter = delimiter_.end();

	while (it != endit)
	{	//	Refill the window: read words until the token holds 6 of them
		size_t counter = 6 - token.size();
		//	Set when a delimiter or the end of input stops the refill early
		bool separated = false;
		for (; counter > 0; --counter)
		{	//	Skip everything that is not a word character
			while (it != endit && character_.find(*it) == not_character)
			{
				if (delimiter_.find(*it) != not_delimiter)
				{	//	A delimiter: skip the whole run of consecutive delimiters
					do
					{
						++it;
					} while (it != endit && delimiter_.find(*it) != not_delimiter);
					//	A delimiter ends the window: no more words may be read
					//	into this token, so a delimiter never sits inside a token
					separated = true;
					break;
				}
				//	Neither a word character nor a delimiter: skip it
				++it;
			}

			if (it == endit)
			{	//	End of input: treat it like a delimiter so the token is drained
				separated = true;
				break;
			}

			//	NOTE: the `else` below binds to the inner `if (!token.empty())`
			if (separated)
				if (!token.empty())
					//	Delimiter after at least one word => stop refilling
					break;
				else
				{	//	Delimiter before any word => nothing to separate yet.
					//	++counter cancels the loop's --counter, so this pass
					//	does not use up one of the 6 word slots
					separated = false;
					++counter;
					continue;
				}

			//	*it is a word character here: scan to the end of the word
			wchar const * begin = it;
			do
			{	//	Advance over the characters of the word
				++it;
			} while (it != endit && character_.find(*it) != not_character);

			//	Copy the word (pointer + length constructor)
			wstring origin = wstring(begin , it - begin);
			size_t length = origin.length();
			//	Map every character through character_ (lowercasing, per the
			//	original author's comment)
			for (size_t i = 0; i < length; ++i)
				origin[i] = character_.at(origin[i]);
			token.push_back(origin);
		}

		//	Emit terms from the front of the window. If the window was cut
		//	short (separated), keep going until the token is empty; otherwise
		//	emit once and go back to refill the window.
		do
		{
			size_t counter;
			while (true)
			{	//	Drop leading stop words; max_match returns how many leading
				//	words to remove, 0 meaning none
				counter = stopword_.max_match(token);
				if (counter == 0)
					break;
				else
				{
					do
					{
						token.pop_front();
						--counter;
					} while (counter > 0);
					if (token.empty())
						break;
				}
			}
			//	Nothing left after removing stop words
			if (token.empty())
				break;
			counter = vocabulary.max_match(token);
			if (counter > 0)
			{	//	The first `counter` words form a vocabulary entry: insert
				//	them as one term and remove them from the window
				inverted_index.insert(token.cbegin() , token.cbegin() + counter , document_id);
				//token = Token(token.cbegin() + counter , token.cend());
				do
				{
					token.pop_front();
					--counter;
				} while (counter > 0);
			}
			else
			{	//	No vocabulary match: insert only the first word
				inverted_index.insert(token.cbegin() , token.cbegin() + 1 , document_id);
				//token = Token(token.cbegin() + 1 , token.cend());
				token.pop_front();
			}
		} while (separated);
	}
}
// ___________________________________________________________________________
// init() must store the dummy character and the value of k it is given.
TEST(ApproximateMatching, init) {
  const int expectedK = 5;
  const char expectedDummyChar = '+';

  ii.buildFromCsvFile(mockupFileName);
  approximateMatching.init(ii, expectedK, expectedDummyChar);

  EXPECT_EQ(expectedDummyChar, approximateMatching.dummyChar());
  EXPECT_EQ(expectedK, approximateMatching.k());
}
Example #14
0
    void testInsert()
    {
        InvertedIndex index;

        const Term term1("foo");
        index.insert(term1, 0);
        CPPUNIT_ASSERT(index.getPostings(term1).size() == 1);
        CPPUNIT_ASSERT(isContain(index.getPostings(term1), 0));

        index.insert(term1, 1);
        CPPUNIT_ASSERT(index.getPostings(term1).size() == 2);
        CPPUNIT_ASSERT(isContain(index.getPostings(term1), 0));
        CPPUNIT_ASSERT(isContain(index.getPostings(term1), 1));
        
        CPPUNIT_ASSERT(!isContain(index.getPostings(term1), 2));

        const Term term2("bar");
        index.insert(term2, 0);
        CPPUNIT_ASSERT(index.getPostings(term2).size() == 1);
        CPPUNIT_ASSERT(isContain(index.getPostings(term2), 0));

        index.insert(term2, 1);
        CPPUNIT_ASSERT(index.getPostings(term2).size() == 2);
        CPPUNIT_ASSERT(isContain(index.getPostings(term2), 0));
        CPPUNIT_ASSERT(isContain(index.getPostings(term2), 1));
        
        CPPUNIT_ASSERT(!isContain(index.getPostings(term2), 2));
    }
// End-to-end check of IndexData: creates a schema and analyzer, bulk-loads
// two records, adds a third record after finishBulkLoad(), and then asserts
// on the state of the trie, the forward index and the inverted index.
void testIndexData()
{
    /// Create Schema
    Schema *schema = Schema::create(srch2::instantsearch::DefaultIndex);
    schema->setPrimaryKey("article_id"); // integer, not searchable
    schema->setSearchableAttribute("article_id"); // convert id to searchable text
    // NOTE(review): the numeric second arguments are presumably attribute boosts — confirm
    schema->setSearchableAttribute("article_authors", 2); // searchable text
    schema->setSearchableAttribute("article_title", 7); // searchable text

    /// Create Analyzer
    SynonymContainer *syn = SynonymContainer::getInstance("", SYNONYM_DONOT_KEEP_ORIGIN);
    syn->init();

    Analyzer *analyzer = new Analyzer(NULL, NULL, NULL, syn, "");

    /// Create IndexData
    string INDEX_DIR = ".";
    IndexData *indexData = IndexData::create(INDEX_DIR,
                                            analyzer,
                                            schema,
                                            srch2::instantsearch::DISABLE_STEMMER_NORMALIZER);

    // A single Record object is reused (via clear()) for all three records
    Record *record = new Record(schema);

    // Record 1001: added before finishBulkLoad()
    record->setPrimaryKey(1001);
    record->setSearchableAttributeValue("article_authors", "Tom Smith and Jack Lennon");
    record->setSearchableAttributeValue("article_title", "come Yesterday Once More");
    record->setRecordBoost(10);
    indexData->_addRecord(record, analyzer);

    // Record 1008: added before finishBulkLoad(); attributes set by index instead of by name
    record->clear();
    record->setPrimaryKey(1008);
    record->setSearchableAttributeValue(0, "Jimi Hendrix");
    record->setSearchableAttributeValue(1, "Little wing");
    record->setRecordBoost(90);
    indexData->_addRecord(record, analyzer);

    indexData->finishBulkLoad();
    //index->print_Index();

    // Record 1007: added AFTER finishBulkLoad(); the assertions below expect
    // its keywords to be absent from the trie read view
    record->clear();
    record->setPrimaryKey(1007);
    record->setSearchableAttributeValue(0, "Jimaai Hendaarix");
    record->setSearchableAttributeValue(1, "Littaale waaing");
    record->setRecordBoost(90);
    indexData->_addRecord(record, analyzer);

    //index->print_Index();

    /// test Trie
    Trie_Internal *trie = indexData->trie;

    typedef boost::shared_ptr<TrieRootNodeAndFreeList > TrieRootNodeSharedPtr;
    TrieRootNodeSharedPtr rootSharedPtr;
    trie->getTrieRootNode_ReadView(rootSharedPtr);
    TrieNode *root = rootSharedPtr->root;

    // Presumably silences an unused-variable warning in builds where ASSERT expands to nothing — confirm
    (void)(root);

    // Keyword ids of the bulk-loaded records must increase in alphabetical keyword order
    ASSERT( trie->getTrieNodeFromUtf8String( root, "and")->getId() < trie->getTrieNodeFromUtf8String( root, "come")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "come")->getId() < trie->getTrieNodeFromUtf8String( root, "hendrix")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "hendrix")->getId() < trie->getTrieNodeFromUtf8String( root, "jack")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "jack")->getId() < trie->getTrieNodeFromUtf8String( root, "jimi")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "jimi")->getId() < trie->getTrieNodeFromUtf8String( root, "lennon")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "lennon")->getId() < trie->getTrieNodeFromUtf8String( root, "little")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "little")->getId() < trie->getTrieNodeFromUtf8String( root, "more")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "more")->getId() < trie->getTrieNodeFromUtf8String( root, "once")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "once")->getId() < trie->getTrieNodeFromUtf8String( root, "smith")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "smith")->getId() < trie->getTrieNodeFromUtf8String( root, "tom")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "tom")->getId() < trie->getTrieNodeFromUtf8String( root, "wing")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "wing")->getId() < trie->getTrieNodeFromUtf8String( root, "yesterday")->getId() );

    // We assume that no background thread performs a merge, or that, if one
    // exists, it has not had a chance to run yet: the keywords of the record
    // added after the bulk load must not be visible in this read view
    ASSERT( trie->getTrieNodeFromUtf8String( root, "jimaai") == NULL );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "Hendaarix") == NULL );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "Littaale") == NULL );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "waaing") == NULL );

    // A prefix node's [minId, maxId] must span exactly the keywords below it
    ASSERT( trie->getTrieNodeFromUtf8String( root, "j")->getMinId() == trie->getTrieNodeFromUtf8String( root, "jack")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "j")->getMaxId() == trie->getTrieNodeFromUtf8String( root, "jimi")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "ja")->getMinId() == trie->getTrieNodeFromUtf8String( root, "jack")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "ja")->getMaxId() == trie->getTrieNodeFromUtf8String( root, "jack")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "win")->getMinId() == trie->getTrieNodeFromUtf8String( root, "wing")->getId() );
    ASSERT( trie->getTrieNodeFromUtf8String( root, "win")->getMaxId() == trie->getTrieNodeFromUtf8String( root, "wing")->getId() );

    /// test ForwardIndex
    ForwardIndex *forwardIndex = indexData->forwardIndex;
    shared_ptr<vectorview<ForwardListPtr> > forwardListDirectoryReadView;
    forwardIndex->getForwardListDirectory_ReadView(forwardListDirectoryReadView);
    float score = 0;
    unsigned keywordId = 1;
    // define the attributeBitmap only in debug mode
    // NOTE(review): the ASSERTs below reference attributeBitmap, so they
    // presumably expand to nothing when ASSERT_LEVEL is 0 — confirm
#if ASSERT_LEVEL > 0
    vector<unsigned> attributeBitmap;
#endif
    // haveWordInRange(view, recordIndex, minId, maxId, ...): the second
    // argument (0 or 1) appears to select the first or second loaded record — confirm
    ASSERT( forwardIndex->haveWordInRange(forwardListDirectoryReadView, 0,
    		trie->getTrieNodeFromUtf8String( root, "jack")->getId(),
    		trie->getTrieNodeFromUtf8String( root, "lennon")->getId(),
    		vector<unsigned>(), ATTRIBUTES_OP_AND,
    		keywordId, attributeBitmap, score) == true );
    // Open interval strictly between "smith" and "tom": expected to contain no keyword
    ASSERT( forwardIndex->haveWordInRange(forwardListDirectoryReadView, 0,
    		trie->getTrieNodeFromUtf8String( root, "smith")->getId() + 1,
    		trie->getTrieNodeFromUtf8String( root, "tom")->getId() - 1,
    		vector<unsigned>(), ATTRIBUTES_OP_AND,
    		keywordId, attributeBitmap, score) == false );
    ASSERT( forwardIndex->haveWordInRange(forwardListDirectoryReadView, 1,
    		trie->getTrieNodeFromUtf8String( root, "hendrix")->getId(),
    		trie->getTrieNodeFromUtf8String( root, "jimi")->getId(),
    		vector<unsigned>(), ATTRIBUTES_OP_AND,
    		keywordId, attributeBitmap, score) == true );
    // Range starting just past "wing": expected to contain no keyword
    ASSERT( forwardIndex->haveWordInRange(forwardListDirectoryReadView, 1,
    		trie->getTrieNodeFromUtf8String( root, "wing")->getId() + 1,
    		trie->getTrieNodeFromUtf8String( root, "wing")->getId() + 2,
    		vector<unsigned>(), ATTRIBUTES_OP_AND,
    		keywordId, attributeBitmap, score) == false );

    /// test InvertedIndex
    InvertedIndex *invertedIndex = indexData->invertedIndex;

    // Presumably silence unused-variable warnings in builds where ASSERT expands to nothing — confirm
    (void)(forwardIndex);
    (void)(invertedIndex);
    (void)score;
    (void)keywordId;

    // Every bulk-loaded keyword must have an inverted list of size 1
    ASSERT(invertedIndex->getInvertedListSize_ReadView( trie->getTrieNodeFromUtf8String( root, "and")->getInvertedListOffset() ) == 1);
    ASSERT(invertedIndex->getInvertedListSize_ReadView( trie->getTrieNodeFromUtf8String( root, "come")->getInvertedListOffset() ) == 1);
    ASSERT(invertedIndex->getInvertedListSize_ReadView( trie->getTrieNodeFromUtf8String( root, "hendrix")->getInvertedListOffset() ) == 1);
    ASSERT(invertedIndex->getInvertedListSize_ReadView( trie->getTrieNodeFromUtf8String( root, "jack")->getInvertedListOffset() ) == 1);
    ASSERT(invertedIndex->getInvertedListSize_ReadView( trie->getTrieNodeFromUtf8String( root, "jimi")->getInvertedListOffset() ) == 1);
    ASSERT(invertedIndex->getInvertedListSize_ReadView( trie->getTrieNodeFromUtf8String( root, "lennon")->getInvertedListOffset() ) == 1);
    ASSERT(invertedIndex->getInvertedListSize_ReadView( trie->getTrieNodeFromUtf8String( root, "little")->getInvertedListOffset() ) == 1);
    ASSERT(invertedIndex->getInvertedListSize_ReadView( trie->getTrieNodeFromUtf8String( root, "more")->getInvertedListOffset() ) == 1);
    ASSERT(invertedIndex->getInvertedListSize_ReadView( trie->getTrieNodeFromUtf8String( root, "once")->getInvertedListOffset() ) == 1);
    ASSERT(invertedIndex->getInvertedListSize_ReadView( trie->getTrieNodeFromUtf8String( root, "smith")->getInvertedListOffset() ) == 1);
    ASSERT(invertedIndex->getInvertedListSize_ReadView( trie->getTrieNodeFromUtf8String( root, "tom")->getInvertedListOffset() ) == 1);
    ASSERT(invertedIndex->getInvertedListSize_ReadView( trie->getTrieNodeFromUtf8String( root, "wing")->getInvertedListOffset() ) == 1);


    // Release everything this test allocated
    delete schema;
    delete record;
    delete analyzer;
    delete indexData;
    syn->free();
}
	/**
	 * Loads centroids and their labels from a binary file and rebuilds the
	 * inverted index over the loaded centroids.
	 *
	 * Layout as read here (raw in-memory representation of each value):
	 *   size_t centroid count
	 *   per centroid: size_t word count, then that many
	 *                 (int word_id, float word_weight) pairs
	 *   size_t label count, then that many int labels
	 *
	 * Existing centroids, labels and index are cleared before reading. If
	 * reading fails part-way, the data loaded so far is left in place and
	 * the index is not rebuilt.
	 *
	 * @param file  Path of the file to read.
	 * @return true on success; false if the file cannot be opened or ends
	 *         before all announced data was read (a message naming the
	 *         failing step is written to stderr in the latter case).
	 */
	bool
	load(const char *file)
	{
		FILE *fp = std::fopen(file, "rb");

		if (fp == 0) {
			return false;
		}
		m_centroids.clear();
		m_centroid_labels.clear();
		m_inverted_index.clear();

		size_t centroid_num = 0;
		size_t ret = std::fread(&centroid_num, sizeof(centroid_num), 1, fp);
		if (ret != 1) {
			std::fprintf(stderr, "%s: invalid format 1\n", file);
			std::fclose(fp);
			return false;
		}
		for (size_t i = 0; i < centroid_num; ++i) {
			fv_t centroid;
			size_t word_num = 0;
			ret = std::fread(&word_num, sizeof(word_num), 1, fp);
			if (ret != 1) {
				std::fprintf(stderr, "%s: invalid format 2\n", file);
				std::fclose(fp);
				return false;
			}
			for (size_t j = 0; j < word_num; ++j) {
				int word_id;
				float word_weight;
				ret = std::fread(&word_id, sizeof(word_id), 1, fp);
				if (ret != 1) {
					std::fprintf(stderr, "%s: invalid format 3\n", file);
					std::fclose(fp);
					return false;
				}
				ret = std::fread(&word_weight, sizeof(word_weight), 1, fp);
				if (ret != 1) {
					std::fprintf(stderr, "%s: invalid format 4\n", file);
					std::fclose(fp);
					return false;
				}
				centroid.insert(std::make_pair(word_id, word_weight));
			}
			m_centroids.push_back(centroid);
		}

		// The label section carries its own element count.
		size_t label_num = 0;
		ret = std::fread(&label_num, sizeof(label_num), 1, fp);
		if (ret != 1) {
			std::fprintf(stderr, "%s: invalid format 5\n", file);
			std::fclose(fp);
			return false;
		}

		// A vector owns the scratch buffer, so every exit path releases it
		// correctly (the array was previously freed with scalar `delete`,
		// which is undefined behaviour for memory obtained from `new[]`).
		std::vector<int> buffer(label_num);
		if (label_num > 0) {
			ret = std::fread(buffer.data(), sizeof(int), label_num, fp);
			if (ret != label_num) {
				std::fprintf(stderr, "%s: invalid format 6\n", file);
				std::fclose(fp);
				return false;
			}
		}
		std::copy(buffer.begin(), buffer.end(),
				  std::back_inserter(m_centroid_labels));

		std::fclose(fp);

		m_inverted_index.build(&m_centroids);

		return true;
	}