Example #1
0
    void testOrder()
    {
        InvertedIndex index;

        const Term term1("foo");

        index.insert(term1, 0);
        index.insert(term1, 2);
        index.insert(term1, 1);
        index.insert(term1, 5);
        index.insert(term1, 4);
        index.insert(term1, 3);
        index.insert(term1, 6);

        const Postings &postings = index.getPostings(term1);

        CPPUNIT_ASSERT(postings.size() == 7);
        Postings::const_iterator iter = postings.begin();
        DocId prevId = *iter;
        ++iter;
        for(; iter != postings.end(); ++iter)
        {
            CPPUNIT_ASSERT(prevId < *iter);
            prevId = *iter;
        }
        CPPUNIT_ASSERT(prevId == 6);
    }
Example #2
0
    void testInsert()
    {
        InvertedIndex index;

        const Term term1("foo");
        index.insert(term1, 0);
        CPPUNIT_ASSERT(index.getPostings(term1).size() == 1);
        CPPUNIT_ASSERT(isContain(index.getPostings(term1), 0));

        index.insert(term1, 1);
        CPPUNIT_ASSERT(index.getPostings(term1).size() == 2);
        CPPUNIT_ASSERT(isContain(index.getPostings(term1), 0));
        CPPUNIT_ASSERT(isContain(index.getPostings(term1), 1));
        
        CPPUNIT_ASSERT(!isContain(index.getPostings(term1), 2));

        const Term term2("bar");
        index.insert(term2, 0);
        CPPUNIT_ASSERT(index.getPostings(term2).size() == 1);
        CPPUNIT_ASSERT(isContain(index.getPostings(term2), 0));

        index.insert(term2, 1);
        CPPUNIT_ASSERT(index.getPostings(term2).size() == 2);
        CPPUNIT_ASSERT(isContain(index.getPostings(term2), 0));
        CPPUNIT_ASSERT(isContain(index.getPostings(term2), 1));
        
        CPPUNIT_ASSERT(!isContain(index.getPostings(term2), 2));
    }
Example #3
0
//	Tokenizes the text range [it, endit) and inserts the recognized tokens
//	into inverted_index under document_id.
//
//	How the members are used here (from this body alone):
//	  - character_ maps an input wchar to its lowercase form (see the
//	    lowercasing loop below); a wchar absent from character_ is not a
//	    word character.
//	  - delimiter_ holds hard separators: a run of delimiters ends the
//	    current token, so no token can span a delimiter.
//	  - stopword_.max_match / vocabulary.max_match return how many leading
//	    words of `token` match; 0 means no match.
//	NOTE(review): `vocabulary` lacks the trailing underscore the other
//	members carry — presumably the same kind of member; confirm naming.
void Tokenizer::execute(wchar const * it , wchar const * endit , InvertedIndex & inverted_index , unsigned document_id)
{
	//	Sliding window of the words read so far but not yet consumed.
	Token token;

	//	Cached end() sentinels for the membership tests in the hot loop.
	auto not_character = character_.end();
	auto not_delimiter = delimiter_.end();

	while (it != endit)
	{	//	Read a sequence of 6 words
		//	Top up `token` to 6 words.  NOTE(review): assumes
		//	token.size() <= 6 here (size_t underflow otherwise) — the
		//	consumer loop below always pops at least one word, so this
		//	appears to hold; verify.
		size_t counter = 6 - token.size();
		bool separated = false;
		for (; counter > 0; --counter)
		{	//	Eliminate all delimiters
			//	Skip every non-word character until a word starts.
			while (it != endit && character_.find(*it) == not_character)
			{
				if (delimiter_.find(*it) != not_delimiter)
				{	//	If it is a delimiter, eliminate all of 'em
					do
					{
						++it;
					} while (it != endit && delimiter_.find(*it) != not_delimiter);
					//	If it is a delimiter, it must not read more words
					//	=> There is no delimiter can present in the middle of a token
					separated = true;
					break;
				}
				++it;
			}

			if (it == endit)
			{	//	End of file
				separated = true;
				break;
			}

			//	Note: the `else` below binds to the inner `if`, as the
			//	indentation intends (dangling-else).
			if (separated)
				if (!token.empty())
					//	Delimiters found => must not read more words
					break;
				else
				{	//	Delimiters found => Token is still empty => continue
					//	Re-add the iteration spent on the delimiter run so
					//	we still collect a full 6 words.
					separated = false;
					++counter;
					continue;
				}

			//	A word starts at `begin`; consume its characters.
			wchar const * begin = it;
			do
			{	//	Read characters of a word in the token
				++it;
			} while (it != endit && character_.find(*it) != not_character);

			wstring origin = wstring(begin , it - begin);
			size_t length = origin.length();
			//	Lowercase the word
			for (size_t i = 0; i < length; ++i)
				origin[i] = character_.at(origin[i]);
			token.push_back(origin);
		}

		//	Consume the buffered words.  Runs once normally; when
		//	`separated` is set (delimiter or end of input hit) it loops
		//	until `token` is fully drained, since no further words may
		//	join the current token.
		do
		{
			//	Shadows the outer `counter` intentionally.
			size_t counter;
			while (true)
			{	//	Eliminate stop words
				counter = stopword_.max_match(token);
				if (counter == 0)
					break;
				else
				{
					//	Drop the matched stop-word prefix.
					do
					{
						token.pop_front();
						--counter;
					} while (counter > 0);
					if (token.empty())
						break;
				}
			}
			if (token.empty())
				break;
			counter = vocabulary.max_match(token);
			if (counter > 0)
			{	//	If there is a token in vocabulary, insert it
				inverted_index.insert(token.cbegin() , token.cbegin() + counter , document_id);
				//token = Token(token.cbegin() + counter , token.cend());
				//	Remove the inserted words from the window.
				do
				{
					token.pop_front();
					--counter;
				} while (counter > 0);
			}
			else
			{	//	Otherwise, only insert the first word
				inverted_index.insert(token.cbegin() , token.cbegin() + 1 , document_id);
				//token = Token(token.cbegin() + 1 , token.cend());
				token.pop_front();
			}
		} while (separated);
	}
}