/*
 * Exercises Lexicon's std::initializer_list support in three phases:
 *   1. construction from a braced list,
 *   2. in-place union with a braced list via operator +=,
 *   3. non-mutating union with an initializer_list via operator +, checking
 *      both that the left operand is unchanged and that the result holds
 *      every word.
 * toString() is expected to list the words in sorted order.
 */
TIMED_TEST(LexiconTests, initializerListTest_Lexicon, TEST_TIMEOUT_DEFAULT) {
    std::initializer_list<std::string> lexlist = {"sixty", "seventy"};

    // Phase 1: braced-list construction.
    Lexicon lex {"ten", "twenty", "thirty"};
    assertEqualsString("init list Lexicon", "{\"ten\", \"thirty\", \"twenty\"}", lex.toString());
    assertEqualsInt("init list Lexicon size", 3, lex.size());
    assertTrue("init list Lexicon contains ten", lex.contains("ten"));
    assertTrue("init list Lexicon contains twenty", lex.contains("twenty"));
    assertTrue("init list Lexicon contains thirty", lex.contains("thirty"));
    assertFalse("init list Lexicon contains forty", lex.contains("forty"));
    assertFalse("init list Lexicon contains fifty", lex.contains("fifty"));

    // Phase 2: operator += adds the listed words to lex itself.
    lex += {"forty", "fifty"};
    assertEqualsString("after += Lexicon", "{\"fifty\", \"forty\", \"ten\", \"thirty\", \"twenty\"}", lex.toString());
    assertEqualsInt("after += Lexicon size", 5, lex.size());
    assertTrue("init list Lexicon contains ten", lex.contains("ten"));
    assertTrue("init list Lexicon contains twenty", lex.contains("twenty"));
    assertTrue("init list Lexicon contains thirty", lex.contains("thirty"));
    assertTrue("init list Lexicon contains forty", lex.contains("forty"));
    assertTrue("init list Lexicon contains fifty", lex.contains("fifty"));
    assertFalse("init list Lexicon contains sixty", lex.contains("sixty"));
    assertFalse("init list Lexicon contains seventy", lex.contains("seventy"));

    // Phase 3: operator + must leave lex untouched ...
    Lexicon lex2 = (lex + lexlist);
    assertEqualsString("after + Lexicon", "{\"fifty\", \"forty\", \"ten\", \"thirty\", \"twenty\"}", lex.toString());
    assertEqualsInt("after + Lexicon size", 5, lex.size());
    assertTrue("init list Lexicon contains ten", lex.contains("ten"));
    assertTrue("init list Lexicon contains twenty", lex.contains("twenty"));
    assertTrue("init list Lexicon contains thirty", lex.contains("thirty"));
    assertTrue("init list Lexicon contains forty", lex.contains("forty"));
    assertTrue("init list Lexicon contains fifty", lex.contains("fifty"));
    assertFalse("init list Lexicon contains sixty", lex.contains("sixty"));
    assertFalse("init list Lexicon contains seventy", lex.contains("seventy"));

    // ... while the returned lexicon contains the union of both operands.
    assertEqualsString("after + Lexicon 2", "{\"fifty\", \"forty\", \"seventy\", \"sixty\", \"ten\", \"thirty\", \"twenty\"}", lex2.toString());
    assertEqualsInt("after + Lexicon 2 size", 7, lex2.size());
    assertTrue("init list Lexicon contains ten", lex2.contains("ten"));
    assertTrue("init list Lexicon contains twenty", lex2.contains("twenty"));
    assertTrue("init list Lexicon contains thirty", lex2.contains("thirty"));
    assertTrue("init list Lexicon contains forty", lex2.contains("forty"));
    assertTrue("init list Lexicon contains fifty", lex2.contains("fifty"));
    assertTrue("init list Lexicon contains sixty", lex2.contains("sixty"));
    assertTrue("init list Lexicon contains seventy", lex2.contains("seventy"));
}
Пример #2
0
/*
 * Reports whether this lexicon and lex2 are equal.
 * Comparing an object with itself is answered immediately; otherwise the
 * sizes must match and the m_allWords members must compare equal.
 */
bool Lexicon::equals(const Lexicon& lex2) const {
    // identity shortcut: the very same object is trivially equal to itself
    const bool sameObject = (&lex2 == this);
    if (sameObject) {
        return true;
    }
    // cheap size check first; the word-by-word comparison only runs on a match
    return size() == lex2.size() && m_allWords == lex2.m_allWords;
}
/*
 * Basic membership test for Lexicon: adds a handful of words, then checks
 * size(), contains() for present and absent words, and containsPrefix() for
 * every non-empty prefix of each added word as well as for strings that are
 * not a prefix of any added word.
 */
TIMED_TEST(LexiconTests, basicTest_Lexicon, TEST_TIMEOUT_DEFAULT) {
    std::initializer_list<std::string> words = {
        "a",
        "ab",
        "aab",
        "aaab",
        "aardvark",
        "b",
        "banana"
    };
    std::initializer_list<std::string> badWords = {
        "abb",
        "ad",
        "and",
        "aaardvark",
        "aardvarks",
    };
    std::initializer_list<std::string> badPrefixes = {
        "aaaa",
        "abb",
        "aardvarz",
        "bb",
        "bananas",
        "c",
        "r",
        "z"
    };

    Lexicon lex;
    for (const std::string& word : words) {
        lex.add(word);
    }
    assertEquals("Lexicon size", words.size(), lex.size());

    for (const std::string& word : words) {
        assertTrue("Lexicon contains " + word, lex.contains(word));
    }

    for (const std::string& word : badWords) {
        assertFalse("Lexicon contains " + word, lex.contains(word));
    }

    // Check every non-empty prefix of each word, up to and including the word
    // itself. The previous loop built the prefix but then queried the whole
    // word, so proper prefixes were never actually tested.
    // The empty prefix is deliberately not asserted: how containsPrefix("")
    // behaves is not visible from this test file.
    for (const std::string& word : words) {
        for (std::string::size_type len = 1; len <= word.length(); len++) {
            std::string prefix = word.substr(0, len);
            assertTrue("Lexicon containsPrefix " + prefix, lex.containsPrefix(prefix));
        }
    }

    for (const std::string& word : badPrefixes) {
        assertFalse("Lexicon containsPrefix " + word, lex.containsPrefix(word));
    }
}
Пример #4
0
/* main */
int main() {
    Lexicon english;
    english.addWordsFromFile("dictionary.txt");
    int total = 0;
    int totalWords = english.size();
    // C++11 "for" iteration loop
    for (string str : english) {
        string prefix = "s" + str;
        string suffix = str + "s";
        if (english.contains(prefix) && english.contains(suffix)) {
            cout << str << " :: " << prefix << " & " << suffix << endl;
            total++;
        }
    }
    cout << endl << "The total number of words: " << totalWords << endl;
    cout << "The total S words: " << total << endl;
    return 0;
}
Пример #5
0
/*
 * Shrinks the collected word set until its text fits in max_size bytes,
 * assigns a code to every remaining lexicon entry and to every byte value,
 * and writes the resulting tables to inFile.
 *
 * Steps, in the order they appear below:
 *   1. Seed `lexicon` with the "\x1b" entry and create 256 per-byte counters.
 *   2. Copy word_set into index/count/text arrays, build a heap over them and
 *      drop heap-top words until the total text size is <= max_size. Each
 *      dropped word bumps the "\x1b" entry's count and the counters of its
 *      characters (plus counter 0 once per word).
 *   3. Turn the surviving words into lexicon entries, sorted after entry 0.
 *   4. Derive a per-entry length from the counts (stored back into cnt),
 *      build the numl / firstcode / six tables and the symbol table, and
 *      assign each entry its code.
 *   5. Write n, firstcode[32], six[32], the text length and the packed
 *      NUL-terminated texts.
 *   6. Repeat steps 4-5 for the 256 byte counters (stored in `rest`), writing
 *      n, firstcode[32], six[32] and the byte symbol table.
 *
 * NOTE(review): the length/firstcode/offset tables have the shape of a
 * canonical Huffman code built in place -- confirm against the decoder.
 *
 * @param inFile  stream the tables are written to, wrapped in an
 *                HSwapStream<net_swapper>.
 */
void CCmpLexicon::Reduce(HStreamBase& inFile)
{
	HSwapStream<net_swapper> data(inFile);
	
	// Entry 0 is the "\x1b" entry; its count is incremented below once for
	// every word that gets dropped from the lexicon.
	lexicon.push_back(LexEntry("\x1b", 0));
	
	RestCharacters chars;
	uint32 i;
	
	// One RestChar per byte value, each starting with a count of 1.
	for (i = 0; i < 256; ++i)
	{
		chars.push_back(RestChar());
		chars[i].ch = static_cast<unsigned char>(i);
		chars[i].cnt = 1;	// was 0
		chars[i].code = 0;
	}

	uint32 n, h;

	// try to reduce the lexicon size to something reasonable
	LexiconSet::iterator w;

	// A[0..n) holds indices into A[n..2n), which holds the per-word counts.
	n = word_set.size();
	HAutoBuf<uint32> A_(new uint32[n * 2]);
	uint32* A = A_.get();
	
	HAutoBuf<const char*> str(new const char*[n]);
	
	// s accumulates the total text size, including one terminator per word.
	uint32 s = 0;
	i = 0;
	for (w = word_set.begin(); w != word_set.end(); ++w, ++i)
	{
		A[i] = i + n;
		A[i + n] = (*w).second;
		str[i] = (*w).first;
		s += strlen(str[i]) + 1;
	}

//	word_set.clear();
	word_set = LexiconSet();
	
	h = n;
	make_heap(A, A + h, CntCompare(A));
	
	// Drop the word at the heap top until the remaining text fits.
	// NOTE(review): A[0] is overwritten before pop_heap is called, so the
	// range handed to pop_heap is not the heap it was a moment ago -- confirm
	// this idiom behaves as intended with the standard library in use.
	while (s > max_size)
	{
		const char* t = str[A[0] - n];
		
		++lexicon.front().cnt;
		
		// Account for the dropped word's characters in the byte counters.
		for (const char* p = t; *p; ++p)
			++chars[static_cast<unsigned char>(*p)].cnt;
		++chars[0].cnt;
		
		s -= strlen(t) + 1;
		A[0] = A[h - 1];
		--h;
		pop_heap(A, A + h, CntCompare(A));
	}
	
	// The h surviving words become lexicon entries (text, count).
	for (i = 0; i < h; ++i)
		lexicon.push_back(LexEntry(str[A[i] - n], A[A[i]]));

	// Entry 0 ("\x1b") keeps its position; only the words are sorted.
	sort(lexicon.begin() + 1, lexicon.end());
	
	n = lexicon.size();
	// Hand the new work array to A_ so it is owned and released like the
	// first one. A bare `new` here was never freed: A is overwritten again
	// further down when the byte counters are processed.
	A_.reset(new uint32[n * 2]);
	A = A_.get();

	for (i = 0; i < n; ++i)
	{
		A[i] = i + n;
		A[i + n] = lexicon[i].cnt;
	}
	
	h = n;
	make_heap(A, A + h, CntCompare(A));
	
	// Repeatedly take two entries off the heap and merge them: slot h gets
	// the summed count, and both merged slots are overwritten with h, i.e.
	// they now point at the merged slot.
	while (h > 1)
	{
		uint32 m1 = A[0];
		A[0] = A[h - 1];
		--h;
		pop_heap(A, A + h, CntCompare(A));
		
		uint32 m2 = A[0];
		A[0] = A[h - 1];
		
		A[h] = A[m1] + A[m2];
		A[0] = h;
		A[m1] = A[m2] = h;
		
		pop_heap(A, A + h);
	}
	
	// Replace each pointer with (value at the slot it points to) + 1,
	// starting from A[1] = 0; A[n..2n) then holds one length per entry.
	A[1] = 0;
	for (i = 2; i < 2 * n; ++i)
		A[i] = A[A[i]] + 1;
	
	for (i = 0; i < n; ++i)
		lexicon[i].cnt = A[i + n];

	uint32 numl[32];
	uint32 firstcode[32];
	uint32 nextcode[32];
	
	// numl[l] = number of entries with length l.
	// NOTE(review): lengths are used as indices into 32-element arrays
	// without a bounds check -- assumes no length reaches 32; confirm.
	for (i = 0; i < 32; ++i)
		numl[i] = 0;
	
	for (i = 0; i < n; ++i)
		++numl[A[i + n]];
	
	// firstcode[l] is derived from the next longer length, working downwards.
	firstcode[31] = 0;
	for (int l = 30; l >= 0; --l)
		firstcode[l] = (firstcode[l + 1] + numl[l + 1]) / 2;
	
	for (int l = 0; l < 32; ++l)
		nextcode[l] = firstcode[l];
	
	HAutoBuf<uint32> symbol_table(new uint32[n]);
	
	// six[l] = number of entries with a length below l, i.e. the offset of
	// length l's first slot in symbol_table.
	uint32 six[32];
	six[0] = 0;
	for (i = 1; i < 32; ++i)
		six[i] = six[i - 1] + numl[i - 1];
	
	// Hand out consecutive codes within each length and record which entry
	// sits in which symbol_table slot.
	for (i = 0; i < n; ++i)
	{
		uint32 li = A[i + n];
		
		lexicon[i].code = nextcode[li];
		symbol_table[six[li] + nextcode[li] - firstcode[li]] = i;
		++nextcode[li];
	}
	
	data << n;
	for (i = 0; i < 32; ++i)	data << firstcode[i];
	for (i = 0; i < 32; ++i)	data << six[i];

	uint32 symbol_text_length = 0;
	for (i = 0; i < n; ++i)
		symbol_text_length += strlen(lexicon[symbol_table[i]].text) + 1;
	
	symbol_text = new char[symbol_text_length];
	char* d = symbol_text;
	
	// Pack the texts back to back in symbol_table order, repoint each entry
	// at its packed copy, and turn symbol_table into offsets into that buffer.
	for (i = 0; i < n; ++i)
	{
		strcpy(d, lexicon[symbol_table[i]].text);
		lexicon[symbol_table[i]].text = d;
		symbol_table[i] = static_cast<uint32>(d - symbol_text);
		d += strlen(d) + 1;
	}

	data << symbol_text_length;
	data.Write(symbol_text, symbol_text_length);

	// and now repeat all steps for the rest characters
	
	// Count how many characters we actually have:
	n = 0;
	
	rest = chars;
//	for (RestCharacters::iterator i = chars.begin(); i != chars.end(); ++i)
//	{
//		if ((*i).cnt != 0)
//		{
//			rest.push_back(*i);
//			rest.back().cnt = 0;
//		}
//	}

	n = rest.size();
	A_.reset(new uint32[n * 2]);
	A = A_.get();

	for (i = 0; i < n; ++i)
	{
		A[i] = i + n;
		A[i + n] = rest[i].cnt;
	}
	
	h = n;
	make_heap(A, A + h, CntCompare(A));
	
	// Same pairwise merge as for the lexicon entries above.
	while (h > 1)
	{
		uint32 m1 = A[0];
		A[0] = A[h - 1];
		--h;
		pop_heap(A, A + h, CntCompare(A));
		
		uint32 m2 = A[0];
		A[0] = A[h - 1];
		
		A[h] = A[m1] + A[m2];
		A[0] = h;
		A[m1] = A[m2] = h;
		
		pop_heap(A, A + h);
	}
	
	A[1] = 0;
	for (i = 2; i < 2 * n; ++i)
		A[i] = A[A[i]] + 1;
	
	for (i = 0; i < n; ++i)
		rest[i].cnt = A[i + n];

	for (i = 0; i < 32; ++i)
		numl[i] = 0;
	
	for (i = 0; i < n; ++i)
		++numl[A[i + n]];
	
	firstcode[31] = 0;
	for (int l = 30; l >= 0; --l)
		firstcode[l] = (firstcode[l + 1] + numl[l + 1]) / 2;
	
	for (int l = 0; l < 32; ++l)
		nextcode[l] = firstcode[l];
	
	six[0] = 0;
	for (i = 1; i < 32; ++i)
		six[i] = six[i - 1] + numl[i - 1];
	
	// For bytes the symbol table stores the byte value itself.
	HAutoBuf<unsigned char> char_symbol_table(new unsigned char[n]);
	
	for (i = 0; i < n; ++i)
	{
		uint32 li = A[i + n];
		
		rest[i].code = nextcode[li];
		char_symbol_table[six[li] + nextcode[li] - firstcode[li]] = rest[i].ch;
		++nextcode[li];
	}
	
	data << n;
	for (i = 0; i < 32; ++i)	data << firstcode[i];
	for (i = 0; i < 32; ++i)	data << six[i];
	data.Write(char_symbol_table.get(), n);
}