/*
 * Exercises Lexicon's initializer-list support in three phases:
 *   1. construction from a brace list,
 *   2. in-place union via operator += with a brace list,
 *   3. non-mutating union via operator + with an initializer_list variable.
 * After phase 3 the left operand (lex) must be unchanged and the result
 * (lex2) must contain all seven words.
 */
TIMED_TEST(LexiconTests, initializerListTest_Lexicon, TEST_TIMEOUT_DEFAULT) {
    std::initializer_list<std::string> lexlist = {"sixty", "seventy"};

    // Phase 1: construct from an initializer list.
    Lexicon lex {"ten", "twenty", "thirty"};
    assertEqualsString("init list Lexicon", "{\"ten\", \"thirty\", \"twenty\"}", lex.toString());
    assertEqualsInt("init list Lexicon size", 3, lex.size());
    assertTrue("init list Lexicon contains ten", lex.contains("ten"));
    assertTrue("init list Lexicon contains twenty", lex.contains("twenty"));
    assertTrue("init list Lexicon contains thirty", lex.contains("thirty"));
    assertFalse("init list Lexicon contains forty", lex.contains("forty"));
    assertFalse("init list Lexicon contains fifty", lex.contains("fifty"));

    // Phase 2: operator += adds the listed words to lex itself.
    lex += {"forty", "fifty"};
    assertEqualsString("after += Lexicon", "{\"fifty\", \"forty\", \"ten\", \"thirty\", \"twenty\"}", lex.toString());
    assertEqualsInt("after += Lexicon size", 5, lex.size());
    assertTrue("init list Lexicon contains ten", lex.contains("ten"));
    assertTrue("init list Lexicon contains twenty", lex.contains("twenty"));
    assertTrue("init list Lexicon contains thirty", lex.contains("thirty"));
    assertTrue("init list Lexicon contains forty", lex.contains("forty"));
    assertTrue("init list Lexicon contains fifty", lex.contains("fifty"));
    assertFalse("init list Lexicon contains sixty", lex.contains("sixty"));
    assertFalse("init list Lexicon contains seventy", lex.contains("seventy"));

    // Phase 3: operator + yields a new lexicon; lex must stay as it was.
    Lexicon lex2 = (lex + lexlist);
    assertEqualsString("after + Lexicon", "{\"fifty\", \"forty\", \"ten\", \"thirty\", \"twenty\"}", lex.toString());
    assertEqualsInt("after + Lexicon size", 5, lex.size());
    assertTrue("init list Lexicon contains ten", lex.contains("ten"));
    assertTrue("init list Lexicon contains twenty", lex.contains("twenty"));
    assertTrue("init list Lexicon contains thirty", lex.contains("thirty"));
    assertTrue("init list Lexicon contains forty", lex.contains("forty"));
    assertTrue("init list Lexicon contains fifty", lex.contains("fifty"));
    assertFalse("init list Lexicon contains sixty", lex.contains("sixty"));
    assertFalse("init list Lexicon contains seventy", lex.contains("seventy"));

    // The result of operator + holds the five old words plus the two new ones.
    assertEqualsString("after + Lexicon 2", "{\"fifty\", \"forty\", \"seventy\", \"sixty\", \"ten\", \"thirty\", \"twenty\"}", lex2.toString());
    assertEqualsInt("after + Lexicon 2 size", 7, lex2.size());
    assertTrue("init list Lexicon contains ten", lex2.contains("ten"));
    assertTrue("init list Lexicon contains twenty", lex2.contains("twenty"));
    assertTrue("init list Lexicon contains thirty", lex2.contains("thirty"));
    assertTrue("init list Lexicon contains forty", lex2.contains("forty"));
    assertTrue("init list Lexicon contains fifty", lex2.contains("fifty"));
    assertTrue("init list Lexicon contains sixty", lex2.contains("sixty"));
    assertTrue("init list Lexicon contains seventy", lex2.contains("seventy"));
}
bool Lexicon::equals(const Lexicon& lex2) const { // optimization: if literally same lexicon, stop if (this == &lex2) { return true; } if (size() != lex2.size()) { return false; } return m_allWords == lex2.m_allWords; }
/*
 * Basic add / contains / containsPrefix checks on a small hand-built lexicon.
 *
 *   words       - added to the lexicon; each must be reported by contains().
 *   badWords    - never added; contains() must reject each of them.
 *   badPrefixes - strings that are not a prefix of any added word.
 */
TIMED_TEST(LexiconTests, basicTest_Lexicon, TEST_TIMEOUT_DEFAULT) {
    std::initializer_list<std::string> words = {
        "a", "ab", "aab", "aaab", "aardvark", "b", "banana"
    };
    std::initializer_list<std::string> badWords = {
        "abb", "ad", "and", "aaardvark", "aardvarks",
    };
    std::initializer_list<std::string> badPrefixes = {
        "aaaa", "abb", "aardvarz", "bb", "bananas", "c", "r", "z"
    };

    Lexicon lex;
    for (const std::string& word : words) {
        lex.add(word);
    }
    assertEquals("Lexicon size", words.size(), lex.size());

    for (const std::string& word : words) {
        assertTrue("Lexicon contains " + word, lex.contains(word));
    }

    for (const std::string& word : badWords) {
        assertFalse("Lexicon contains " + word, lex.contains(word));
    }

    // Every proper prefix of an added word must be recognised as a prefix.
    // i == 0 yields the empty string, so the empty prefix is checked as well.
    // The check uses `prefix`; previously the whole word was passed, which
    // left the computed prefix unused and the prefixes untested.
    for (const std::string& word : words) {
        for (int i = 0; i < (int) word.length(); i++) {
            std::string prefix = word.substr(0, i);
            assertTrue("Lexicon containsPrefix " + prefix, lex.containsPrefix(prefix));
        }
    }

    for (const std::string& word : badPrefixes) {
        assertFalse("Lexicon containsPrefix " + word, lex.containsPrefix(word));
    }
}
/* main */ int main() { Lexicon english; english.addWordsFromFile("dictionary.txt"); int total = 0; int totalWords = english.size(); // C++11 "for" iteration loop for (string str : english) { string prefix = "s" + str; string suffix = str + "s"; if (english.contains(prefix) && english.contains(suffix)) { cout << str << " :: " << prefix << " & " << suffix << endl; total++; } } cout << endl << "The total number of words: " << totalWords << endl; cout << "The total S words: " << total << endl; return 0; }
/*
 * Shrinks the collected word set to at most max_size bytes of text, assigns
 * code lengths and codes to the surviving words and to the 256 "rest"
 * characters, and writes both code tables through a HSwapStream wrapped
 * around inFile.
 *
 * Steps, as visible in the code below:
 *   1. Seed `lexicon` with an escape entry ("\x1b") and create 256 character
 *      entries, each with an initial count of 1.
 *   2. Put every word of word_set into a heap keyed through CntCompare and
 *      drop the heap's top word until the total text size (including
 *      terminators) fits in max_size. A dropped word bumps the escape
 *      entry's count and the counts of its characters.
 *      NOTE(review): presumably CntCompare keeps the least frequent entry at
 *      A[0] - confirm against its definition.
 *   3. Compute a code length per lexicon entry by repeatedly merging the two
 *      top heap nodes (this appears to be the in-place Huffman code-length
 *      calculation), then derive numl / firstcode / six tables and assign
 *      codes per length.
 *   4. Write n, firstcode[32], six[32], and the concatenated, NUL-terminated
 *      symbol texts. Lexicon entries are re-pointed into symbol_text.
 *   5. Repeat steps 3-4 for the character table (`rest`), writing the
 *      character symbol table instead of text.
 *
 * Side effects: appends to `lexicon`, replaces `word_set` with an empty set,
 * assigns `rest`, and allocates `symbol_text` with new[]. This function does
 * not release symbol_text; its ownership is not visible here.
 *
 * The index array A holds 2*n slots: A[0..n) are heap slots holding indices
 * into A[n..2n), which in turn hold the counts (later parent links and then
 * depths).
 */
void CCmpLexicon::Reduce(HStreamBase& inFile) {
    HSwapStream<net_swapper> data(inFile);

    // Entry 0 is the escape symbol; it collects the counts of dropped words.
    lexicon.push_back(LexEntry("\x1b", 0));

    RestCharacters chars;
    uint32 i;
    for (i = 0; i < 256; ++i) {
        chars.push_back(RestChar());
        chars[i].ch = static_cast<unsigned char>(i);
        chars[i].cnt = 1; // was 0
        chars[i].code = 0;
    }

    uint32 n, h;

    // try to reduce the lexicon size to something reasonable
    LexiconSet::iterator w;
    n = word_set.size();
    HAutoBuf<uint32> A_(new uint32[n * 2]);
    uint32* A = A_.get();
    HAutoBuf<const char*> str(new const char*[n]);
    uint32 s = 0;   // total bytes of word text, terminators included
    i = 0;
    for (w = word_set.begin(); w != word_set.end(); ++w, ++i) {
        A[i] = i + n;
        A[i + n] = (*w).second;
        str[i] = (*w).first;
        s += strlen(str[i]) + 1;
    }

    // word_set.clear();
    word_set = LexiconSet();

    h = n;
    make_heap(A, A + h, CntCompare(A));

    // Drop the heap's top word until the remaining text fits in max_size.
    while (s > max_size) {
        const char* t = str[A[0] - n];
        ++lexicon.front().cnt;
        for (const char* p = t; *p; ++p)
            ++chars[static_cast<unsigned char>(*p)].cnt;
        ++chars[0].cnt;     // accounts for the word's terminator
        s -= strlen(t) + 1;
        A[0] = A[h - 1];
        --h;
        pop_heap(A, A + h, CntCompare(A));
    }

    for (i = 0; i < h; ++i)
        lexicon.push_back(LexEntry(str[A[i] - n], A[A[i]]));

    // Keep the escape entry first; sort only the real words.
    sort(lexicon.begin() + 1, lexicon.end());

    n = lexicon.size();
    // The old index array is no longer needed. Hand the new one to A_ so it
    // is owned (a bare `new` here was never released).
    A_.reset(new uint32[n * 2]);
    A = A_.get();
    for (i = 0; i < n; ++i) {
        A[i] = i + n;
        A[i + n] = lexicon[i].cnt;
    }
    h = n;
    make_heap(A, A + h, CntCompare(A));

    // Merge the two top nodes into a parent stored at A[h] until one remains.
    while (h > 1) {
        uint32 m1 = A[0];
        A[0] = A[h - 1];
        --h;
        pop_heap(A, A + h, CntCompare(A));
        uint32 m2 = A[0];
        A[0] = A[h - 1];
        A[h] = A[m1] + A[m2];
        A[0] = h;
        A[m1] = A[m2] = h;  // children now store their parent's index
        pop_heap(A, A + h);
    }

    // Turn parent links into depths: the root has depth 0.
    A[1] = 0;
    for (i = 2; i < 2 * n; ++i)
        A[i] = A[A[i]] + 1;

    for (i = 0; i < n; ++i)
        lexicon[i].cnt = A[i + n];

    uint32 numl[32];        // number of symbols per code length
    uint32 firstcode[32];   // first code value per code length
    uint32 nextcode[32];    // next unassigned code value per code length

    for (i = 0; i < 32; ++i)
        numl[i] = 0;
    for (i = 0; i < n; ++i)
        ++numl[A[i + n]];

    firstcode[31] = 0;
    for (int l = 30; l >= 0; --l)
        firstcode[l] = (firstcode[l + 1] + numl[l + 1]) / 2;

    for (int l = 0; l < 32; ++l)
        nextcode[l] = firstcode[l];

    HAutoBuf<uint32> symbol_table(new uint32[n]);
    uint32 six[32];         // start index in symbol_table per code length
    six[0] = 0;
    for (i = 1; i < 32; ++i)
        six[i] = six[i - 1] + numl[i - 1];

    for (i = 0; i < n; ++i) {
        uint32 li = A[i + n];
        lexicon[i].code = nextcode[li];
        symbol_table[six[li] + nextcode[li] - firstcode[li]] = i;
        ++nextcode[li];
    }

    data << n;
    for (i = 0; i < 32; ++i)
        data << firstcode[i];
    for (i = 0; i < 32; ++i)
        data << six[i];

    uint32 symbol_text_length = 0;
    for (i = 0; i < n; ++i)
        symbol_text_length += strlen(lexicon[symbol_table[i]].text) + 1;

    symbol_text = new char[symbol_text_length];
    char* d = symbol_text;
    for (i = 0; i < n; ++i) {
        // Order matters: symbol_table[i] is read as a lexicon index before it
        // is overwritten with the text offset.
        strcpy(d, lexicon[symbol_table[i]].text);
        lexicon[symbol_table[i]].text = d;
        symbol_table[i] = static_cast<uint32>(d - symbol_text);
        d += strlen(d) + 1;
    }

    data << symbol_text_length;
    data.Write(symbol_text, symbol_text_length);

    // and now repeat all steps for the rest characters
    // All 256 characters are kept: the filter that copied only characters
    // with a non-zero count is disabled, and every count starts at 1.
    rest = chars;

    n = rest.size();
    A_.reset(new uint32[n * 2]);
    A = A_.get();
    for (i = 0; i < n; ++i) {
        A[i] = i + n;
        A[i + n] = rest[i].cnt;
    }
    h = n;
    make_heap(A, A + h, CntCompare(A));

    while (h > 1) {
        uint32 m1 = A[0];
        A[0] = A[h - 1];
        --h;
        pop_heap(A, A + h, CntCompare(A));
        uint32 m2 = A[0];
        A[0] = A[h - 1];
        A[h] = A[m1] + A[m2];
        A[0] = h;
        A[m1] = A[m2] = h;
        pop_heap(A, A + h);
    }

    A[1] = 0;
    for (i = 2; i < 2 * n; ++i)
        A[i] = A[A[i]] + 1;

    for (i = 0; i < n; ++i)
        rest[i].cnt = A[i + n];

    for (i = 0; i < 32; ++i)
        numl[i] = 0;
    for (i = 0; i < n; ++i)
        ++numl[A[i + n]];

    firstcode[31] = 0;
    for (int l = 30; l >= 0; --l)
        firstcode[l] = (firstcode[l + 1] + numl[l + 1]) / 2;

    for (int l = 0; l < 32; ++l)
        nextcode[l] = firstcode[l];

    six[0] = 0;
    for (i = 1; i < 32; ++i)
        six[i] = six[i - 1] + numl[i - 1];

    HAutoBuf<unsigned char> char_symbol_table(new unsigned char[n]);
    for (i = 0; i < n; ++i) {
        uint32 li = A[i + n];
        rest[i].code = nextcode[li];
        char_symbol_table[six[li] + nextcode[li] - firstcode[li]] = rest[i].ch;
        ++nextcode[li];
    }

    data << n;
    for (i = 0; i < 32; ++i)
        data << firstcode[i];
    for (i = 0; i < 32; ++i)
        data << six[i];
    data.Write(char_symbol_table.get(), n);
}