/** SQLite callback function. Builds prediction from query results. */
int buildPrediction( void* callbackDataPtr,
                     int argc,
                     char** argv,
                     char** column )
{
    // cast void pointer back to pointer to CallbackData object
    CallbackData* dataPtr = static_cast<CallbackData*>(callbackDataPtr);

    Prediction* predictionPtr = dataPtr->predPtr;
    size_t maxPredictionSize = dataPtr->predSize;

    if (predictionPtr->size() > maxPredictionSize) {
        return 1;
    } else {
        if( argc == 2 &&
            strcmp( "word", column[ 0 ] ) == 0 &&
            strcmp( "count", column[ 1 ] ) == 0 ) {
            predictionPtr->addSuggestion(
                Suggestion( argv[ argc - 2 ],
                            atof( argv[ argc - 1 ] ) )
                );
        } else {
            std::cerr << "Invalid invocation of buildPrediction function!"
                      << std::endl;
            exit( 1 );
        }
    }
    return 0;
}
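// buildPrediction() receives its state through the opaque void* that the
// SQLite exec call passes along. A minimal sketch of the CallbackData
// structure it expects (field names taken directly from the usage above;
// the actual definition lives elsewhere in the plugin):
struct CallbackData {
    Prediction* predPtr;  // prediction being built up, one result row at a time
    size_t      predSize; // maximum number of suggestions to collect
};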
void SelectorTest::testSelect(TestDataSuite* tds)
{
    while (tds->hasMoreTestData()) {
        std::cerr << "Updating strstream: " << strstream->str() << '|' << std::endl
                  << " with: " << tds->getUpdateString() << '|' << std::endl;
        *strstream << tds->getUpdateString();

        std::vector<std::string> selectedTokens;
        selectedTokens = selector->select(tds->getInputPrediction());

        Prediction expected = tds->getOutputPrediction();
        CPPUNIT_ASSERT_EQUAL( (size_t)expected.size(), selectedTokens.size() );

        std::vector<std::string>::const_iterator actual_it = selectedTokens.begin();
        int index = 0;
        while (actual_it != selectedTokens.end()) {
            std::cerr << "[expected] " << expected.getSuggestion(index).getWord()
                      << " [actual] " << *actual_it << std::endl;
            CPPUNIT_ASSERT_EQUAL(expected.getSuggestion(index).getWord(),
                                 *actual_it);
            index++;
            actual_it++;
        }

        contextTracker->update();
        tds->nextTestData();
    }
}
Prediction RecencyPredictor::predict (const size_t max, const char** filter) const
{
    Prediction result;

    std::string prefix = contextTracker->getPrefix();
    logger << INFO << "prefix: " << prefix << endl;
    if (!prefix.empty()) {
        // Only build recency prediction if prefix is not empty: when
        // prefix is empty, all previously seen tokens are candidates
        // for prediction. This is not desirable, because it means
        // that recency prediction reduces to repetition of max previous
        // tokens (i.e. the prediction would contain the most recent
        // tokens in reverse order).
        //
        Suggestion  suggestion;
        size_t      index = 1;
        std::string token = contextTracker->getToken(index);
        double      prob = 0;
        while (!token.empty()                // context history exhausted
               && result.size() < max        // need only max suggestions
               && index <= cutoff_threshold  // look back only as far as cutoff
            ) {
            logger << INFO << "token: " << token << endl;

            if (token.find(prefix) == 0) { // token starts with prefix

                if (token_satisfies_filter (token, prefix, filter)) {

                    // compute probability according to exponential
                    // decay formula
                    //
                    prob = n_0 * exp(-(lambda * (index - 1)));
                    logger << INFO << "probability: " << prob << endl;
                    suggestion.setWord(token);
                    suggestion.setProbability(prob);
                    result.addSuggestion(suggestion);
                }
            }

            index++;
            token = contextTracker->getToken(index);
        }
    }

    return result;
}
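// The exponential decay model above assigns a token found `index` positions
// back in the history the probability
//
//   p(index) = n_0 * exp(-lambda * (index - 1))
//
// so the most recent match gets weight n_0 and each step further back decays
// geometrically. A minimal standalone sketch (the n_0 and lambda values below
// are illustrative assumptions, not the predictor's configured defaults):

#include <cmath>

static double recency_probability(double n_0, double lambda, size_t index)
{
    // index is the 1-based distance into the context history,
    // exactly as in RecencyPredictor::predict() above
    return n_0 * std::exp(-(lambda * (index - 1)));
}

// e.g. with n_0 = 1.0 and lambda = 0.5:
//   index 1 -> 1.000, index 2 -> 0.607, index 3 -> 0.368, index 4 -> 0.223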
bool Prediction::operator== (const Prediction& right) const
{
    // same instance is obviously equal to itself
    if (&right == this) {
        return true;
    } else {
        if (size() != right.size()) {
            return false;
        } else {
            // need to compare each suggestion
            bool result = true;
            size_t i = 0;
            while (i < size() && result) {
                if (getSuggestion(i) != right.getSuggestion(i)) {
                    result = false;
                }
                i++;
            }
            return result;
        }
    }
}
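// The usual companion to the equality operator above; a minimal sketch,
// assuming Prediction does not already declare operator!= elsewhere:
bool Prediction::operator!= (const Prediction& right) const
{
    return !(*this == right);
}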
void PredictorRegistryTest::testNext()
{
    ContextTracker* pointer = static_cast<ContextTracker*>((void*)0xdeadbeef);
    registry->setContextTracker(pointer);

    PredictorRegistry::Iterator it = registry->iterator();
    Predictor* predictor = 0;

    while (it.hasNext()) {
        predictor = it.next();
    }

    // since we've iterated to the end of the predictors list, predictor
    // is now pointing to the DummyPredictor, so let's check we got the
    // dummy prediction back
    CPPUNIT_ASSERT(predictor != 0); // check before dereferencing predictor
    Prediction prediction = predictor->predict(20, 0);
    size_t expected_size = 18;
    CPPUNIT_ASSERT_EQUAL(expected_size, prediction.size());
    CPPUNIT_ASSERT_EQUAL(Suggestion("foo1", 0.99), prediction.getSuggestion(0));
    CPPUNIT_ASSERT_EQUAL(Suggestion("foobar6", 0.74), prediction.getSuggestion(17));
}
void NewSmoothedNgramPluginTest::testLearning()
{
    // get pointer to plugin
    Plugin* plugin = pluginRegistry->iterator().next();

    {
        *stream << "f";
        ct->update();
        Prediction actual = plugin->predict(SIZE, 0);
        CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(0), actual.size());
    }

    {
        *stream << "o";
        ct->update();
        Prediction actual = plugin->predict(SIZE, 0);
        CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(0), actual.size());
    }

    {
        *stream << "o ";
        ct->update();
        Prediction actual = plugin->predict(SIZE, 0);
        CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(1), actual.size());
        CPPUNIT_ASSERT_EQUAL(std::string("foo"), actual.getSuggestion(0).getWord());
        ct->update();
    }

    {
        *stream << "bar";
        ct->update();
        // no assertions here: this step only feeds "bar" into the context
        Prediction actual = plugin->predict(SIZE, 0);
    }

    {
        *stream << " ";
        ct->update();
        Prediction actual = plugin->predict(SIZE, 0);
        CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(2), actual.size());
        CPPUNIT_ASSERT_EQUAL(std::string("foo"), actual.getSuggestion(0).getWord());
        CPPUNIT_ASSERT_EQUAL(std::string("bar"), actual.getSuggestion(1).getWord());
    }

    {
        *stream << "foobar ";
        ct->update();
        Prediction actual = plugin->predict(SIZE, 0);
        CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(3), actual.size());
        CPPUNIT_ASSERT_EQUAL(std::string("foobar"), actual.getSuggestion(0).getWord());
        CPPUNIT_ASSERT_EQUAL(std::string("foo"), actual.getSuggestion(1).getWord());
        CPPUNIT_ASSERT_EQUAL(std::string("bar"), actual.getSuggestion(2).getWord());
    }

    {
        *stream << "f";
        ct->update();
        Prediction actual = plugin->predict(SIZE, 0);
        CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(2), actual.size());
        CPPUNIT_ASSERT_EQUAL(std::string("foobar"), actual.getSuggestion(0).getWord());
        CPPUNIT_ASSERT_EQUAL(std::string("foo"), actual.getSuggestion(1).getWord());
    }
}
void NewSmoothedNgramPluginTest::testFilter()
{
    // get pointer to plugin
    Plugin* plugin = pluginRegistry->iterator().next();

    std::vector<std::string> change;
    change.push_back("foo");
    change.push_back("bar");
    change.push_back("foobar");
    change.push_back("foz");
    change.push_back("baz");
    change.push_back("fozbaz");
    change.push_back("roo");
    change.push_back("rar");
    change.push_back("roobar");

    // Learn some context so that we have data to create non-empty
    // predictions
    //
    plugin->learn(change);

    // Alternatively, plugin could have learnt thus...
    // *stream << "foo bar foobar foz baz fozbaz roo rar roobar ";
    // ct->update();

    {
        Prediction actual = plugin->predict(SIZE, 0);
        CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(9), actual.size());
    }

    {
        const char* filters[] = {"f", 0};
        Prediction actual = plugin->predict(SIZE, filters);
        CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(4), actual.size());
    }

    {
        const char* filters[] = {"b", 0};
        Prediction actual = plugin->predict(SIZE, filters);
        CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(2), actual.size());
    }

    {
        const char* filters[] = {"r", 0};
        Prediction actual = plugin->predict(SIZE, filters);
        CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(3), actual.size());
    }

    {
        const char* filters[] = {"f", "b", 0};
        Prediction actual = plugin->predict(SIZE, filters);
        CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(6), actual.size());
    }

    {
        const char* filters[] = {"f", "r", 0};
        Prediction actual = plugin->predict(SIZE, filters);
        CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(7), actual.size());
    }

    {
        const char* filters[] = {"f", "b", "r", 0};
        Prediction actual = plugin->predict(SIZE, filters);
        CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(9), actual.size());
    }

    {
        const char* filters[] = {"fo", 0};
        Prediction actual = plugin->predict(SIZE, filters);
        CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(4), actual.size());
    }

    {
        const char* filters[] = {"foo", 0};
        Prediction actual = plugin->predict(SIZE, filters);
        CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(2), actual.size());
    }

    {
        const char* filters[] = {"fo", "ba", 0};
        Prediction actual = plugin->predict(SIZE, filters);
        CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(6), actual.size());
    }

    {
        const char* filters[] = {"fo", "ba", "ro", 0};
        Prediction actual = plugin->predict(SIZE, filters);
        CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(8), actual.size());
    }

    {
        const char* filters[] = {"foo", "bar", 0};
        Prediction actual = plugin->predict(SIZE, filters);
        CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(3), actual.size());
    }

    {
        const char* filters[] = {"foobar", "fozba", "roo", 0};
        Prediction actual = plugin->predict(SIZE, filters);
        CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(4), actual.size());
    }

    {
        const char* filters[] = {"foobar", "fozbaz", "roobar", 0};
        Prediction actual = plugin->predict(SIZE, filters);
        CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(3), actual.size());
    }
}
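// The filter argument exercised above is a null-terminated array of prefix
// strings: a suggestion survives filtering when its word starts with at least
// one of the given prefixes (e.g. {"f", "b", 0} keeps the four "f..." words
// plus "bar" and "baz", matching the expected count of 6). A minimal sketch
// of a predicate with these semantics, modelled on the token_satisfies_filter
// call in RecencyPredictor::predict(); the real helper's name, signature and
// behaviour may differ:

#include <cstring>
#include <string>

static bool token_matches_filter(const std::string& token, const char** filter)
{
    if (filter == 0) {
        return true; // no filter: accept every token
    }
    for (size_t i = 0; filter[i] != 0; i++) {
        if (token.compare(0, std::strlen(filter[i]), filter[i]) == 0) {
            return true; // token starts with this filter prefix
        }
    }
    return false;
}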
Prediction SmoothedCountPlugin::predict(const size_t max_partial_predictions_size, const char** filter) const
{
    // get w_2, w_1, and prefix from HistoryTracker object
    std::string prefix = strtolower( contextTracker->getPrefix() );
    std::string word_1 = strtolower( contextTracker->getToken(1) );
    std::string word_2 = strtolower( contextTracker->getToken(2) );

    std::string query;  // string used to build sql query
    int result;         // database interrogation diagnostic
    CallbackData data;  // data to pass through to callback function

    // get most likely unigrams whose word starts with prefix
    Prediction predUnigrams;
    data.predPtr = &predUnigrams;
    data.predSize = MAX_PARTIAL_PREDICTION_SIZE;

    query =
        "SELECT word, count "
        "FROM _1_gram "
        "WHERE word LIKE \"" + prefix + "%\" "
        "ORDER BY count DESC;";

#if defined(HAVE_SQLITE3_H)
    result = sqlite3_exec(
#elif defined(HAVE_SQLITE_H)
    result = sqlite_exec(
#endif
        db,
        query.c_str(),
        buildPrediction,
        &data,
        NULL
    );
    assert(result == SQLITE_OK);

    // get most likely bigrams with matching w_1 whose word starts with prefix
    Prediction predBigrams;
    data.predPtr = &predBigrams;

    // note: "%" appended to prefix so that LIKE performs a prefix match,
    // consistent with the unigram query above
    query =
        "SELECT word, count "
        "FROM _2_gram "
        "WHERE word_1 = \"" + word_1 + "\" "
        "AND word LIKE \"" + prefix + "%\" "
        "ORDER BY count DESC;";

#if defined(HAVE_SQLITE3_H)
    result = sqlite3_exec(
#elif defined(HAVE_SQLITE_H)
    result = sqlite_exec(
#endif
        db,
        query.c_str(),
        buildPrediction,
        &data,
        NULL
    );
    assert(result == SQLITE_OK);

    // get most likely trigrams with matching w_2, w_1 whose word starts with prefix
    Prediction predTrigrams;
    data.predPtr = &predTrigrams;

    query =
        "SELECT word, count "
        "FROM _3_gram "
        "WHERE word_2 = \"" + word_2 + "\" "
        "AND word_1 = \"" + word_1 + "\" "
        "AND word LIKE \"" + prefix + "%\" "
        "ORDER BY count DESC;";

#if defined(HAVE_SQLITE3_H)
    result = sqlite3_exec(
#elif defined(HAVE_SQLITE_H)
    result = sqlite_exec(
#endif
        db,
        query.c_str(),
        buildPrediction,
        &data,
        NULL
    );
    assert(result == SQLITE_OK);

    Prediction p;      // combined result of uni/bi/tri gram predictions
    std::string word;  // pivot unigram word (used in next for loop)
    double ccount;     // combined count

    // compute smoothed probability estimation

    // TODO !!!!!!!! Everything should be scaled down to probabilities!!!
    // TODO That means that counts should be scaled down to values between
    // TODO 0 and 1. We need total word count to do that.

    // TODO: once the matching word has been found in the inner loops,
    // TODO: execution can break out of them.
    for (size_t i = 0; i < predUnigrams.size(); i++) {
        word   = predUnigrams.getSuggestion( i ).getWord();
        ccount = unigram_weight * predUnigrams.getSuggestion( i ).getProbability();

        for (size_t j = 0; j < predBigrams.size(); j++) {
            if( predBigrams.getSuggestion(j).getWord() == word ) {
                for (size_t k = 0; k < predTrigrams.size(); k++ ) {
                    if( predTrigrams.getSuggestion(k).getWord() == word ) {
                        ccount += trigram_weight * predTrigrams.getSuggestion(k).getProbability();
                    }
                }
                ccount += bigram_weight * predBigrams.getSuggestion(j).getProbability();
            }
        }
        p.addSuggestion( Suggestion( word, ccount ) );
    }

    return p; // return combined prediction
}
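// The nested loops above compute, for each candidate word w, a linearly
// interpolated score over the three n-gram tables:
//
//   score(w) = unigram_weight * c_1(w)
//            + bigram_weight  * c_2(w_1, w)
//            + trigram_weight * c_3(w_2, w_1, w)
//
// Worked example under assumed weights (0.25, 0.25, 0.5 are illustrative,
// not the plugin's configured values): with c_1 = 0.10, c_2 = 0.20 and
// c_3 = 0.40, score = 0.25*0.10 + 0.25*0.20 + 0.5*0.40 = 0.275.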