/** SQLite callback function.
 *
 *  Builds a prediction from the query results.
 */
int buildPrediction( void* callbackDataPtr,
		     int argc,
		     char** argv,
		     char** column )
{
	// cast pointer to void back to pointer to CallbackData object
	CallbackData* dataPtr = static_cast<CallbackData*>(callbackDataPtr);

	Prediction* predictionPtr = dataPtr->predPtr;
	size_t maxPredictionSize = dataPtr->predSize;

	// stop the query as soon as the prediction has reached its
	// maximum size (>= ensures no more than maxPredictionSize
	// suggestions are collected)
	if (predictionPtr->size() >= maxPredictionSize) {
		return 1;
	} else {

		// expect exactly two columns, named "word" and "count"
		if( argc == 2 &&
		    strcmp( "word", column[ 0 ] ) == 0 &&
		    strcmp( "count", column[ 1 ] ) == 0 ) {

			predictionPtr->addSuggestion(
				Suggestion( argv[ 0 ],            // word
					    atof( argv[ 1 ] ) )   // count
				);
			
		} else {
			std::cerr << "Invalid invocation of buildPrediction callback!"
				  << std::endl;
			exit( 1 );
		}
	}
	return 0;
}
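/* For context: the callback above assumes a small CallbackData struct
 * that bundles the prediction being built with its size cap. A minimal
 * sketch of what it plausibly looks like (the definition itself is not
 * shown in this excerpt; only the field names predPtr and predSize are
 * taken from the usage above):
 */
struct CallbackData {
	Prediction* predPtr;   // prediction being built from query rows
	size_t      predSize;  // maximum number of suggestions to collect
};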
void SelectorTest::testSelect(TestDataSuite* tds)
{
    while (tds->hasMoreTestData()) {
	std::cerr << "Updating strstream: " << strstream->str() << '|' << std::endl
		  << " with: " << tds->getUpdateString() << '|' << std::endl;
	*strstream << tds->getUpdateString();
	std::vector<std::string> selectedTokens;
	selectedTokens = selector->select(tds->getInputPrediction());

	Prediction expected = tds->getOutputPrediction();
	CPPUNIT_ASSERT_EQUAL( (size_t)expected.size(), selectedTokens.size() );

	std::vector<std::string>::const_iterator actual_it = selectedTokens.begin();
	int index = 0;
	while (actual_it != selectedTokens.end()) {
	    std::cerr << "[expected] " << expected.getSuggestion(index).getWord()
		      << "  [actual] " << *actual_it << std::endl;
	    CPPUNIT_ASSERT_EQUAL(expected.getSuggestion(index).getWord(),
				 *actual_it);
	    
	    index++;
	    actual_it++;
	}

	contextTracker->update();
	tds->nextTestData();
    }
}
Prediction RecencyPredictor::predict (const size_t max, const char** filter) const
{
    Prediction result;

    std::string prefix = contextTracker->getPrefix();
    logger << INFO << "prefix: " << prefix << endl;
    if (!prefix.empty()) {
        // Only build a recency prediction if the prefix is not empty:
        // when the prefix is empty, all previously seen tokens are
        // candidates for prediction. This is not desirable, because it
        // means that recency prediction reduces to a repetition of the
        // max most recent tokens (i.e. the prediction would contain
        // the most recent tokens in reverse order).
        //
        Suggestion  suggestion;
        size_t      index = 1;
        std::string token = contextTracker->getToken(index);
	double      prob = 0;
        while (!token.empty()                // context history exhausted
	       && result.size() < max        // need only max suggestions
	       && index <= cutoff_threshold  // look back only as far as cutoff
	    ) {
	    logger << INFO << "token: " << token << endl;

            if (token.find(prefix) == 0) { // if token starts with prefix

		if (token_satisfies_filter (token, prefix, filter)) {
		    // compute probability according to exponential decay
		    // formula
		    //
		    prob = n_0 * exp(-(lambda * (index - 1)));
		    logger << INFO << "probability: " << prob << endl;
		    suggestion.setWord(token);
		    suggestion.setProbability(prob);
		    result.addSuggestion(suggestion);
		}

            }

            index++;
            token = contextTracker->getToken(index);
        }
    }

    return result;
}
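/* A standalone sketch of the exponential decay used above, with
 * illustrative parameter values (n_0 and lambda are members of the
 * predictor and presumably configurable; the 1.0 and 0.5 used here are
 * assumed for demonstration only):
 */
#include <cmath>
#include <iostream>

int main()
{
    const double n_0    = 1.0;  // assumed initial probability
    const double lambda = 0.5;  // assumed decay rate

    // index 1 (the most recent token) gets probability n_0; each step
    // further back in the history is scaled by a factor of exp(-lambda)
    for (int index = 1; index <= 4; index++) {
        double prob = n_0 * std::exp(-(lambda * (index - 1)));
        std::cout << "index " << index << " -> probability " << prob << std::endl;
    }
    return 0;
}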
bool Prediction::operator== (const Prediction& right) const
{
    // same instance is obviously equal to itself
    if (&right == this) {
	return true;
    } else {
	if (size() != right.size()) {
	    return false;
	} else {
	    // need to compare each suggestion
	    bool result = true;
	    size_t i = 0;
	    while (i < size() && result) {
		if (getSuggestion(i) != right.getSuggestion(i)) {
		    result = false;
		}
		i++;
	    }
	    return result;
	}
    }
}
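/* The loop above compares Suggestion objects with their operator!=.
 * For completeness, a Prediction::operator!= would conventionally be
 * derived from operator== as below; this is a sketch, assuming it is
 * not already defined elsewhere in the class:
 */
bool Prediction::operator!= (const Prediction& right) const
{
    return !(*this == right);
}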
void PredictorRegistryTest::testNext()
{
    ContextTracker* pointer = static_cast<ContextTracker*>((void*)0xdeadbeef);
    registry->setContextTracker(pointer);

    PredictorRegistry::Iterator it = registry->iterator();
    Predictor* predictor = 0;

    while (it.hasNext()) {
	predictor = it.next();
    }

    // since we've iterated to the end of the predictors list, predictor
    // now points to the DummyPredictor, so let's check that we get the
    // dummy prediction back; assert the pointer is valid before
    // dereferencing it
    CPPUNIT_ASSERT(predictor != 0);
    Prediction prediction = predictor->predict(20, 0);

    size_t expected_size = 18;
    CPPUNIT_ASSERT_EQUAL(expected_size, prediction.size());
    CPPUNIT_ASSERT_EQUAL(Suggestion("foo1", 0.99), prediction.getSuggestion(0));
    CPPUNIT_ASSERT_EQUAL(Suggestion("foobar6", 0.74), prediction.getSuggestion(17));
}
void NewSmoothedNgramPluginTest::testLearning()
{
    // get pointer to plugin
    Plugin* plugin = pluginRegistry->iterator().next();

    {
	*stream << "f";
	ct->update();
	Prediction actual = plugin->predict(SIZE, 0);
	CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(0), actual.size());
    }

    {
	*stream << "o";
	ct->update();
	Prediction actual = plugin->predict(SIZE, 0);
	CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(0), actual.size());
    }

    {
	*stream << "o ";
	ct->update();
	Prediction actual = plugin->predict(SIZE, 0);
	CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(1), actual.size());
	CPPUNIT_ASSERT_EQUAL(std::string("foo"), actual.getSuggestion(0).getWord());
	ct->update();
    }

    {
	*stream << "bar";
	ct->update();
	Prediction actual = plugin->predict(SIZE, 0);
    }

    {
	*stream << " ";
	ct->update();
	Prediction actual = plugin->predict(SIZE, 0);
	CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(2), actual.size());
	CPPUNIT_ASSERT_EQUAL(std::string("foo"), actual.getSuggestion(0).getWord());
	CPPUNIT_ASSERT_EQUAL(std::string("bar"), actual.getSuggestion(1).getWord());
    }

    {
	*stream << "foobar ";
	ct->update();
	Prediction actual = plugin->predict(SIZE, 0);
	CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(3), actual.size());
	CPPUNIT_ASSERT_EQUAL(std::string("foobar"), actual.getSuggestion(0).getWord());
	CPPUNIT_ASSERT_EQUAL(std::string("foo"), actual.getSuggestion(1).getWord());
	CPPUNIT_ASSERT_EQUAL(std::string("bar"), actual.getSuggestion(2).getWord());
    }

    {
	*stream << "f";
	ct->update();
	Prediction actual = plugin->predict(SIZE, 0);
	CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(2), actual.size());
	CPPUNIT_ASSERT_EQUAL(std::string("foobar"), actual.getSuggestion(0).getWord());
	CPPUNIT_ASSERT_EQUAL(std::string("foo"), actual.getSuggestion(1).getWord());
    }
}
void NewSmoothedNgramPluginTest::testFilter()
{
    // get pointer to plugin
    Plugin* plugin = pluginRegistry->iterator().next();

    std::vector<std::string> change;
    change.push_back("foo");
    change.push_back("bar");
    change.push_back("foobar");
    change.push_back("foz");
    change.push_back("baz");
    change.push_back("fozbaz");
    change.push_back("roo");
    change.push_back("rar");
    change.push_back("roobar");

    // Learn some context so that we have data to create non-empty
    // predictions
    // 
    plugin->learn(change);

    // Alternatively, the plugin could have learnt the same tokens thus...
    //    *stream << "foo bar foobar foz baz fozbaz roo rar roobar ";
    //    ct->update();

    {
	Prediction actual = plugin->predict(SIZE, 0);
	CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(9), actual.size());
    }

    {
	const char* filters[] = {"f", 0};
	Prediction actual = plugin->predict(SIZE, filters);
	CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(4), actual.size());
    }

    {
	const char* filters[] = {"b", 0};
	Prediction actual = plugin->predict(SIZE, filters);
	CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(2), actual.size());
    }

    {
	const char* filters[] = {"r", 0};
	Prediction actual = plugin->predict(SIZE, filters);
	CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(3), actual.size());
    }

    {
	const char* filters[] = {"f", "b", 0};
	Prediction actual = plugin->predict(SIZE, filters);
	CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(6), actual.size());
    }

    {
	const char* filters[] = {"f", "r", 0};
	Prediction actual = plugin->predict(SIZE, filters);
	CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(7), actual.size());
    }

    {
	const char* filters[] = {"f", "b", "r", 0};
	Prediction actual = plugin->predict(SIZE, filters);
	CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(9), actual.size());
    }

    {
	const char* filters[] = {"fo", 0};
	Prediction actual = plugin->predict(SIZE, filters);
	CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(4), actual.size());
    }

    {
	const char* filters[] = {"foo", 0};
	Prediction actual = plugin->predict(SIZE, filters);
	CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(2), actual.size());
    }

    {
	const char* filters[] = {"fo", "ba", 0};
	Prediction actual = plugin->predict(SIZE, filters);
	CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(6), actual.size());
    }

    {
	const char* filters[] = {"fo", "ba", "ro", 0};
	Prediction actual = plugin->predict(SIZE, filters);
	CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(8), actual.size());
    }

    {
	const char* filters[] = {"foo", "bar", 0};
	Prediction actual = plugin->predict(SIZE, filters);
	CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(3), actual.size());
    }

    {
	const char* filters[] = {"foobar", "fozba", "roo", 0};
	Prediction actual = plugin->predict(SIZE, filters);
	CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(4), actual.size());
    }

    {
	const char* filters[] = {"foobar", "fozbaz", "roobar", 0};
	Prediction actual = plugin->predict(SIZE, filters);
	CPPUNIT_ASSERT_EQUAL(static_cast<size_t>(3), actual.size());
    }

}
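/* The filter semantics exercised by the test above (a token passes the
 * filter when it starts with at least one of the filter prefixes, and a
 * null filter accepts every token) could be expressed by a helper along
 * these lines. This is a sketch of the assumed behaviour, not the
 * plugin's actual implementation:
 */
#include <cstring>

static bool matches_any_filter(const char* token, const char** filter)
{
    if (filter == 0) {
	return true;  // a null filter accepts every token
    }
    for (int i = 0; filter[i] != 0; i++) {
	// token passes if it begins with this filter prefix
	if (strncmp(token, filter[i], strlen(filter[i])) == 0) {
	    return true;
	}
    }
    return false;
}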
Prediction SmoothedCountPlugin::predict(const size_t max_partial_predictions_size, const char** filter) const
{
    // get w_2, w_1, and prefix from the ContextTracker object
    std::string prefix = strtolower( contextTracker->getPrefix() );
    std::string word_1 = strtolower( contextTracker->getToken(1) );
    std::string word_2 = strtolower( contextTracker->getToken(2) );
    
    std::string query; // string used to build sql query
    int result;        // database interrogation diagnostic
    CallbackData data; // data to pass through to callback function

    // get most likely unigrams whose word starts with the prefix
    Prediction predUnigrams;
    
    data.predPtr = &predUnigrams;
    data.predSize = MAX_PARTIAL_PREDICTION_SIZE;
    
    query = 
	"SELECT word, count "
	"FROM _1_gram "
	"WHERE word LIKE \"" + prefix + "%\" "
	"ORDER BY count DESC;";
    
#if defined(HAVE_SQLITE3_H)
    result = sqlite3_exec(
#elif defined(HAVE_SQLITE_H)
    result = sqlite_exec(
#endif
	db,
	query.c_str(),
	buildPrediction,
	&data,
	NULL
    );
    assert(result == SQLITE_OK);


    // get most likely bigrams with matching w_1 whose word starts with the prefix
    Prediction predBigrams;
    
    data.predPtr = &predBigrams;
    
    query =
	"SELECT word, count "
	"FROM _2_gram "
	"WHERE word_1 = \"" + word_1 + "\" "
	"AND word LIKE \"" + prefix + "%\" "
	"ORDER BY count DESC;";
    
#if defined(HAVE_SQLITE3_H)
    result = sqlite3_exec(
#elif defined(HAVE_SQLITE_H)
    result = sqlite_exec(
#endif
	db,
	query.c_str(),
	buildPrediction,
	&data,
	NULL
    );
    assert(result == SQLITE_OK);


    // get most likely trigrams with matching w_2 and w_1 whose word starts with the prefix
    Prediction predTrigrams;
    
    data.predPtr = &predTrigrams;
    
    query =
	"SELECT word, count "
	"FROM _3_gram "
	"WHERE word_2 = \"" + word_2 + "\" "
	"AND word_1 = \"" + word_1 + "\" "
	"AND word LIKE \"" + prefix + "%\" "
	"ORDER BY count DESC;";
    
#if defined(HAVE_SQLITE3_H)
    result = sqlite3_exec(
#elif defined(HAVE_SQLITE_H)
    result = sqlite_exec(
#endif
	db,
	query.c_str(),
	buildPrediction,
	&data,
	NULL
    );
    assert(result == SQLITE_OK);
	
    
    Prediction p;     // combined result of uni/bi/tri gram predictions
    std::string word; // pivot unigram word (used in next for loop)
    double ccount;    // combined count
    
    // compute smoothed probability estimation
    
    // TODO: everything should be scaled down to probabilities, i.e.
    // TODO: counts should be scaled down to values between 0 and 1.
    // TODO: we need the total word count to do that.

    // TODO: once the matching word has been found in the inner loops,
    // TODO: execution can break out of them.
    for (size_t i = 0; i < predUnigrams.size(); i++) {

	word   = predUnigrams.getSuggestion( i ).getWord();
	ccount = unigram_weight *
	    predUnigrams.getSuggestion( i ).getProbability();
	
	for (size_t j = 0; j < predBigrams.size(); j++) {

	    if( predBigrams.getSuggestion(j).getWord() == word ) {
		
		for (size_t k = 0; k < predTrigrams.size(); k++ ) {
		    
		    if( predTrigrams.getSuggestion(k).getWord() == word ) {
			
			ccount += trigram_weight *
			    predTrigrams.getSuggestion(k).getProbability();
			
		    }
		}
		
		ccount += bigram_weight *
		    predBigrams.getSuggestion(j).getProbability();
		
	    }
	    
	}
	
	p.addSuggestion( Suggestion( word, ccount ) );
	
    }

    return p; // Return combined prediction
}
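/* A worked example of the interpolation above, using assumed weights
 * (the actual unigram_weight, bigram_weight and trigram_weight values
 * come from the plugin's configuration; 0.2, 0.3 and 0.5 here are
 * illustrative only). For a word with unigram probability 0.10, bigram
 * probability 0.25 and trigram probability 0.40:
 *
 *     ccount = 0.2 * 0.10 + 0.3 * 0.25 + 0.5 * 0.40
 *            = 0.02 + 0.075 + 0.20
 *            = 0.295
 */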