// ** Films::similarTo SimilarFilmsArray Films::similarTo( const std::string& oid, int count ) const { SimilarFilmsArray result; OID objectId = OID( oid ); CollectionPtr similars = m_target->collection( "similar" ); CursorPtr cursor = similars->find( QUERY( "$or" << ARRAY( DOCUMENT( "first" << oid ) << DOCUMENT( "second" << objectId ) ) ) ); DocumentPtr document; while( (document = cursor->next()) ) { // ** Read data from document OID first = document->objectId( "first" ); OID second = document->objectId( "second" ); int value = document->integer( "value" ); // ** Decode similarity & accuracy float similarity, accuracy; decodeSimilarity( value, similarity, accuracy ); // ** Push similar film Film film = filmById( first == objectId ? second : first ); Quality quality = qualityFromRange( similarity, m_similarityQuartiles ); result.push_back( SimilarFilm( film, similarity, accuracy, quality ) ); } std::sort( result.begin(), result.end(), SimilarFilm::sortBySimilarity ); if( count ) { result.resize( std::min( count, ( int )result.size() ) ); } return result; }
// ** Films::processFilms void Films::processFilms( float sharedThreshold ) { CursorPtr films = m_source->collection( "items" )->find(); CollectionPtr processed = m_target->collection( "items" ); CollectionPtr similar = m_target->collection( "similar" ); IRecommenderItems* items = new StreamedFilms( *this, m_target ); SimilarityPtr similarity = SimilarityPtr( new JaccardAccuracySimilarity( NumericFeatures::pearson, 0.0f, 0.0f, sharedThreshold ) ); Recommender recommender = Recommender( this, similarity.get() ); DocumentPtr document; int counter = 0; int total = m_source->collection( "items" )->count(); while( (document = films->next()) ) { // ** Read the item id int itemId = document->integer( "itemId" ); // ** Print progress if( (++counter % 10) == 0 ) printf( "Processing films [%d/%d]\n", counter, total ); // ** Skip processed films if( processed->findOne( QUERY( "itemId" << itemId ) ) != NULL ) { continue; } // ** Load first item items->first(); // ** Compute and store similarities RecommenderItem item = findById( itemId ); SimilarItems pairs = recommender.findSimilarItems( items, &item ); OID oid = filmIdToObjectId( itemId ); for( SimilarItems::const_iterator i = pairs.begin(), end = pairs.end(); i != end; ++i ) { similar->insert( DOCUMENT( "first" << oid << "second" << filmIdToObjectId( i->m_item ) << "value" << ( int )encodeSimilarity( i->m_similarity, i->m_shared ) ) ); } // ** Store processed film processed->insert( DOCUMENT( "itemId" << itemId ) ); } // ** Build indices similar->ensureIndex( "idxFirst", DOCUMENT( "first" << 1 ) ); similar->ensureIndex( "idxSecond", DOCUMENT( "second" << 1 ) ); // ** Delete iterator delete items; }
// ** Films::objectIdToFilmId int Films::objectIdToFilmId( const OID& oid ) const { int result = -1; CollectionPtr items = m_source->collection( "items" ); DocumentPtr item = items->findOne( QUERY( "_id" << oid ) ); assert( item != NULL ); if( item != NULL ) { result = item->integer( "itemId" ); } return result; }
// ** PreloadedFilms::PreloadedFilms PreloadedFilms::PreloadedFilms( const Films& films, const CursorPtr& cursor ) { do { DocumentPtr document = cursor->next(); if( document == NULL ) { break; } RecommenderItem* item = new RecommenderItem; item->m_userData = new OID( *document->_id().value() ); item->m_id = document->integer( "itemId" ); item->m_name = document->string( "name.ru" ); item->m_featureSpaces["votes"] = films.votesForFilm( item->m_id ); m_items[item->m_id] = item; } while( true ); }
// ** Films::updateVotesCount void Films::updateVotesCount( void ) { CollectionPtr items = m_source->collection( "items" ); CollectionPtr votes = m_source->collection( "votes" ); CursorPtr cursor = items->find(); DocumentPtr document; IntegerSamples samples; int progress = 0; int total = items->count(); while( (document = cursor->next()) ) { int itemId = document->integer( "itemId" ); int count = votes->count( QUERY( "itemId" << itemId ) ); items->update( QUERY( "itemId" << itemId ), DOCUMENT( "$set" << DOCUMENT( "votesCount" << count ) ) ); printf( "Updating votes count [%d/%d]...\n", ++progress, total ); } }
// ** Films::showStats void Films::showStats( void ) const { CollectionPtr items = m_source->collection( "items" ); CursorPtr cursor = items->find(); DocumentPtr document; IntegerSamples countSamples; while( (document = cursor->next()) ) { countSamples += document->integer( "votesCount" ); } IntegerArray quartiles = countSamples.quartiles(); printf( "Votes: min %d, max %d, average amount %d, medean %d, quartiles %d/%d/%d\n", countSamples.min(), countSamples.max(), countSamples.mean(), countSamples.median(), quartiles[0], quartiles[1], quartiles[2] ); for( int i = 2; i <= 7; i++ ) { int amount = pow( 10, i ); printf( "%d items has greater than %d votes\n", countSamples.greaterCount( amount ), amount ); } }
// ** Films::votesForFilm NumericFeatures Films::votesForFilm( int filmId ) const { NumericFeatures result; CollectionPtr votes = m_source->collection( "votes" ); CursorPtr cursor = votes->find( QUERY( "itemId" << filmId ) ); DocumentPtr document; while( (document = cursor->next()) ) { printf( "get\n" ); result.set( document->integer( "userId" ), document->number( "rating" ) ); printf( "v %d\n", result.size() ); // sleep(1); printf( "next\n" ); } printf( "done! %d\n", result.size() ); return result; }
// ** Films::updateSharedAndSimilarityRanges void Films::updateSharedAndSimilarityRanges( void ) { CollectionPtr similar = m_target->collection( "similar" ); CursorPtr cursor = similar->find(); int count = similar->count(); int progress = 0; DocumentPtr document; FloatSamples similaritySamples, sharedSamples; while( (document = cursor->next()) ) { float similarity, shared; decodeSimilarity( document->integer( "value" ), similarity, shared ); similaritySamples += similarity; sharedSamples += shared; if( (++progress % 1000) == 0 ) printf( "Updating ranges [%d/%d]\n", progress, count ); } m_target->collection( "info" )->upsert( QUERY( "type" << "similarity" ), DOCUMENT( "type" << "similarity" << "shared" << sharedSamples.quartiles() << "similarity" << similaritySamples.quartiles() ) ); }
// ** StreamedFilms::documentToItem void StreamedFilms::documentToItem( RecommenderItem& item, const DocumentPtr& document ) const { item.m_id = document->integer( "itemId" ); item.m_featureSpaces["votes"] = m_films.votesForFilm( item.m_id ); }
// ** Films::filmFromDocument Film Films::filmFromDocument( const DocumentPtr& document ) const { Film film( document->_id(), document->string( "name.ru" ), document->integerSet( "genres" ), document->integer( "year" ) ); film.m_video = document->string( "video" ); return film; }