inline force_inline // Forced inlining is essential so the optimizer can do its work.
double NeighborSearchRules<SortPolicy, MetricType, TreeType>::
BaseCase(const size_t queryIndex, const size_t referenceIndex)
{
  // Evaluates the metric between one query point and one reference point and
  // records the reference point as a neighbor candidate if SortPolicy accepts
  // it.  Returns the evaluated distance (or 0.0 / the cached value for the two
  // shortcut cases below).

  // When both sets are the very same object, a point must not be reported as
  // its own neighbor.
  const bool sameDataset = (&querySet == &referenceSet);
  if (sameDataset && (queryIndex == referenceIndex))
    return 0.0;

  // The most recently evaluated pair is remembered; reuse that result rather
  // than evaluating the metric a second time.
  const bool isCachedPair = (lastQueryIndex == queryIndex) &&
      (lastReferenceIndex == referenceIndex);
  if (isCachedPair)
    return lastBaseCase;

  double pointDistance = metric.Evaluate(querySet.col(queryIndex),
                                         referenceSet.col(referenceIndex));
  ++baseCases;

  // Ask the sort policy where (if anywhere) this distance belongs among the
  // current candidates of the query point.
  arma::vec candidateDistances = distances.unsafe_col(queryIndex);
  arma::Col<size_t> candidateIndices = neighbors.unsafe_col(queryIndex);
  const size_t slot = SortPolicy::SortDistance(candidateDistances,
                                               candidateIndices,
                                               pointDistance);

  // A return value of (size_t() - 1) means "do not insert".
  const size_t rejected = size_t() - 1;
  if (slot != rejected)
    InsertNeighbor(queryIndex, slot, referenceIndex, pointDistance);

  // Remember this pair so an immediately repeated call can be short-circuited.
  lastQueryIndex = queryIndex;
  lastReferenceIndex = referenceIndex;
  lastBaseCase = pointDistance;

  return pointDistance;
}
inline force_inline
double LSHSearch<SortPolicy>::BaseCase(arma::mat& distances,
                                       arma::Mat<size_t>& neighbors,
                                       const size_t queryIndex,
                                       const size_t referenceIndex)
{
  // Computes the Euclidean distance between one query point and one reference
  // point and, if SortPolicy accepts it, stores the reference point in the
  // caller-supplied result matrices.  Returns the computed distance (0.0 when
  // the pair is skipped as a self-match).

  // With a single shared dataset, a point is never its own neighbor.
  const bool sameDataset = (&querySet == &referenceSet);
  if (sameDataset && (queryIndex == referenceIndex))
    return 0.0;

  const double pointDistance = metric::EuclideanDistance::Evaluate(
      querySet.unsafe_col(queryIndex),
      referenceSet.unsafe_col(referenceIndex));

  // The sort policy tells us which slot (if any) this distance should occupy
  // among the query point's current candidates.
  arma::vec candidateDistances = distances.unsafe_col(queryIndex);
  arma::Col<size_t> candidateIndices = neighbors.unsafe_col(queryIndex);
  size_t slot = SortPolicy::SortDistance(candidateDistances, candidateIndices,
                                         pointDistance);

  // (size_t() - 1) is the policy's "do not insert" marker.
  const size_t rejected = size_t() - 1;
  if (slot != rejected)
  {
    InsertNeighbor(distances, neighbors, queryIndex, slot, referenceIndex,
                   pointDistance);
  }

  return pointDistance;
}
inline force_inline
double FastMKSRules<KernelType, TreeType>::BaseCase(
    const size_t queryIndex,
    const size_t referenceIndex)
{
  // Evaluates the kernel between one query point and one reference point and,
  // when the value is at least as good as the current worst candidate, inserts
  // the reference point into the query point's candidate list.  Returns the
  // kernel value (possibly a cached one, see below).

  // Score() always happens before BaseCase() for a given node combination.  For
  // cover trees, the kernel evaluation between the two centroid points already
  // happened, so we don't need to do it again.  This whole branch optimizes out
  // when the trait is false (its value is known at compile time).
  if (tree::TreeTraits<TreeType>::FirstPointIsCentroid)
  {
    if ((queryIndex == lastQueryIndex) &&
        (referenceIndex == lastReferenceIndex))
      return lastKernel;

    // Remember which pair is being evaluated now.
    lastQueryIndex = queryIndex;
    lastReferenceIndex = referenceIndex;
  }

  ++baseCases;
  double kernelEval = kernel.Evaluate(querySet.unsafe_col(queryIndex),
                                      referenceSet.unsafe_col(referenceIndex));

  // Keep the cached kernel value in sync with the cached index pair.
  if (tree::TreeTraits<TreeType>::FirstPointIsCentroid)
    lastKernel = kernelEval;

  // If the reference and query sets are identical, we still need to compute the
  // base case (so that things can be bounded properly), but we won't add it to
  // the results.
  if ((&querySet == &referenceSet) && (queryIndex == referenceIndex))
    return kernelEval;

  // Reject anything that is not at least as good as the current worst
  // candidate.  This is deliberately written as !(a >= b) rather than (a < b):
  // the two agree for ordinary values, but a NaN kernel value fails every
  // comparison, and with (a < b) it would slip past this check, never match in
  // the scan below, and reach InsertNeighbor() with an out-of-range position.
  if (!(kernelEval >= products(products.n_rows - 1, queryIndex)))
    return kernelEval;

  // Find the first slot whose stored value does not exceed the new one.  The
  // check above guarantees the scan stops no later than the last row.
  size_t insertPosition = 0;
  for ( ; insertPosition < products.n_rows; ++insertPosition)
    if (kernelEval >= products(insertPosition, queryIndex))
      break;

  InsertNeighbor(queryIndex, insertPosition, referenceIndex, kernelEval);

  return kernelEval;
}
inline force_inline // Forced inlining is essential so the optimizer can do its work.
double NeighborSearchRules<SortPolicy, MetricType, TreeType>::
BaseCase(const size_t queryIndex, const size_t referenceIndex)
{
  // Evaluates the metric between one query point and one reference point and
  // records the reference point as a neighbor candidate if SortPolicy accepts
  // it.  Returns the evaluated distance (0.0 when the pair is skipped as a
  // self-match).

  // When both sets are the very same object, a point must not be reported as
  // its own neighbor.
  const bool sameDataset = (&querySet == &referenceSet);
  if (sameDataset && (queryIndex == referenceIndex))
    return 0.0;

  double pointDistance = metric.Evaluate(
      querySet.unsafe_col(queryIndex),
      referenceSet.unsafe_col(referenceIndex));

  // Ask the sort policy where (if anywhere) this distance belongs among the
  // current candidate distances of the query point.
  arma::vec candidateDistances = distances.unsafe_col(queryIndex);
  size_t slot = SortPolicy::SortDistance(candidateDistances, pointDistance);

  // A return value of (size_t() - 1) means "do not insert".
  const size_t rejected = size_t() - 1;
  if (slot != rejected)
  {
    InsertNeighbor(queryIndex, slot, referenceIndex, pointDistance);
  }

  return pointDistance;
}
// Produces, for each queried user, up to numRecs item indices ranked by the
// average estimated rating over that user's nearest neighbors.
//
//  - numRecs: number of recommendations to produce per user.
//  - recommendations: output; resized to numRecs x users.n_elem.  Slots that
//    could not be filled hold the sentinel value cleanedData.n_rows.
//  - users: indices of the users to generate recommendations for.
void CF<FactorizerType>::GetRecommendations(const size_t numRecs,
                                            arma::Mat<size_t>& recommendations,
                                            arma::Col<size_t>& users)
{
  // Generate new table by multiplying approximate values.
  rating = w * h;

  // Now, we will use the decomposed w and h matrices to estimate what the user
  // would have rated items as, and then pick the best items.

  // Temporarily store feature vector of queried users.
  arma::mat query(rating.n_rows, users.n_elem);

  // Select feature vectors of queried users.
  for (size_t i = 0; i < users.n_elem; i++)
    query.col(i) = rating.col(users(i));

  // Temporary storage for neighborhood of the queried users.
  arma::Mat<size_t> neighborhood;

  // Calculate the neighborhood of the queried users.
  // This should be a templatized option.
  neighbor::AllkNN a(rating);
  arma::mat resultingDistances; // Temporary storage.
  a.Search(query, numUsersForSimilarity, neighborhood, resultingDistances);

  // Temporary storage for storing the average rating for each user in their
  // neighborhood.
  arma::mat averages = arma::zeros<arma::mat>(rating.n_rows, query.n_cols);

  // Iterate over each query user.
  for (size_t i = 0; i < neighborhood.n_cols; ++i)
  {
    // Iterate over each neighbor of the query user.
    for (size_t j = 0; j < neighborhood.n_rows; ++j)
      averages.col(i) += rating.col(neighborhood(j, i));

    // Normalize average.
    averages.col(i) /= neighborhood.n_rows;
  }

  // Sentinel marking a recommendation slot that was never filled.  It is not a
  // valid row index of cleanedData.  The same value is used both to
  // initialize the slots and to detect unfilled ones afterwards.
  const size_t invalidItem = cleanedData.n_rows;

  // Generate recommendations for each query user by finding the maximum numRecs
  // elements in the averages matrix.
  recommendations.set_size(numRecs, users.n_elem);
  recommendations.fill(invalidItem);

  // With zero requested recommendations there is no candidate list to fill,
  // and the (n_rows - 1) indexing below would wrap around.
  if (numRecs == 0)
    return;

  arma::mat values(numRecs, users.n_elem);
  values.fill(-DBL_MAX); // The smallest possible value.

  for (size_t i = 0; i < users.n_elem; i++)
  {
    // Look through the averages column corresponding to the current user.
    for (size_t j = 0; j < averages.n_rows; ++j)
    {
      // Ensure that the user hasn't already rated the item.
      if (cleanedData(j, users(i)) != 0.0)
        continue; // The user already rated the item.

      // Is the estimated value better than the worst candidate?
      const double value = averages(j, i);
      if (value > values(values.n_rows - 1, i))
      {
        // It should be inserted.  Which position?  Walk upwards from the last
        // slot while the new value beats the entry above it.
        size_t insertPosition = values.n_rows - 1;
        while (insertPosition > 0)
        {
          if (value <= values(insertPosition - 1, i))
            break; // The current value is the right one.
          insertPosition--;
        }

        // Now insert it into the list.
        InsertNeighbor(i, insertPosition, j, value, recommendations, values);
      }
    }

    // If we were not able to come up with enough recommendations, issue a
    // warning.  The last slot still holding the sentinel means the list was
    // never completely filled.
    if (recommendations(values.n_rows - 1, i) == invalidItem)
      Log::Warn << "Could not provide " << values.n_rows << " recommendations "
          << "for user " << users(i) << " (not enough un-rated items)!"
          << std::endl;
  }
}
void CF::GetRecommendations(const size_t numRecs, arma::Mat<size_t>& recommendations, arma::Col<size_t>& users) { // We want to avoid calculating the full rating matrix, so we will do nearest // neighbor search only on the H matrix, using the observation that if the // rating matrix X = W*H, then d(X.col(i), X.col(j)) = d(W H.col(i), W // H.col(j)). This can be seen as nearest neighbor search on the H matrix // with the Mahalanobis distance where M^{-1} = W^T W. So, we'll decompose // M^{-1} = L L^T (the Cholesky decomposition), and then multiply H by L^T. // Then we can perform nearest neighbor search. arma::mat l = arma::chol(w.t() * w); arma::mat stretchedH = l * h; // Due to the Armadillo API, l is L^T. // Now, we will use the decomposed w and h matrices to estimate what the user // would have rated items as, and then pick the best items. // Temporarily store feature vector of queried users. arma::mat query(stretchedH.n_rows, users.n_elem); // Select feature vectors of queried users. for (size_t i = 0; i < users.n_elem; i++) query.col(i) = stretchedH.col(users(i)); // Temporary storage for neighborhood of the queried users. arma::Mat<size_t> neighborhood; // Calculate the neighborhood of the queried users. // This should be a templatized option. neighbor::KNN a(stretchedH); arma::mat resultingDistances; // Temporary storage. a.Search(query, numUsersForSimilarity, neighborhood, resultingDistances); // Generate recommendations for each query user by finding the maximum numRecs // elements in the averages matrix. recommendations.set_size(numRecs, users.n_elem); recommendations.fill(cleanedData.n_rows); // Invalid item number. arma::mat values(numRecs, users.n_elem); values.fill(-DBL_MAX); // The smallest possible value. for (size_t i = 0; i < users.n_elem; i++) { // First, calculate average of neighborhood values. 
arma::vec averages; averages.zeros(cleanedData.n_rows); for (size_t j = 0; j < neighborhood.n_rows; ++j) averages += w * h.col(neighborhood(j, i)); averages /= neighborhood.n_rows; // Look through the averages column corresponding to the current user. for (size_t j = 0; j < averages.n_rows; ++j) { // Ensure that the user hasn't already rated the item. if (cleanedData(j, users(i)) != 0.0) continue; // The user already rated the item. // Is the estimated value better than the worst candidate? const double value = averages[j]; if (value > values(values.n_rows - 1, i)) { // It should be inserted. Which position? size_t insertPosition = values.n_rows - 1; while (insertPosition > 0) { if (value <= values(insertPosition - 1, i)) break; // The current value is the right one. insertPosition--; } // Now insert it into the list. InsertNeighbor(i, insertPosition, j, value, recommendations, values); } } // If we were not able to come up with enough recommendations, issue a // warning. if (recommendations(values.n_rows - 1, i) == cleanedData.n_rows + 1) Log::Warn << "Could not provide " << values.n_rows << " recommendations " << "for user " << users(i) << " (not enough un-rated items)!" << std::endl; } }
// Max-kernel search of querySet against the stored reference set.  The outputs
// `indices` and `kernels` are resized to k x querySet.n_cols.  Depending on the
// `naive` and `singleMode` flags this runs a brute-force scan, a single-tree
// traversal, or builds a query tree and delegates to the tree-based overload.
void FastMKS<KernelType, TreeType>::Search(
    const typename TreeType::Mat& querySet,
    const size_t k,
    arma::Mat<size_t>& indices,
    arma::mat& kernels)
{
  Timer::Start("computing_products");

  // The cover tree does not reorder points, so no index remapping is needed.
  indices.set_size(k, querySet.n_cols);
  kernels.set_size(k, querySet.n_cols);

  if (naive)
  {
    // Brute force: evaluate every (query, reference) pair.  Slow, but a useful
    // baseline.
    kernels.fill(-DBL_MAX);

    for (size_t queryPoint = 0; queryPoint < querySet.n_cols; ++queryPoint)
    {
      for (size_t refPoint = 0; refPoint < referenceSet.n_cols; ++refPoint)
      {
        const double kernelValue = metric.Kernel().Evaluate(
            querySet.col(queryPoint), referenceSet.col(refPoint));

        // Advance to the first slot whose stored value this one exceeds.
        size_t slot = 0;
        while ((slot < indices.n_rows) &&
               !(kernelValue > kernels(slot, queryPoint)))
          ++slot;

        // Running off the end means the value beats none of the candidates.
        if (slot < indices.n_rows)
          InsertNeighbor(indices, kernels, queryPoint, slot, refPoint,
                         kernelValue);
      }
    }

    Timer::Stop("computing_products");
    return;
  }

  if (singleMode)
  {
    // Single-tree search: traverse the reference tree once per query point.
    kernels.fill(-DBL_MAX);

    // The rules object holds references to the output matrices and collects
    // the results as the traversal proceeds.
    typedef FastMKSRules<KernelType, TreeType> RuleType;
    RuleType rules(referenceSet, querySet, indices, kernels, metric.Kernel());

    typename TreeType::template SingleTreeTraverser<RuleType> traverser(rules);

    for (size_t queryPoint = 0; queryPoint < querySet.n_cols; ++queryPoint)
      traverser.Traverse(queryPoint, *referenceTree);

    Log::Info << rules.BaseCases() << " base cases." << std::endl;
    Log::Info << rules.Scores() << " scores." << std::endl;

    Timer::Stop("computing_products");
    return;
  }

  // Dual-tree search.  The query tree has to be built first; tree construction
  // is timed separately from the product computation.
  Timer::Stop("computing_products");

  Timer::Start("tree_building");
  TreeType queryTree(querySet);
  Timer::Stop("tree_building");

  Search(&queryTree, k, indices, kernels);
}
// Max-kernel search of the reference set against itself.  The outputs `indices`
// and `kernels` are resized to k x referenceSet.n_cols.  Depending on the
// `naive` and `singleMode` flags this runs a brute-force scan, a single-tree
// traversal, or delegates to the tree-based overload using the reference tree.
void FastMKS<KernelType, TreeType>::Search(const size_t k,
                                           arma::Mat<size_t>& indices,
                                           arma::mat& kernels)
{
  // The cover tree does not reorder points, so no index remapping is needed.
  Timer::Start("computing_products");

  indices.set_size(k, referenceSet.n_cols);
  kernels.set_size(k, referenceSet.n_cols);
  kernels.fill(-DBL_MAX);

  if (naive)
  {
    // Brute force: evaluate every ordered pair of distinct reference points.
    // Slow, but a useful baseline.
    for (size_t queryPoint = 0; queryPoint < referenceSet.n_cols; ++queryPoint)
    {
      for (size_t refPoint = 0; refPoint < referenceSet.n_cols; ++refPoint)
      {
        // A point is never returned as its own candidate.
        if (queryPoint == refPoint)
          continue;

        const double kernelValue = metric.Kernel().Evaluate(
            referenceSet.col(queryPoint), referenceSet.col(refPoint));

        // Advance to the first slot whose stored value this one exceeds.
        size_t slot = 0;
        while ((slot < indices.n_rows) &&
               !(kernelValue > kernels(slot, queryPoint)))
          ++slot;

        // Running off the end means the value beats none of the candidates.
        if (slot < indices.n_rows)
          InsertNeighbor(indices, kernels, queryPoint, slot, refPoint,
                         kernelValue);
      }
    }

    Timer::Stop("computing_products");
    return;
  }

  if (singleMode)
  {
    // Single-tree search: traverse the reference tree once per point.  The
    // rules object holds references to the output matrices and collects the
    // results as the traversal proceeds.
    typedef FastMKSRules<KernelType, TreeType> RuleType;
    RuleType rules(referenceSet, referenceSet, indices, kernels,
                   metric.Kernel());

    typename TreeType::template SingleTreeTraverser<RuleType> traverser(rules);

    for (size_t point = 0; point < referenceSet.n_cols; ++point)
      traverser.Traverse(point, *referenceTree);

    // Report traversal statistics.
    const size_t numPrunes = traverser.NumPrunes();
    Log::Info << "Pruned " << numPrunes << " nodes." << std::endl;
    Log::Info << rules.BaseCases() << " base cases." << std::endl;
    Log::Info << rules.Scores() << " scores." << std::endl;

    Timer::Stop("computing_products");
    return;
  }

  // Dual-tree search: hand off to the tree-based overload with the reference
  // tree acting as the query tree.
  Timer::Stop("computing_products");

  Search(referenceTree, k, indices, kernels);
}