/**
 * Build the statistic for a tree node.
 *
 * Bounds, ownership and pruning state are set to their initial values, the
 * node's parent and children are recorded as its "true" parent and children,
 * and the centroid of the node's descendant points is computed.
 *
 * The centroid is assembled from the Stat().Centroid() of each child, so the
 * children's statistics must already hold valid centroids when this runs.
 *
 * @param node Tree node that this statistic belongs to.
 */
DualTreeKMeansStatistic(TreeType& node) :
    neighbor::NeighborSearchStat<neighbor::NearestNeighborSort>(),
    upperBound(DBL_MAX),
    lowerBound(DBL_MAX),
    owner(size_t(-1)),
    pruned(size_t(-1)),
    staticPruned(false),
    staticUpperBoundMovement(0.0),
    staticLowerBoundMovement(0.0),
    trueParent(node.Parent())
{
  centroid.zeros(node.Dataset().n_rows);

  // In trees with self-children (the cover tree), the first point of a node
  // that has children shows up again inside a child.  Start after it in that
  // case so that it is only counted once, through the child's centroid.
  const bool firstPointInChildren =
      tree::TreeTraits<TreeType>::HasSelfChildren && (node.NumChildren() > 0);
  for (size_t p = (firstPointInChildren ? 1 : 0); p < node.NumPoints(); ++p)
    centroid += node.Dataset().col(node.Point(p));

  // One pass over the children: accumulate each child's centroid weighted by
  // its descendant count, and remember the child as a true child.
  trueChildren.resize(node.NumChildren());
  for (size_t c = 0; c < node.NumChildren(); ++c)
  {
    centroid += node.Child(c).NumDescendants() *
        node.Child(c).Stat().Centroid();
    trueChildren[c] = &node.Child(c);
  }

  // Turn the accumulated sum into a mean.
  centroid /= node.NumDescendants();
}
/**
 * Single-tree score of a query point against a reference node.
 *
 * If the spread between the largest and smallest possible kernel value over
 * the node is within the error tolerance, the node's whole contribution is
 * added to densities(queryIndex) using one kernel evaluation, and DBL_MAX is
 * returned so that the branch is not explored.  Otherwise the minimum distance
 * between the point and the node is returned.
 *
 * The call also increments `scores` and records the node and the score in
 * `traversalInfo`.
 *
 * @param queryIndex Index of the query point in querySet.
 * @param referenceNode Reference node to score.
 * @return DBL_MAX if the node was approximated, the minimum distance otherwise.
 */
inline double KDERules<MetricType, KernelType, TreeType>::
Score(const size_t queryIndex, TreeType& referenceNode)
{
  const arma::vec& queryPoint = querySet.unsafe_col(queryIndex);
  const double minDistance = referenceNode.MinDistance(queryPoint);

  // Is this the same (query point, reference centroid) combination as the
  // one handled last time?
  const bool repeatedCombination =
      tree::TreeTraits<TreeType>::FirstPointIsCentroid &&
      lastQueryIndex == queryIndex &&
      traversalInfo.LastReferenceNode() != NULL &&
      traversalInfo.LastReferenceNode()->Point(0) == referenceNode.Point(0);

  bool withinTolerance = false;
  if (repeatedCombination)
  {
    // Nothing new to compute; just refresh the cached indices.
    lastQueryIndex = queryIndex;
    lastReferenceIndex = referenceNode.Point(0);
  }
  else
  {
    // Kernel values at the nearest and furthest possible distances bound the
    // error made by treating the whole node as a single point.
    const double maxKernel = kernel.Evaluate(minDistance);
    const double minKernel =
        kernel.Evaluate(referenceNode.MaxDistance(queryPoint));
    const double bound = maxKernel - minKernel;

    withinTolerance =
        bound <= (absError + relError * minKernel) / referenceSet.n_cols;
  }

  // Unless the node gets approximated below, it is scored by distance.
  double score = minDistance;

  if (withinTolerance)
  {
    double kernelValue;
    if (tree::TreeTraits<TreeType>::FirstPointIsCentroid)
    {
      // The node's first point is its centroid.
      kernelValue = EvaluateKernel(queryIndex, referenceNode.Point(0));
    }
    else
    {
      // Use the centroid stored in the node's statistic.
      kde::KDEStat& referenceStat = referenceNode.Stat();
      kernelValue = EvaluateKernel(queryPoint, referenceStat.Centroid());
    }

    densities(queryIndex) += referenceNode.NumDescendants() * kernelValue;

    // The branch has been accounted for; do not descend into it.
    score = DBL_MAX;
  }

  ++scores;
  traversalInfo.LastReferenceNode() = &referenceNode;
  traversalInfo.LastScore() = score;
  return score;
}
/**
 * Build the statistic for a tree node by computing the centroid of its
 * descendant points.
 *
 * The children's Stat().Centroid() values are read here, so the children's
 * statistics must already be built (i.e. a depth-first build is assumed).
 * Trees with self-children are not handled correctly by this calculation.
 *
 * @param node Tree node that this statistic belongs to.
 */
PellegMooreKMeansStatistic(TreeType& node)
{
  centroid.zeros(node.Dataset().n_rows);

  // Contribution of the children: each child's centroid, weighted by how many
  // descendants that child has.
  for (size_t c = 0; c < node.NumChildren(); ++c)
  {
    centroid += node.Child(c).NumDescendants() *
        node.Child(c).Stat().Centroid();
  }

  // Contribution of the points held directly by this node.
  for (size_t p = 0; p < node.NumPoints(); ++p)
    centroid += node.Dataset().col(node.Point(p));

  // A node without descendants has no meaningful centroid; mark it invalid.
  if (node.NumDescendants() == 0)
  {
    centroid.fill(DBL_MAX);
    return;
  }

  centroid /= node.NumDescendants();
}
/**
 * Add every descendant point of the reference node to the results of the
 * given query point, computing each distance along the way.
 *
 * Two kinds of points are left out: the node's first descendant when its base
 * case against this query point was the last one recorded (so it is not
 * added twice), and the query point itself when the query and reference sets
 * are the same object.
 *
 * @param queryIndex Index of the query point.
 * @param referenceNode Node whose descendants are added to the results.
 */
void RangeSearchRules<MetricType, TreeType>::AddResult(
    const size_t queryIndex,
    TreeType& referenceNode)
{
  // Some tree types evaluate the base case with the node's first point before
  // Score() is called.  If that happened for this combination, start at the
  // second descendant.
  const bool baseCaseAlreadyDone =
      tree::TreeTraits<TreeType>::FirstPointIsCentroid &&
      (queryIndex == lastQueryIndex) &&
      (referenceNode.Point(0) == lastReferenceIndex);
  const size_t firstDescendant = baseCaseAlreadyDone ? 1 : 0;

  const size_t numDescendants = referenceNode.NumDescendants();
  const bool sameSet = (&referenceSet == &querySet);

  // reserve() rather than resize(): the number of results actually added is
  // not known up front, because the query point may be skipped below.
  const size_t oldSize = neighbors[queryIndex].size();
  neighbors[queryIndex].reserve(oldSize + numDescendants - firstDescendant);
  distances[queryIndex].reserve(oldSize + numDescendants - firstDescendant);

  for (size_t i = firstDescendant; i < numDescendants; ++i)
  {
    const size_t referenceIndex = referenceNode.Descendant(i);

    // A point is not its own neighbor.
    if (sameSet && (queryIndex == referenceIndex))
      continue;

    const double distance = metric.Evaluate(
        querySet.unsafe_col(queryIndex),
        referenceNode.Dataset().unsafe_col(referenceIndex));

    neighbors[queryIndex].push_back(referenceIndex);
    distances[queryIndex].push_back(distance);
  }
}
/**
 * Build the statistic for a tree node.
 *
 * Pruning information is reset to its initial state and the centroid of the
 * node's descendant points is computed.
 *
 * The centroid is assembled from the Stat().Centroid() of each child, so the
 * children's statistics must already hold valid centroids when this runs.
 *
 * @param node Tree node that this statistic belongs to.
 */
DualTreeKMeansStatistic(TreeType& node) :
    closestQueryNode(NULL),
    minQueryNodeDistance(DBL_MAX),
    maxQueryNodeDistance(DBL_MAX),
    clustersPruned(0),
    iteration(size_t(-1)) // Largest size_t value: no iteration seen yet.
{
  // Empirically calculate the centroid.
  centroid.zeros(node.Dataset().n_rows);

  for (size_t i = 0; i < node.NumPoints(); ++i)
  {
    // In trees with self-children (such as the cover tree), the first point
    // of a node that has children appears again in one of those children.
    // It is already included through that child's centroid below, and
    // NumDescendants() counts it once, so adding it here as well would count
    // it twice and bias the centroid.
    if (tree::TreeTraits<TreeType>::HasSelfChildren && i == 0 &&
        node.NumChildren() > 0)
      continue;

    centroid += node.Dataset().col(node.Point(i));
  }

  // Each child contributes its centroid weighted by its descendant count.
  for (size_t i = 0; i < node.NumChildren(); ++i)
    centroid += node.Child(i).NumDescendants() *
        node.Child(i).Stat().Centroid();

  // Turn the accumulated sum into a mean.
  centroid /= node.NumDescendants();
}
void CheckBound(const TreeType& tree) { typedef typename TreeType::ElemType ElemType; for (size_t i = 0; i < tree.NumDescendants(); i++) { arma::Col<ElemType> point = tree.Dataset().col(tree.Descendant(i)); // Check that the point is contained in the bound. BOOST_REQUIRE_EQUAL(true, tree.Bound().Contains(point)); const arma::Mat<ElemType>& loBound = tree.Bound().LoBound(); const arma::Mat<ElemType>& hiBound = tree.Bound().HiBound(); // Ensure that there is a hyperrectangle that contains the point. bool success = false; for (size_t j = 0; j < tree.Bound().NumBounds(); j++) { success = true; for (size_t k = 0; k < loBound.n_rows; k++) { if (point[k] < loBound(k, j) - 1e-14 * std::fabs(loBound(k, j)) || point[k] > hiBound(k, j) + 1e-14 * std::fabs(hiBound(k, j))) { success = false; break; } } if (success) break; } BOOST_REQUIRE_EQUAL(success, true); } if (!tree.IsLeaf()) { CheckBound(*tree.Left()); CheckBound(*tree.Right()); } }
/**
 * Score a reference node for Pelleg-Moore k-means.
 *
 * The node's blacklist of clusters is derived from its parent's and then
 * extended with every cluster that the closest cluster dominates over the
 * node's bound.  If exactly one cluster is left, all of the node's descendants
 * are assigned to it at once (counts and newCentroids are updated) and the
 * node is pruned.  Otherwise the points held directly by the node are assigned
 * to their nearest non-blacklisted cluster and traversal continues.
 *
 * distanceCalculations is updated throughout.
 *
 * @param queryIndex Unused placeholder index.
 * @param referenceNode Node to score.
 * @return DBL_MAX if the node was assigned as a whole (prune), 0.0 otherwise.
 */
double PellegMooreKMeansRules<MetricType, TreeType>::Score(
    const size_t /* queryIndex */,
    TreeType& referenceNode)
{
  // Start from the parent's blacklist when there is a non-empty one; the root
  // (or a child of a node whose blacklist is empty) starts with nothing
  // blacklisted.  Because of this, no statistics need resetting between
  // iterations.
  const bool inheritBlacklist = (referenceNode.Parent() != NULL) &&
      (referenceNode.Parent()->Stat().Blacklist().n_elem != 0);
  if (inheritBlacklist)
  {
    referenceNode.Stat().Blacklist() =
        referenceNode.Parent()->Stat().Blacklist();
  }
  else
  {
    referenceNode.Stat().Blacklist().zeros(centroids.n_cols);
  }

  // Number of clusters that are still candidates for this node.
  const size_t whitelisted = centroids.n_cols -
      arma::accu(referenceNode.Stat().Blacklist());
  distanceCalculations += whitelisted;

  // Find the candidate cluster with the smallest minimum distance to the
  // node.
  size_t nearestCluster = centroids.n_cols;
  double nearestMinDistance = DBL_MAX;
  for (size_t c = 0; c < centroids.n_cols; ++c)
  {
    if (referenceNode.Stat().Blacklist()[c] != 0)
      continue;

    const double minDistance = referenceNode.MinDistance(centroids.col(c));
    if (minDistance < nearestMinDistance)
    {
      nearestMinDistance = minDistance;
      nearestCluster = c;
    }
  }

  // For every other candidate, decide whether the nearest cluster dominates
  // it over the whole node.  The test is specific to hyperrectangle bounds
  // and follows the proof of Lemma 4 in the extended Pelleg-Moore paper (the
  // CMU tech report), adapted for speed.
  size_t newlyBlacklisted = 0;
  for (size_t c = 0; c < centroids.n_cols; ++c)
  {
    if (referenceNode.Stat().Blacklist()[c] == 1 || c == nearestCluster)
      continue;

    // Take the corner of the bound that lies furthest in the direction from
    // the nearest cluster towards cluster c.
    arma::vec cornerPoint(centroids.n_rows);
    for (size_t d = 0; d < referenceNode.Bound().Dim(); ++d)
    {
      cornerPoint(d) = (centroids(d, c) > centroids(d, nearestCluster)) ?
          referenceNode.Bound()[d].Hi() : referenceNode.Bound()[d].Lo();
    }

    const double nearestDist = metric.Evaluate(cornerPoint,
        centroids.col(nearestCluster));
    const double candidateDist = metric.Evaluate(cornerPoint,
        centroids.col(c));

    distanceCalculations += 3; // One for cornerPoint, then two distances.

    if (nearestDist < candidateDist)
    {
      // Even at that corner the nearest cluster wins, so c cannot own any
      // point of this node.
      referenceNode.Stat().Blacklist()[c] = 1;
      ++newlyBlacklisted;
    }
  }

  if (whitelisted - newlyBlacklisted == 1)
  {
    // Only the nearest cluster is left: it owns the entire node.
    counts[nearestCluster] += referenceNode.NumDescendants();
    newCentroids.col(nearestCluster) += referenceNode.NumDescendants() *
        referenceNode.Stat().Centroid();
    return DBL_MAX;
  }

  // No single owner.  Do the base case here: assign each point held directly
  // by this node to its nearest candidate cluster.
  for (size_t p = 0; p < referenceNode.NumPoints(); ++p)
  {
    size_t ownerCluster = centroids.n_cols;
    double ownerDistance = DBL_MAX;
    for (size_t c = 0; c < centroids.n_cols; ++c)
    {
      if (referenceNode.Stat().Blacklist()[c] == 1)
        continue;

      ++distanceCalculations;

      // Point(p) is the index of the data point in the dataset.
      const double distance = metric.Evaluate(centroids.col(c),
          dataset.col(referenceNode.Point(p)));
      if (distance < ownerDistance)
      {
        ownerDistance = distance;
        ownerCluster = c;
      }
    }

    // Add the point to its owner's running sum and count.
    newCentroids.col(ownerCluster) += dataset.col(referenceNode.Point(p));
    ++counts(ownerCluster);
  }

  // The node cannot be pruned.  Recursion order does not matter, so any
  // finite score will do.
  return 0.0;
}
/**
 * Test helper: recursively verify that three other trees (named for the XML,
 * text and binary formats) are structurally identical to a reference tree.
 *
 * At every node this compares the number of children, descendants and points,
 * the index of each point, the parent distance, the furthest descendant
 * distance and the minimum bound distance.  At the root it also compares the
 * datasets and requires every tree to have a null parent.  For every child it
 * checks that the child shares its parent's dataset object and points back to
 * its parent, then recurses.
 *
 * @param tree Reference tree node that the others are compared against.
 * @param xmlTree Corresponding node of the first tree to verify.
 * @param textTree Corresponding node of the second tree to verify.
 * @param binaryTree Corresponding node of the third tree to verify.
 */
void CheckTrees(TreeType& tree,
                TreeType& xmlTree,
                TreeType& textTree,
                TreeType& binaryTree)
{
  const typename TreeType::Mat* dataset = &tree.Dataset();

  // Make sure that the data matrices are the same.  This is only done once,
  // at the root (the node without a parent).
  if (tree.Parent() == NULL)
  {
    CheckMatrices(*dataset, xmlTree.Dataset(), textTree.Dataset(),
        binaryTree.Dataset());

    // Also ensure that the other parents are null too.
    BOOST_REQUIRE_EQUAL(xmlTree.Parent(), (TreeType*) NULL);
    BOOST_REQUIRE_EQUAL(textTree.Parent(), (TreeType*) NULL);
    BOOST_REQUIRE_EQUAL(binaryTree.Parent(), (TreeType*) NULL);
  }

  // Make sure the number of children is the same.
  BOOST_REQUIRE_EQUAL(tree.NumChildren(), xmlTree.NumChildren());
  BOOST_REQUIRE_EQUAL(tree.NumChildren(), textTree.NumChildren());
  BOOST_REQUIRE_EQUAL(tree.NumChildren(), binaryTree.NumChildren());

  // Make sure the number of descendants is the same.
  BOOST_REQUIRE_EQUAL(tree.NumDescendants(), xmlTree.NumDescendants());
  BOOST_REQUIRE_EQUAL(tree.NumDescendants(), textTree.NumDescendants());
  BOOST_REQUIRE_EQUAL(tree.NumDescendants(), binaryTree.NumDescendants());

  // Make sure the number of points is the same.
  BOOST_REQUIRE_EQUAL(tree.NumPoints(), xmlTree.NumPoints());
  BOOST_REQUIRE_EQUAL(tree.NumPoints(), textTree.NumPoints());
  BOOST_REQUIRE_EQUAL(tree.NumPoints(), binaryTree.NumPoints());

  // Check that each point is the same.  The point counts were required to be
  // equal just above, so indexing the other trees with i is in range.
  for (size_t i = 0; i < tree.NumPoints(); ++i)
  {
    BOOST_REQUIRE_EQUAL(tree.Point(i), xmlTree.Point(i));
    BOOST_REQUIRE_EQUAL(tree.Point(i), textTree.Point(i));
    BOOST_REQUIRE_EQUAL(tree.Point(i), binaryTree.Point(i));
  }

  // Check that the parent distance is the same.
  BOOST_REQUIRE_CLOSE(tree.ParentDistance(), xmlTree.ParentDistance(), 1e-8);
  BOOST_REQUIRE_CLOSE(tree.ParentDistance(), textTree.ParentDistance(), 1e-8);
  BOOST_REQUIRE_CLOSE(tree.ParentDistance(), binaryTree.ParentDistance(), 1e-8);

  // Check that the furthest descendant distance is the same.
  BOOST_REQUIRE_CLOSE(tree.FurthestDescendantDistance(),
      xmlTree.FurthestDescendantDistance(), 1e-8);
  BOOST_REQUIRE_CLOSE(tree.FurthestDescendantDistance(),
      textTree.FurthestDescendantDistance(), 1e-8);
  BOOST_REQUIRE_CLOSE(tree.FurthestDescendantDistance(),
      binaryTree.FurthestDescendantDistance(), 1e-8);

  // Check that the minimum bound distance is the same.
  BOOST_REQUIRE_CLOSE(tree.MinimumBoundDistance(),
      xmlTree.MinimumBoundDistance(), 1e-8);
  BOOST_REQUIRE_CLOSE(tree.MinimumBoundDistance(),
      textTree.MinimumBoundDistance(), 1e-8);
  BOOST_REQUIRE_CLOSE(tree.MinimumBoundDistance(),
      binaryTree.MinimumBoundDistance(), 1e-8);

  // Recurse into the children.  The child counts were required to be equal
  // above, so Child(i) is valid for all four trees.
  for (size_t i = 0; i < tree.NumChildren(); ++i)
  {
    // Check that the child dataset is the same (the same object as the
    // parent's, compared by address).
    BOOST_REQUIRE_EQUAL(&xmlTree.Dataset(), &xmlTree.Child(i).Dataset());
    BOOST_REQUIRE_EQUAL(&textTree.Dataset(), &textTree.Child(i).Dataset());
    BOOST_REQUIRE_EQUAL(&binaryTree.Dataset(), &binaryTree.Child(i).Dataset());

    // Make sure the parent link is right.
    BOOST_REQUIRE_EQUAL(xmlTree.Child(i).Parent(), &xmlTree);
    BOOST_REQUIRE_EQUAL(textTree.Child(i).Parent(), &textTree);
    BOOST_REQUIRE_EQUAL(binaryTree.Child(i).Parent(), &binaryTree);

    CheckTrees(tree.Child(i), xmlTree.Child(i), textTree.Child(i),
        binaryTree.Child(i));
  }
}
/**
 * Dual-tree score of a query node against a reference node.
 *
 * If the spread between the largest and smallest possible kernel value over
 * the node pair is within the error tolerance, the reference node's whole
 * contribution is added to the density of every descendant of the query node
 * using one kernel evaluation, and DBL_MAX is returned so that the
 * combination is not explored.  Otherwise the minimum distance between the
 * two nodes is returned.
 *
 * The call also increments `scores` and records both nodes and the score in
 * `traversalInfo`.
 *
 * @param queryNode Query node to score.
 * @param referenceNode Reference node to score.
 * @return DBL_MAX if the pair was approximated, the minimum distance
 *     otherwise.
 */
inline double KDERules<MetricType, KernelType, TreeType>::
Score(TreeType& queryNode, TreeType& referenceNode)
{
  const double minDistance = queryNode.MinDistance(referenceNode);

  // Is this the same pair of centroids as the pair handled last time?
  const bool repeatedCombination =
      tree::TreeTraits<TreeType>::FirstPointIsCentroid &&
      (traversalInfo.LastQueryNode() != NULL) &&
      (traversalInfo.LastReferenceNode() != NULL) &&
      (traversalInfo.LastQueryNode()->Point(0) == queryNode.Point(0)) &&
      (traversalInfo.LastReferenceNode()->Point(0) == referenceNode.Point(0));

  bool withinTolerance = false;
  if (repeatedCombination)
  {
    // Nothing new to compute; just refresh the cached indices.
    lastQueryIndex = queryNode.Point(0);
    lastReferenceIndex = referenceNode.Point(0);
  }
  else
  {
    // Kernel values at the nearest and furthest possible distances bound the
    // error made by treating each node as a single point.
    const double maxKernel = kernel.Evaluate(minDistance);
    const double minKernel =
        kernel.Evaluate(queryNode.MaxDistance(referenceNode));
    const double bound = maxKernel - minKernel;

    // If possible, avoid some calculations because of the error tolerance.
    withinTolerance =
        bound <= (absError + relError * minKernel) / referenceSet.n_cols;
  }

  // Unless the pair gets approximated below, it is scored by distance.
  double score = minDistance;

  if (withinTolerance)
  {
    double kernelValue;
    if (tree::TreeTraits<TreeType>::FirstPointIsCentroid)
    {
      // The first point of each node is its centroid.
      kernelValue = EvaluateKernel(queryNode.Point(0), referenceNode.Point(0));
    }
    else
    {
      // Use the centroids stored in the nodes' statistics.
      kde::KDEStat& referenceStat = referenceNode.Stat();
      kde::KDEStat& queryStat = queryNode.Stat();
      kernelValue = EvaluateKernel(queryStat.Centroid(),
                                   referenceStat.Centroid());
    }

    // Every query descendant receives the same estimated contribution.
    for (size_t q = 0; q < queryNode.NumDescendants(); ++q)
    {
      densities(queryNode.Descendant(q)) +=
          referenceNode.NumDescendants() * kernelValue;
    }

    score = DBL_MAX;
  }

  ++scores;
  traversalInfo.LastQueryNode() = &queryNode;
  traversalInfo.LastReferenceNode() = &referenceNode;
  traversalInfo.LastScore() = score;
  return score;
}
/**
 * Test helper: verify that the distances reported by a node's bound are
 * consistent with brute-force distances over its descendant points.
 *
 * The function has two modes, selected by `node`:
 *
 *  - node == NULL (the way callers start it): `tree` is checked against
 *    every node reachable from the root of the tree it belongs to, then
 *    against every point of the dataset, and finally the same is done for
 *    the Left() and Right() children of `tree`.
 *  - node != NULL: the bound of `tree` is checked against the bound of
 *    `node` (unless they are the same node), and then against the nodes
 *    below `node`.
 *
 * In every comparison the bound's minimum distance must not exceed the true
 * minimum, and the bound's maximum distance must not be below the true
 * maximum, with a relative slack of 10 machine epsilons.
 *
 * @param tree Node whose bound is being verified.
 * @param node Node to compare against, or NULL to run the full check.
 */
void CheckDistance(TreeType& tree, TreeType* node = NULL)
{
  typedef typename TreeType::ElemType ElemType;

  if (node == NULL)
  {
    // Walk up to the root of the tree that `tree` belongs to.
    node = &tree;

    while (node->Parent() != NULL)
      node = node->Parent();

    // Node-to-node checks of `tree` against the root and everything below it.
    CheckDistance<TreeType, MetricType>(tree, node);

    // Point-to-node checks against every point of the dataset.
    for (size_t j = 0; j < tree.Dataset().n_cols; j++)
    {
      const arma::Col<ElemType>& point = tree.
          Dataset().col(j);
      ElemType maxDist = 0;
      ElemType minDist = std::numeric_limits<ElemType>::max();

      // Brute-force minimum and maximum distance from the point to the
      // descendants of `tree`.
      for (size_t i = 0; i < tree.NumDescendants(); i++)
      {
        ElemType dist = MetricType::Evaluate(
            tree.Dataset().col(tree.Descendant(i)),
            tree.Dataset().col(j));

        if (dist > maxDist)
          maxDist = dist;
        if (dist < minDist)
          minDist = dist;
      }

      // The bound's minimum distance may not exceed the true minimum, and its
      // maximum distance may not be below the true maximum.
      BOOST_REQUIRE_LE(tree.Bound().MinDistance(point), minDist *
          (1.0 + 10 * std::numeric_limits<ElemType>::epsilon()));
      BOOST_REQUIRE_LE(maxDist, tree.Bound().MaxDistance(point) *
          (1.0 + 10 * std::numeric_limits<ElemType>::epsilon()));

      // The same two properties must hold for the combined range query.
      math::RangeType<ElemType> r = tree.Bound().RangeDistance(point);

      BOOST_REQUIRE_LE(r.Lo(), minDist *
          (1.0 + 10 * std::numeric_limits<ElemType>::epsilon()));
      BOOST_REQUIRE_LE(maxDist, r.Hi() *
          (1.0 + 10 * std::numeric_limits<ElemType>::epsilon()));
    }

    // Repeat the full check (node defaults to NULL) for both children.
    if (!tree.IsLeaf())
    {
      CheckDistance<TreeType, MetricType>(*tree.Left());
      CheckDistance<TreeType, MetricType>(*tree.Right());
    }
  }
  else
  {
    // A node is not compared against itself.
    if (&tree != node)
    {
      ElemType maxDist = 0;
      ElemType minDist = std::numeric_limits<ElemType>::max();

      // Brute-force minimum and maximum distance over all pairs of
      // descendants of the two nodes.
      for (size_t i = 0; i < tree.NumDescendants(); i++)
        for (size_t j = 0; j < node->NumDescendants(); j++)
        {
          ElemType dist = MetricType::Evaluate(
              tree.Dataset().col(tree.Descendant(i)),
              node->Dataset().col(node->Descendant(j)));

          if (dist > maxDist)
            maxDist = dist;
          if (dist < minDist)
            minDist = dist;
        }

      // Bound-to-bound distances must bracket the true distances.
      BOOST_REQUIRE_LE(tree.Bound().MinDistance(node->Bound()), minDist *
          (1.0 + 10 * std::numeric_limits<ElemType>::epsilon()));
      BOOST_REQUIRE_LE(maxDist, tree.Bound().MaxDistance(node->Bound()) *
          (1.0 + 10 * std::numeric_limits<ElemType>::epsilon()));

      // The same two properties must hold for the combined range query.
      math::RangeType<ElemType> r =
          tree.Bound().RangeDistance(node->Bound());

      BOOST_REQUIRE_LE(r.Lo(), minDist *
          (1.0 + 10 * std::numeric_limits<ElemType>::epsilon()));
      BOOST_REQUIRE_LE(maxDist, r.Hi() *
          (1.0 + 10 * std::numeric_limits<ElemType>::epsilon()));
    }

    // Continue with the nodes below `node`; this happens even when `node` is
    // `tree` itself.
    if (!node->IsLeaf())
    {
      CheckDistance<TreeType, MetricType>(tree, node->Left());
      CheckDistance<TreeType, MetricType>(tree, node->Right());
    }
  }
}
/**
 * Dual-tree score of a query node against a reference node for range search.
 *
 * A range of possible distances between the two nodes is computed.  If it
 * does not overlap the search range, the combination is pruned.  If it lies
 * entirely inside the search range, every descendant of the reference node is
 * added to the results of every descendant of the query node and the
 * combination is pruned as well.  Otherwise traversal must continue.
 *
 * For trees whose first point is the centroid, the distance range is built
 * from the base case between the two first points, which is cached in the
 * nodes' statistics and reused when it has already been computed.
 *
 * @param queryNode Query node to score.
 * @param referenceNode Reference node to score.
 * @return DBL_MAX if the combination needs no further traversal, 0.0
 *     otherwise.
 */
double RangeSearchRules<MetricType, TreeType>::Score(TreeType& queryNode,
                                                     TreeType& referenceNode)
{
  math::Range distances;

  if (!tree::TreeTraits<TreeType>::FirstPointIsCentroid)
  {
    // No centroid shortcut available; ask the bounds directly.
    distances = referenceNode.RangeDistance(&queryNode);
  }
  else
  {
    // The base case between the two first points may already be known.
    double baseCase = 0.0;
    bool alreadyDone = false;

    if (tree::TreeTraits<TreeType>::HasSelfChildren)
    {
      TreeType* lastQuery =
          static_cast<TreeType*>(referenceNode.Stat().LastDistanceNode());
      TreeType* lastRef =
          static_cast<TreeType*>(queryNode.Stat().LastDistanceNode());

      // The checks below run in a fixed order, and a later match overrides an
      // earlier one.

      // Did the query node's last combination do the base case?
      if ((lastRef != NULL) && (referenceNode.Point(0) == lastRef->Point(0)))
      {
        baseCase = queryNode.Stat().LastDistance();
        alreadyDone = true;
      }

      // Did the reference node's last combination do the base case?
      if ((lastQuery != NULL) && (queryNode.Point(0) == lastQuery->Point(0)))
      {
        baseCase = referenceNode.Stat().LastDistance();
        alreadyDone = true;
      }

      // If the query node is a self-child, did the query parent's last
      // combination do the base case?
      TreeType* queryParent = queryNode.Parent();
      if ((queryParent != NULL) &&
          (queryNode.Point(0) == queryParent->Point(0)))
      {
        TreeType* lastParentRef =
            static_cast<TreeType*>(queryParent->Stat().LastDistanceNode());
        if ((lastParentRef != NULL) &&
            (referenceNode.Point(0) == lastParentRef->Point(0)))
        {
          baseCase = queryParent->Stat().LastDistance();
          alreadyDone = true;
        }
      }

      // If the reference node is a self-child, did the reference parent's
      // last combination do the base case?
      TreeType* referenceParent = referenceNode.Parent();
      if ((referenceParent != NULL) &&
          (referenceNode.Point(0) == referenceParent->Point(0)))
      {
        TreeType* lastParentQuery =
            static_cast<TreeType*>(referenceParent->Stat().LastDistanceNode());
        if ((lastParentQuery != NULL) &&
            (queryNode.Point(0) == lastParentQuery->Point(0)))
        {
          baseCase = referenceParent->Stat().LastDistance();
          alreadyDone = true;
        }
      }
    }

    if (alreadyDone)
    {
      // Record the combination so that a later BaseCase() or AddResult() for
      // it does not duplicate results.
      lastQueryIndex = queryNode.Point(0);
      lastReferenceIndex = referenceNode.Point(0);
    }
    else
    {
      // Nothing cached; the base case has to be calculated.
      baseCase = BaseCase(queryNode.Point(0), referenceNode.Point(0));
    }

    // Widen the centroid-to-centroid distance by both nodes' furthest
    // descendant distances.
    distances.Lo() = baseCase - queryNode.FurthestDescendantDistance()
        - referenceNode.FurthestDescendantDistance();
    distances.Hi() = baseCase + queryNode.FurthestDescendantDistance()
        + referenceNode.FurthestDescendantDistance();

    // Cache the base case in both nodes for later combinations.
    queryNode.Stat().LastDistanceNode() = static_cast<void*>(&referenceNode);
    queryNode.Stat().LastDistance() = baseCase;
    referenceNode.Stat().LastDistanceNode() = static_cast<void*>(&queryNode);
    referenceNode.Stat().LastDistance() = baseCase;
  }

  // If the ranges do not overlap, prune this combination.
  if (!distances.Contains(range))
    return DBL_MAX;

  // Partial overlap: nothing can be decided at this level.  Recursion order
  // is irrelevant in range search, so the score value does not matter.
  const bool entirelyInRange = (distances.Lo() >= range.Lo()) &&
      (distances.Hi() <= range.Hi());
  if (!entirelyInRange)
    return 0.0;

  // Every reference descendant is a result for every query descendant.
  for (size_t q = 0; q < queryNode.NumDescendants(); ++q)
    AddResult(queryNode.Descendant(q), referenceNode);

  return DBL_MAX; // There is no need to go any deeper.
}