mlpack
pelleg_moore_kmeans_rules_impl.hpp
Go to the documentation of this file.
1 
14 #ifndef MLPACK_METHODS_KMEANS_PELLEG_MOORE_KMEANS_RULES_IMPL_HPP
15 #define MLPACK_METHODS_KMEANS_PELLEG_MOORE_KMEANS_RULES_IMPL_HPP
16 
17 // In case it hasn't been included yet.
19 
20 namespace mlpack {
21 namespace kmeans {
22 
23 template<typename MetricType, typename TreeType>
25  const typename TreeType::Mat& dataset,
26  const arma::mat& centroids,
27  arma::mat& newCentroids,
28  arma::Col<size_t>& counts,
29  MetricType& metric) :
30  dataset(dataset),
31  centroids(centroids),
32  newCentroids(newCentroids),
33  counts(counts),
34  metric(metric),
35  distanceCalculations(0)
36 {
37  // Nothing to do.
38 }
39 
40 template<typename MetricType, typename TreeType>
41 inline force_inline
43  const size_t /* queryIndex */,
44  const size_t /* referenceIndex */)
45 {
46  return 0.0;
47 }
48 
49 template<typename MetricType, typename TreeType>
51  const size_t /* queryIndex */,
52  TreeType& referenceNode)
53 {
54  // Obtain the parent's blacklist. If this is the root node, we'll start with
55  // an empty blacklist. This means that after each iteration, we don't need to
56  // reset any statistics.
57  if (referenceNode.Parent() == NULL ||
58  referenceNode.Parent()->Stat().Blacklist().n_elem == 0)
59  referenceNode.Stat().Blacklist().zeros(centroids.n_cols);
60  else
61  referenceNode.Stat().Blacklist() =
62  referenceNode.Parent()->Stat().Blacklist();
63 
64  // The query index is a fake index that we won't use, and the reference node
65  // holds all of the points in the dataset. Our goal is to determine whether
66  // or not this node is dominated by a single cluster.
67  const size_t whitelisted = centroids.n_cols -
68  arma::accu(referenceNode.Stat().Blacklist());
69 
70  distanceCalculations += whitelisted;
71 
72  // Which cluster has minimum distance to the node?
73  size_t closestCluster = centroids.n_cols;
74  double minMinDistance = DBL_MAX;
75  for (size_t i = 0; i < centroids.n_cols; ++i)
76  {
77  if (referenceNode.Stat().Blacklist()[i] == 0)
78  {
79  const double minDistance = referenceNode.MinDistance(centroids.col(i));
80  if (minDistance < minMinDistance)
81  {
82  minMinDistance = minDistance;
83  closestCluster = i;
84  }
85  }
86  }
87 
88  // Now, for every other whitelisted cluster, determine if the closest cluster
89  // owns the point. This calculation is specific to hyperrectangle trees (but,
90  // this implementation is specific to kd-trees, so that's okay). For
91  // circular-bound trees, the condition should be simpler and can probably be
92  // expressed as a comparison between minimum and maximum distances.
93  size_t newBlacklisted = 0;
94  for (size_t c = 0; c < centroids.n_cols; ++c)
95  {
96  if (referenceNode.Stat().Blacklist()[c] == 1 || c == closestCluster)
97  continue;
98 
99  // This algorithm comes from the proof of Lemma 4 in the extended version
100  // of the Pelleg-Moore paper (the CMU tech report, that is). It has been
101  // adapted for speed.
102  arma::vec cornerPoint(centroids.n_rows);
103  for (size_t d = 0; d < referenceNode.Bound().Dim(); ++d)
104  {
105  if (centroids(d, c) > centroids(d, closestCluster))
106  cornerPoint(d) = referenceNode.Bound()[d].Hi();
107  else
108  cornerPoint(d) = referenceNode.Bound()[d].Lo();
109  }
110 
111  const double closestDist = metric.Evaluate(cornerPoint,
112  centroids.col(closestCluster));
113  const double otherDist = metric.Evaluate(cornerPoint, centroids.col(c));
114 
115  distanceCalculations += 3; // One for cornerPoint, then two distances.
116 
117  if (closestDist < otherDist)
118  {
119  // The closest cluster dominates the node with respect to the cluster c.
120  // So we can blacklist c.
121  referenceNode.Stat().Blacklist()[c] = 1;
122  ++newBlacklisted;
123  }
124  }
125 
126  if (whitelisted - newBlacklisted == 1)
127  {
128  // This node is dominated by the closest cluster.
129  counts[closestCluster] += referenceNode.NumDescendants();
130  newCentroids.col(closestCluster) += referenceNode.NumDescendants() *
131  referenceNode.Stat().Centroid();
132 
133  return DBL_MAX;
134  }
135 
136  // Perform the base case here.
137  for (size_t i = 0; i < referenceNode.NumPoints(); ++i)
138  {
139  size_t bestCluster = centroids.n_cols;
140  double bestDistance = DBL_MAX;
141  for (size_t c = 0; c < centroids.n_cols; ++c)
142  {
143  if (referenceNode.Stat().Blacklist()[c] == 1)
144  continue;
145 
146  ++distanceCalculations;
147 
148  // The reference index is the index of the data point.
149  const double distance = metric.Evaluate(centroids.col(c),
150  dataset.col(referenceNode.Point(i)));
151 
152  if (distance < bestDistance)
153  {
154  bestDistance = distance;
155  bestCluster = c;
156  }
157  }
158 
159  // Add to resulting centroid.
160  newCentroids.col(bestCluster) += dataset.col(referenceNode.Point(i));
161  ++counts(bestCluster);
162  }
163 
164  // Otherwise, we're not sure, so we can't prune. Recursion order doesn't make
165  // a difference, so we'll just return a score of 0.
166  return 0.0;
167 }
168 
169 template<typename MetricType, typename TreeType>
171  const size_t /* queryIndex */,
172  TreeType& /* referenceNode */,
173  const double oldScore)
174 {
175  // There's no possible way that calling Rescore() can produce a prune now when
176  // it couldn't before.
177  return oldScore;
178 }
179 
180 } // namespace kmeans
181 } // namespace mlpack
182 
183 #endif
Linear algebra utility functions, generally performed on matrices or vectors.
Definition: cv.hpp:1
double Rescore(const size_t queryIndex, TreeType &referenceNode, const double oldScore)
Rescore to determine if a node can be pruned.
Definition: pelleg_moore_kmeans_rules_impl.hpp:170
double BaseCase(const size_t queryIndex, const size_t referenceIndex)
The BaseCase() function for this single-tree algorithm does nothing.
Definition: pelleg_moore_kmeans_rules_impl.hpp:42
PellegMooreKMeansRules(const typename TreeType::Mat &dataset, const arma::mat &centroids, arma::mat &newCentroids, arma::Col< size_t > &counts, MetricType &metric)
Create the PellegMooreKMeansRules object.
Definition: pelleg_moore_kmeans_rules_impl.hpp:24
double Score(const size_t queryIndex, TreeType &referenceNode)
Determine if a cluster can be pruned, and if not, perform point-to-cluster comparisons.
Definition: pelleg_moore_kmeans_rules_impl.hpp:50