mlpack
elkan_kmeans_impl.hpp
Go to the documentation of this file.
1 
12 #ifndef MLPACK_METHODS_KMEANS_ELKAN_KMEANS_IMPL_HPP
13 #define MLPACK_METHODS_KMEANS_ELKAN_KMEANS_IMPL_HPP
14 
15 // In case it hasn't been included yet.
16 #include "elkan_kmeans.hpp"
17 
18 namespace mlpack {
19 namespace kmeans {
20 
21 template<typename MetricType, typename MatType>
23  MetricType& metric) :
24  dataset(dataset),
25  metric(metric),
26  distanceCalculations(0)
27 {
28  // Nothing to do here.
29 }
30 
31 // Run a single iteration of Elkan's algorithm for Lloyd iterations.
32 template<typename MetricType, typename MatType>
33 double ElkanKMeans<MetricType, MatType>::Iterate(const arma::mat& centroids,
34  arma::mat& newCentroids,
35  arma::Col<size_t>& counts)
36 {
37  // Clear new centroids.
38  newCentroids.zeros(centroids.n_rows, centroids.n_cols);
39  counts.zeros(centroids.n_cols);
40 
41  // At the beginning of the iteration, we must compute the distances between
42  // all centers. This is O(k^2).
43  clusterDistances.set_size(centroids.n_cols, centroids.n_cols);
44 
45  // Self-distances are always 0, but we set them to DBL_MAX to avoid the self
46  // being the closest cluster centroid.
47  clusterDistances.diag().fill(DBL_MAX);
48 
49  // Initially set r(x) to true.
50  std::vector<bool> mustRecalculate(dataset.n_cols, true);
51 
52  // If this is the first iteration, we must reset all the bounds.
53  if (lowerBounds.n_rows != centroids.n_cols)
54  {
55  lowerBounds.set_size(centroids.n_cols, dataset.n_cols);
56  assignments.set_size(dataset.n_cols);
57  upperBounds.set_size(dataset.n_cols);
58 
59  lowerBounds.fill(0);
60  upperBounds.fill(DBL_MAX);
61  assignments.fill(0);
62  }
63 
64  // Step 1: for all centers, compute between-cluster distances. For all
65  // centers, compute s(c) = 1/2 min d(c, c').
66  for (size_t i = 0; i < centroids.n_cols; ++i)
67  {
68  for (size_t j = i + 1; j < centroids.n_cols; ++j)
69  {
70  const double distance = metric.Evaluate(centroids.col(i),
71  centroids.col(j));
72  distanceCalculations++;
73  clusterDistances(i, j) = distance;
74  clusterDistances(j, i) = distance;
75  }
76  }
77 
78  // Now find the closest cluster to each other cluster. We multiply by 0.5 so
79  // that this is equivalent to s(c) for each cluster c.
80  minClusterDistances = 0.5 * arma::min(clusterDistances).t();
81 
82  // Now loop over all points, and see which ones need to be updated.
83  for (size_t i = 0; i < dataset.n_cols; ++i)
84  {
85  // Step 2: identify all points such that u(x) <= s(c(x)).
86  if (upperBounds(i) <= minClusterDistances(assignments[i]))
87  {
88  // No change needed. This point must still belong to that cluster.
89  counts(assignments[i])++;
90  newCentroids.col(assignments[i]) += arma::vec(dataset.col(i));
91  continue;
92  }
93  else
94  {
95  for (size_t c = 0; c < centroids.n_cols; ++c)
96  {
97  // Step 3: for all remaining points x and centers c such that c != c(x),
98  // u(x) > l(x, c) and u(x) > 0.5 d(c(x), c)...
99  if (assignments[i] == c)
100  continue; // Pruned because this cluster is already the assignment.
101 
102  if (upperBounds(i) <= lowerBounds(c, i))
103  continue; // Pruned by triangle inequality on lower bound.
104 
105  if (upperBounds(i) <= 0.5 * clusterDistances(assignments[i], c))
106  continue; // Pruned by triangle inequality on cluster distances.
107 
108  // Step 3a: if r(x) then compute d(x, c(x)) and assign r(x) = false.
109  // Otherwise, d(x, c(x)) = u(x).
110  double dist;
111  if (mustRecalculate[i])
112  {
113  mustRecalculate[i] = false;
114  dist = metric.Evaluate(dataset.col(i), centroids.col(assignments[i]));
115  lowerBounds(assignments[i], i) = dist;
116  upperBounds(i) = dist;
117  distanceCalculations++;
118 
119  // Check if we can prune again.
120  if (upperBounds(i) <= lowerBounds(c, i))
121  continue; // Pruned by triangle inequality on lower bound.
122 
123  if (upperBounds(i) <= 0.5 * clusterDistances(assignments[i], c))
124  continue; // Pruned by triangle inequality on cluster distances.
125  }
126  else
127  {
128  dist = upperBounds(i); // This is equivalent to d(x, c(x)).
129  }
130 
131  // Step 3b: if d(x, c(x)) > l(x, c) or d(x, c(x)) > 0.5 d(c(x), c)...
132  if (dist > lowerBounds(c, i) ||
133  dist > 0.5 * clusterDistances(assignments[i], c))
134  {
135  // Compute d(x, c). If d(x, c) < d(x, c(x)) then assign c(x) = c.
136  const double pointDist = metric.Evaluate(dataset.col(i),
137  centroids.col(c));
138  lowerBounds(c, i) = pointDist;
139  distanceCalculations++;
140  if (pointDist < dist)
141  {
142  upperBounds(i) = pointDist;
143  assignments[i] = c;
144  }
145  }
146  }
147  }
148 
149  // At this point, we know the new cluster assignment.
150  // Step 4: for each center c, let m(c) be the mean of the points assigned to
151  // c.
152  newCentroids.col(assignments[i]) += arma::vec(dataset.col(i));
153  counts[assignments[i]]++;
154  }
155 
156  // Now, normalize and calculate the distance each cluster has moved.
157  arma::vec moveDistances(centroids.n_cols);
158  double cNorm = 0.0; // Cluster movement for residual.
159  for (size_t c = 0; c < centroids.n_cols; ++c)
160  {
161  if (counts[c] > 0)
162  newCentroids.col(c) /= counts[c];
163 
164  moveDistances(c) = metric.Evaluate(newCentroids.col(c), centroids.col(c));
165  cNorm += std::pow(moveDistances(c), 2.0);
166  distanceCalculations++;
167  }
168 
169  for (size_t i = 0; i < dataset.n_cols; ++i)
170  {
171  // Step 5: for each point x and center c, assign
172  // l(x, c) = max { l(x, c) - d(c, m(c)), 0 }.
173  // But it doesn't actually matter if l(x, c) is positive.
174  for (size_t c = 0; c < centroids.n_cols; ++c)
175  lowerBounds(c, i) -= moveDistances(c);
176 
177  // Step 6: for each point x, assign
178  // u(x) = u(x) + d(m(c(x)), c(x))
179  // r(x) = true (we are setting that at the start of every iteration).
180  upperBounds(i) += moveDistances(assignments[i]);
181  }
182 
183  return std::sqrt(cNorm);
184 }
185 
186 } // namespace kmeans
187 } // namespace mlpack
188 
189 #endif
ElkanKMeans(const MatType &dataset, MetricType &metric)
Construct the ElkanKMeans object, which must store several sets of bounds.
Definition: elkan_kmeans_impl.hpp:22
Linear algebra utility functions, generally performed on matrices or vectors.
Definition: cv.hpp:1
double Iterate(const arma::mat &centroids, arma::mat &newCentroids, arma::Col< size_t > &counts)
Run a single iteration of Elkan&#39;s algorithm, updating the given centroids into the newCentroids matri...
Definition: elkan_kmeans_impl.hpp:33