mlpack
kmeans_impl.hpp
Go to the documentation of this file.
1 
13 #include "kmeans.hpp"
14 
17 
18 namespace mlpack {
19 namespace kmeans {
20 
25 HAS_MEM_FUNC(Cluster, GivesCentroidsCheck);
26 
31 template<typename InitialPartitionPolicy>
33 {
34  static const bool value =
35  // Non-static version.
36  GivesCentroidsCheck<InitialPartitionPolicy,
37  void(InitialPartitionPolicy::*)(const arma::mat&,
38  const size_t,
39  arma::mat&)>::value ||
40  // Static version.
41  GivesCentroidsCheck<InitialPartitionPolicy,
42  void(*)(const arma::mat&, const size_t, arma::mat&)>::value;
43 };
44 
47 template<typename MatType,
48  typename InitialPartitionPolicy>
50  InitialPartitionPolicy& ipp,
51  const MatType& data,
52  const size_t clusters,
53  arma::Row<size_t>& assignments,
54  arma::mat& /* centroids */,
55  const typename std::enable_if_t<
57 {
58  ipp.Cluster(data, clusters, assignments);
59 
60  return true;
61 }
62 
65 template<typename MatType,
66  typename InitialPartitionPolicy>
68  InitialPartitionPolicy& ipp,
69  const MatType& data,
70  const size_t clusters,
71  arma::Row<size_t>& /* assignments */,
72  arma::mat& centroids,
73  const typename std::enable_if_t<
75 {
76  ipp.Cluster(data, clusters, centroids);
77 
78  return false;
79 }
80 
84 template<typename MetricType,
85  typename InitialPartitionPolicy,
86  typename EmptyClusterPolicy,
87  template<class, class> class LloydStepType,
88  typename MatType>
89 KMeans<
90  MetricType,
91  InitialPartitionPolicy,
92  EmptyClusterPolicy,
93  LloydStepType,
94  MatType>::
95 KMeans(const size_t maxIterations,
96  const MetricType metric,
97  const InitialPartitionPolicy partitioner,
98  const EmptyClusterPolicy emptyClusterAction) :
99  maxIterations(maxIterations),
100  metric(metric),
101  partitioner(partitioner),
102  emptyClusterAction(emptyClusterAction)
103 {
104  // Nothing to do.
105 }
106 
113 template<typename MetricType,
114  typename InitialPartitionPolicy,
115  typename EmptyClusterPolicy,
116  template<class, class> class LloydStepType,
117  typename MatType>
118 inline void KMeans<
119  MetricType,
120  InitialPartitionPolicy,
121  EmptyClusterPolicy,
122  LloydStepType,
123  MatType>::
124 Cluster(const MatType& data,
125  const size_t clusters,
126  arma::Row<size_t>& assignments,
127  const bool initialGuess)
128 {
129  arma::mat centroids(data.n_rows, clusters);
130  Cluster(data, clusters, assignments, centroids, initialGuess);
131 }
132 
137 template<typename MetricType,
138  typename InitialPartitionPolicy,
139  typename EmptyClusterPolicy,
140  template<class, class> class LloydStepType,
141  typename MatType>
142 void KMeans<
143  MetricType,
144  InitialPartitionPolicy,
145  EmptyClusterPolicy,
146  LloydStepType,
147  MatType>::
148 Cluster(const MatType& data,
149  const size_t clusters,
150  arma::mat& centroids,
151  const bool initialGuess)
152 {
153  // Make sure we have more points than clusters.
154  if (clusters > data.n_cols)
155  Log::Warn << "KMeans::Cluster(): more clusters requested than points given."
156  << std::endl;
157  else if (clusters == 0)
158  Log::Warn << "KMeans::Cluster(): zero clusters requested. This probably "
159  << "isn't going to work. Brace for crash." << std::endl;
160 
161  // Check validity of initial guess.
162  if (initialGuess)
163  {
164  if (centroids.n_cols != clusters)
165  Log::Fatal << "KMeans::Cluster(): wrong number of initial cluster "
166  << "centroids (" << centroids.n_cols << ", should be " << clusters
167  << ")!" << std::endl;
168 
169  if (centroids.n_rows != data.n_rows)
170  Log::Fatal << "KMeans::Cluster(): initial cluster centroids have wrong "
171  << " dimensionality (" << centroids.n_rows << ", should be "
172  << data.n_rows << ")!" << std::endl;
173  }
174 
175  // Use the partitioner to come up with the partition assignments and calculate
176  // the initial centroids.
177  if (!initialGuess)
178  {
179  // The GetInitialAssignmentsOrCentroids() function will call the appropriate
180  // function in the InitialPartitionPolicy to return either assignments or
181  // centroids. We prefer centroids, but if assignments are returned, then we
182  // have to calculate the initial centroids for the first iteration.
183  arma::Row<size_t> assignments;
184  bool gotAssignments = GetInitialAssignmentsOrCentroids(partitioner, data,
185  clusters, assignments, centroids);
186  if (gotAssignments)
187  {
188  // The partitioner gives assignments, so we need to calculate centroids
189  // from those assignments.
190  arma::Row<size_t> counts;
191  counts.zeros(clusters);
192  centroids.zeros(data.n_rows, clusters);
193  for (size_t i = 0; i < data.n_cols; ++i)
194  {
195  centroids.col(assignments[i]) += arma::vec(data.col(i));
196  counts[assignments[i]]++;
197  }
198 
199  for (size_t i = 0; i < clusters; ++i)
200  if (counts[i] != 0)
201  centroids.col(i) /= counts[i];
202  }
203  }
204 
205  // Counts of points in each cluster.
206  arma::Col<size_t> counts(clusters);
207 
208  size_t iteration = 0;
209 
210  LloydStepType<MetricType, MatType> lloydStep(data, metric);
211  arma::mat centroidsOther;
212  double cNorm;
213 
214  do
215  {
216  // We have two centroid matrices. We don't want to copy anything, so,
217  // depending on the iteration number, we use a different centroid matrix...
218  if (iteration % 2 == 0)
219  cNorm = lloydStep.Iterate(centroids, centroidsOther, counts);
220  else
221  cNorm = lloydStep.Iterate(centroidsOther, centroids, counts);
222 
223  // If we are not allowing empty clusters, then check that all of our
224  // clusters have points.
225  for (size_t i = 0; i < counts.n_elem; ++i)
226  {
227  if (counts[i] == 0)
228  {
229  Log::Info << "Cluster " << i << " is empty.\n";
230  if (iteration % 2 == 0)
231  emptyClusterAction.EmptyCluster(data, i, centroids, centroidsOther,
232  counts, metric, iteration);
233  else
234  emptyClusterAction.EmptyCluster(data, i, centroidsOther, centroids,
235  counts, metric, iteration);
236  }
237  }
238 
239  iteration++;
240  Log::Info << "KMeans::Cluster(): iteration " << iteration << ", residual "
241  << cNorm << ".\n";
242  if (std::isnan(cNorm) || std::isinf(cNorm))
243  cNorm = 1e-4; // Keep iterating.
244  } while (cNorm > 1e-5 && iteration != maxIterations);
245 
246  // If we ended on an even iteration, then the centroids are in the
247  // centroidsOther matrix, and we need to steal its memory (steal_mem() avoids
248  // a copy if possible).
249  if ((iteration - 1) % 2 == 0)
250  centroids.steal_mem(centroidsOther);
251 
252  if (iteration != maxIterations)
253  {
254  Log::Info << "KMeans::Cluster(): converged after " << iteration
255  << " iterations." << std::endl;
256  }
257  else
258  {
259  Log::Info << "KMeans::Cluster(): terminated after limit of " << iteration
260  << " iterations." << std::endl;
261  }
262  Log::Info << lloydStep.DistanceCalculations() << " distance calculations."
263  << std::endl;
264 }
265 
270 template<typename MetricType,
271  typename InitialPartitionPolicy,
272  typename EmptyClusterPolicy,
273  template<class, class> class LloydStepType,
274  typename MatType>
275 void KMeans<
276  MetricType,
277  InitialPartitionPolicy,
278  EmptyClusterPolicy,
279  LloydStepType,
280  MatType>::
281 Cluster(const MatType& data,
282  const size_t clusters,
283  arma::Row<size_t>& assignments,
284  arma::mat& centroids,
285  const bool initialAssignmentGuess,
286  const bool initialCentroidGuess)
287 {
288  // Now, the initial assignments. First determine if they are necessary.
289  if (initialAssignmentGuess)
290  {
291  if (assignments.n_elem != data.n_cols)
292  Log::Fatal << "KMeans::Cluster(): initial cluster assignments (length "
293  << assignments.n_elem << ") not the same size as the dataset (size "
294  << data.n_cols << ")!" << std::endl;
295 
296  // Calculate initial centroids.
297  arma::Row<size_t> counts;
298  counts.zeros(clusters);
299  centroids.zeros(data.n_rows, clusters);
300  for (size_t i = 0; i < data.n_cols; ++i)
301  {
302  centroids.col(assignments[i]) += arma::vec(data.col(i));
303  counts[assignments[i]]++;
304  }
305 
306  for (size_t i = 0; i < clusters; ++i)
307  if (counts[i] != 0)
308  centroids.col(i) /= counts[i];
309  }
310 
311  Cluster(data, clusters, centroids,
312  initialAssignmentGuess || initialCentroidGuess);
313 
314  // Calculate final assignments in parallel over the entire dataset.
315  assignments.set_size(data.n_cols);
316 
317  #pragma omp parallel for
318  for (omp_size_t i = 0; i < (omp_size_t) data.n_cols; ++i)
319  {
320  // Find the closest centroid to this point.
321  double minDistance = std::numeric_limits<double>::infinity();
322  size_t closestCluster = centroids.n_cols; // Invalid value.
323 
324  for (size_t j = 0; j < centroids.n_cols; ++j)
325  {
326  const double distance = metric.Evaluate(data.col(i), centroids.col(j));
327 
328  if (distance < minDistance)
329  {
330  minDistance = distance;
331  closestCluster = j;
332  }
333  }
334 
335  Log::Assert(closestCluster != centroids.n_cols);
336  assignments[i] = closestCluster;
337  }
338 }
339 
340 template<typename MetricType,
341  typename InitialPartitionPolicy,
342  typename EmptyClusterPolicy,
343  template<class, class> class LloydStepType,
344  typename MatType>
345 template<typename Archive>
346 void KMeans<MetricType,
347  InitialPartitionPolicy,
348  EmptyClusterPolicy,
349  LloydStepType,
350  MatType>::serialize(Archive& ar, const uint32_t /* version */)
351 {
352  ar(CEREAL_NVP(maxIterations));
353  ar(CEREAL_NVP(metric));
354  ar(CEREAL_NVP(partitioner));
355  ar(CEREAL_NVP(emptyClusterAction));
356 }
357 
358 } // namespace kmeans
359 } // namespace mlpack
void Cluster(const MatType &data, const size_t clusters, arma::Row< size_t > &assignments, const bool initialGuess=false)
Perform k-means clustering on the data, returning a list of cluster assignments.
Definition: kmeans_impl.hpp:124
static MLPACK_EXPORT util::PrefixedOutStream Fatal
Prints fatal messages prefixed with [FATAL], then terminates the program.
Definition: log.hpp:90
Linear algebra utility functions, generally performed on matrices or vectors.
Definition: cv.hpp:1
void serialize(Archive &ar, const uint32_t version)
Serialize the k-means object.
Definition: kmeans_impl.hpp:350
HAS_MEM_FUNC(Cluster, GivesCentroidsCheck)
This gives us a GivesCentroids object that we can use to tell whether or not an InitialPartitionPolic...
bool GetInitialAssignmentsOrCentroids(InitialPartitionPolicy &ipp, const MatType &data, const size_t clusters, arma::Row< size_t > &assignments, arma::mat &, const typename std::enable_if_t< !GivesCentroids< InitialPartitionPolicy >::value > *=0)
Call the initial partition policy, if it returns assignments.
Definition: kmeans_impl.hpp:49
static MLPACK_EXPORT util::PrefixedOutStream Warn
Prints warning messages prefixed with [WARN ].
Definition: log.hpp:87
static MLPACK_EXPORT util::PrefixedOutStream Info
Prints informational messages if –verbose is specified, prefixed with [INFO ].
Definition: log.hpp:84
&#39;value&#39; is true if the InitialPartitionPolicy class has a member Cluster(const arma::mat& data...
Definition: kmeans_impl.hpp:32
This class implements K-Means clustering, using a variety of possible implementations of Lloyd&#39;s algo...
Definition: kmeans.hpp:73
static void Assert(bool condition, const std::string &message="Assert Failed.")
Checks if the specified condition is true.
Definition: log.cpp:38