mlpack
max_variance_new_cluster_impl.hpp
Go to the documentation of this file.
1 
12 #ifndef MLPACK_METHODS_KMEANS_MAX_VARIANCE_NEW_CLUSTER_IMPL_HPP
13 #define MLPACK_METHODS_KMEANS_MAX_VARIANCE_NEW_CLUSTER_IMPL_HPP
14 
15 // Just in case it has not been included.
17 
18 namespace mlpack {
19 namespace kmeans {
20 
24 template<typename MetricType, typename MatType>
26  const size_t emptyCluster,
27  const arma::mat& oldCentroids,
28  arma::mat& newCentroids,
29  arma::Col<size_t>& clusterCounts,
30  MetricType& metric,
31  const size_t iteration)
32 {
33  // If necessary, calculate the variances and assignments.
34  if (iteration != this->iteration || assignments.n_elem != data.n_cols)
35  Precalculate(data, oldCentroids, clusterCounts, metric);
36  this->iteration = iteration;
37 
38  // Now find the cluster with maximum variance.
39  arma::uword maxVarCluster = 0;
40  variances.max(maxVarCluster);
41 
42  // If the cluster with maximum variance has variance of 0, then we can't
43  // continue. All the points are the same.
44  if (variances[maxVarCluster] == 0.0)
45  return;
46 
47  // Now, inside this cluster, find the point which is furthest away.
48  size_t furthestPoint = data.n_cols;
49  double maxDistance = -DBL_MAX;
50  for (size_t i = 0; i < data.n_cols; ++i)
51  {
52  if (assignments[i] == maxVarCluster)
53  {
54  const double distance = std::pow(metric.Evaluate(data.col(i),
55  newCentroids.col(maxVarCluster)), 2.0);
56 
57  if (distance > maxDistance)
58  {
59  maxDistance = distance;
60  furthestPoint = i;
61  }
62  }
63  }
64 
65  // Take that point and add it to the empty cluster.
66  newCentroids.col(maxVarCluster) *= (double(clusterCounts[maxVarCluster]) /
67  double(clusterCounts[maxVarCluster] - 1));
68  newCentroids.col(maxVarCluster) -= (1.0 / (clusterCounts[maxVarCluster] -
69  1.0)) * arma::vec(data.col(furthestPoint));
70  clusterCounts[maxVarCluster]--;
71  clusterCounts[emptyCluster]++;
72  newCentroids.col(emptyCluster) = arma::vec(data.col(furthestPoint));
73  assignments[furthestPoint] = emptyCluster;
74 
75  // Modify the variances, as necessary.
76  variances[emptyCluster] = 0;
77  // One has already been subtracted from clusterCounts[maxVarCluster]. If
78  // EmptyCluster() is called again, we can't pull another point from
79  // maxVarCluster (we'd be making an empty cluster), so force a call to
80  // Precalculate() if EmptyCluster() is called again by changing
81  // this->iteration.
82  if (clusterCounts[maxVarCluster] <= 1)
83  {
84  variances[maxVarCluster] = 0;
85  --this->iteration;
86  }
87  else
88  {
89  variances[maxVarCluster] = (1.0 / clusterCounts[maxVarCluster]) *
90  ((clusterCounts[maxVarCluster] + 1) * variances[maxVarCluster] -
91  maxDistance);
92  }
93 
94  // Output some debugging information.
95  Log::Debug << "Point " << furthestPoint << " assigned to empty cluster " <<
96  emptyCluster << ".\n";
97 }
98 
100 template<typename Archive>
101 void MaxVarianceNewCluster::serialize(Archive& /* ar */,
102  const uint32_t /* version */)
103 {
104  // Serialization is useless here, because the only thing we store is
105  // precalculated quantities, and if we're serializing, our precalculations are
106  // likely to be useless when we deserialize (because the user will be running
107  // a different clustering, probably). So there is no need to store anything,
108  // and if we are loading, we just reset the assignments array so
109  // precalculation will happen next time EmptyCluster() is called.
110  if (cereal::is_loading<Archive>())
111  assignments.set_size(0);
112 }
113 
114 template<typename MetricType, typename MatType>
115 void MaxVarianceNewCluster::Precalculate(const MatType& data,
116  const arma::mat& oldCentroids,
117  arma::Col<size_t>& clusterCounts,
118  MetricType& metric)
119 {
120  // We have to calculate the variances of each cluster and the assignments of
121  // each point. This is most easily done by iterating through the entire
122  // dataset.
123  variances.zeros(oldCentroids.n_cols);
124  assignments.set_size(data.n_cols);
125 
126  // Add the variance of each point's distance away from the cluster. I think
127  // this is the sensible thing to do.
128  for (size_t i = 0; i < data.n_cols; ++i)
129  {
130  // Find the closest centroid to this point.
131  double minDistance = std::numeric_limits<double>::infinity();
132  size_t closestCluster = oldCentroids.n_cols; // Invalid value.
133 
134  for (size_t j = 0; j < oldCentroids.n_cols; ++j)
135  {
136  const double distance = metric.Evaluate(data.col(i), oldCentroids.col(j));
137 
138  if (distance < minDistance)
139  {
140  minDistance = distance;
141  closestCluster = j;
142  }
143  }
144 
145  assignments[i] = closestCluster;
146  variances[closestCluster] += std::pow(metric.Evaluate(data.col(i),
147  oldCentroids.col(closestCluster)), 2.0);
148  }
149 
150  // Divide by the number of points in the cluster to produce the variance,
151  // unless the cluster is empty or contains only one point, in which case we
152  // set the variance to 0.
153  for (size_t i = 0; i < clusterCounts.n_elem; ++i)
154  if (clusterCounts[i] <= 1)
155  variances[i] = 0;
156  else
157  variances[i] /= clusterCounts[i];
158 }
159 
160 } // namespace kmeans
161 } // namespace mlpack
162 
163 #endif
static MLPACK_EXPORT util::NullOutStream Debug
MLPACK_EXPORT is required for global variables, so that they are properly exported by the Windows com...
Definition: log.hpp:79
Linear algebra utility functions, generally performed on matrices or vectors.
Definition: cv.hpp:1
void EmptyCluster(const MatType &data, const size_t emptyCluster, const arma::mat &oldCentroids, arma::mat &newCentroids, arma::Col< size_t > &clusterCounts, MetricType &metric, const size_t iteration)
Take the point furthest from the centroid of the cluster with maximum variance to be a new cluster...
Definition: max_variance_new_cluster_impl.hpp:25
void serialize(Archive &ar, const uint32_t version)
Serialize the object.
Definition: max_variance_new_cluster_impl.hpp:101