mlpack
refined_start_impl.hpp
Go to the documentation of this file.
1 
14 #ifndef MLPACK_METHODS_KMEANS_REFINED_START_IMPL_HPP
15 #define MLPACK_METHODS_KMEANS_REFINED_START_IMPL_HPP
16 
17 // In case it hasn't been included yet.
18 #include "refined_start.hpp"
19 
20 namespace mlpack {
21 namespace kmeans {
22 
24 template<typename MatType>
25 void RefinedStart::Cluster(const MatType& data,
26  const size_t clusters,
27  arma::mat& centroids) const
28 {
29  // This will hold the sampled datasets.
30  const size_t numPoints = size_t(percentage * data.n_cols);
31  MatType sampledData(data.n_rows, numPoints);
32  // vector<bool> is packed so each bool is 1 bit.
33  std::vector<bool> pointsUsed(data.n_cols, false);
34  arma::mat sampledCentroids(data.n_rows, samplings * clusters);
35 
36  for (size_t i = 0; i < samplings; ++i)
37  {
38  // First, assemble the sampled dataset.
39  size_t curSample = 0;
40  while (curSample < numPoints)
41  {
42  // Pick a random point in [0, numPoints).
43  size_t sample = (size_t) math::RandInt(data.n_cols);
44 
45  if (!pointsUsed[sample])
46  {
47  // This point isn't used yet. So we'll put it in our sample.
48  pointsUsed[sample] = true;
49  sampledData.col(curSample) = data.col(sample);
50  ++curSample;
51  }
52  }
53 
54  // Now, using the sampled dataset, run k-means. In the case of an empty
55  // cluster, we re-initialize that cluster as the point furthest away from
56  // the cluster with maximum variance. This is not *exactly* what the paper
57  // implements, but it is quite similar, and we'll call it "good enough".
58  KMeans<> kmeans;
59  kmeans.Cluster(sampledData, clusters, centroids);
60 
61  // Store the sampled centroids.
62  sampledCentroids.cols(i * clusters, (i + 1) * clusters - 1) = centroids;
63 
64  pointsUsed.assign(data.n_cols, false);
65  }
66 
67  // Now, we run k-means on the sampled centroids to get our final clusters.
68  KMeans<> kmeans;
69  kmeans.Cluster(sampledCentroids, clusters, centroids);
70 }
71 
72 template<typename MatType>
73 void RefinedStart::Cluster(const MatType& data,
74  const size_t clusters,
75  arma::Row<size_t>& assignments) const
76 {
77  // Perform the Bradley-Fayyad refined start algorithm, and get initial
78  // centroids back.
79  arma::mat centroids;
80  Cluster(data, clusters, centroids);
81 
82  // Turn the final centroids into assignments.
83  assignments.set_size(data.n_cols);
84  for (size_t i = 0; i < data.n_cols; ++i)
85  {
86  // Find the closest centroid to this point.
87  double minDistance = std::numeric_limits<double>::infinity();
88  size_t closestCluster = clusters;
89 
90  for (size_t j = 0; j < clusters; ++j)
91  {
92  // This is restricted to the L2 distance, and unfortunately it would take
93  // a lot of refactoring and redesign to make this more general... we would
94  // probably need to have KMeans take a template template parameter for the
95  // initial partition policy. It's not clear how to best do this.
96  const double distance = metric::EuclideanDistance::Evaluate(data.col(i),
97  centroids.col(j));
98 
99  if (distance < minDistance)
100  {
101  minDistance = distance;
102  closestCluster = j;
103  }
104  }
105 
106  // Assign the point to its closest cluster.
107  assignments[i] = closestCluster;
108  }
109 }
110 
111 } // namespace kmeans
112 } // namespace mlpack
113 
114 #endif
void Cluster(const MatType &data, const size_t clusters, arma::Row< size_t > &assignments, const bool initialGuess=false)
Perform k-means clustering on the data, returning a list of cluster assignments.
Definition: kmeans_impl.hpp:124
Linear algebra utility functions, generally performed on matrices or vectors.
Definition: cv.hpp:1
void Cluster(const MatType &data, const size_t clusters, arma::mat &centroids) const
Partition the given dataset into the given number of clusters according to the random sampling scheme...
Definition: refined_start_impl.hpp:25
static VecTypeA::elem_type Evaluate(const VecTypeA &a, const VecTypeB &b)
Computes the distance between two points.
Definition: lmetric_impl.hpp:24
int RandInt(const int hiExclusive)
Generates a uniform random integer.
Definition: random.hpp:110
This class implements K-Means clustering, using a variety of possible implementations of Lloyd&#39;s algo...
Definition: kmeans.hpp:73