mlpack
dbscan_impl.hpp
Go to the documentation of this file.
1 
12 #ifndef MLPACK_METHODS_DBSCAN_DBSCAN_IMPL_HPP
13 #define MLPACK_METHODS_DBSCAN_DBSCAN_IMPL_HPP
14 
15 #include "dbscan.hpp"
16 
17 namespace mlpack {
18 namespace dbscan {
19 
23 template<typename RangeSearchType, typename PointSelectionPolicy>
25  const double epsilon,
26  const size_t minPoints,
27  const bool batchMode,
28  RangeSearchType rangeSearch,
29  PointSelectionPolicy pointSelector) :
30  epsilon(epsilon),
31  minPoints(minPoints),
32  batchMode(batchMode),
33  rangeSearch(rangeSearch),
34  pointSelector(pointSelector)
35 {
36  // Nothing to do.
37 }
38 
43 template<typename RangeSearchType, typename PointSelectionPolicy>
44 template<typename MatType>
46  const MatType& data,
47  arma::mat& centroids)
48 {
49  // These assignments will be thrown away, but there is no way to avoid
50  // calculating them.
51  arma::Row<size_t> assignments(data.n_cols);
52  assignments.fill(SIZE_MAX);
53 
54  return Cluster(data, assignments, centroids);
55 }
56 
61 template<typename RangeSearchType, typename PointSelectionPolicy>
62 template<typename MatType>
64  const MatType& data,
65  arma::Row<size_t>& assignments,
66  arma::mat& centroids)
67 {
68  const size_t numClusters = Cluster(data, assignments);
69 
70  // Now calculate the centroids.
71  centroids.zeros(data.n_rows, numClusters);
72 
73  // Calculate number of points in each cluster.
74  arma::Row<size_t> counts;
75  counts.zeros(numClusters);
76  for (size_t i = 0; i < data.n_cols; ++i)
77  {
78  if (assignments[i] != SIZE_MAX)
79  {
80  centroids.col(assignments[i]) += data.col(i);
81  ++counts[assignments[i]];
82  }
83  }
84 
85  // We should be guaranteed that the number of clusters is always greater than
86  // zero.
87  for (size_t i = 0; i < numClusters; ++i)
88  centroids.col(i) /= counts[i];
89 
90  return numClusters;
91 }
92 
97 template<typename RangeSearchType, typename PointSelectionPolicy>
98 template<typename MatType>
100  const MatType& data,
101  arma::Row<size_t>& assignments)
102 {
103  // Initialize the UnionFind object.
104  emst::UnionFind uf(data.n_cols);
105  rangeSearch.Train(data);
106 
107  if (batchMode)
108  BatchCluster(data, uf);
109  else
110  PointwiseCluster(data, uf);
111 
112  // Now set assignments.
113  assignments.set_size(data.n_cols);
114  for (size_t i = 0; i < data.n_cols; ++i)
115  assignments[i] = uf.Find(i);
116 
117  // Get a count of all clusters.
118  const size_t numClusters = arma::max(assignments) + 1;
119  arma::Col<size_t> counts(numClusters, arma::fill::zeros);
120  for (size_t i = 0; i < assignments.n_elem; ++i)
121  counts[assignments[i]]++;
122 
123  // Now assign clusters to new indices.
124  size_t currentCluster = 0;
125  arma::Col<size_t> newAssignments(numClusters);
126  for (size_t i = 0; i < counts.n_elem; ++i)
127  {
128  if (counts[i] >= minPoints)
129  newAssignments[i] = currentCluster++;
130  else
131  newAssignments[i] = SIZE_MAX;
132  }
133 
134  // Now reassign.
135  for (size_t i = 0; i < assignments.n_elem; ++i)
136  assignments[i] = newAssignments[assignments[i]];
137 
138  Log::Info << currentCluster << " clusters found." << std::endl;
139 
140  return currentCluster;
141 }
142 
149 template<typename RangeSearchType, typename PointSelectionPolicy>
150 template<typename MatType>
152  const MatType& data,
153  emst::UnionFind& uf)
154 {
155  std::vector<std::vector<size_t>> neighbors;
156  std::vector<std::vector<double>> distances;
157 
158  for (size_t i = 0; i < data.n_cols; ++i)
159  {
160  if (i % 10000 == 0 && i > 0)
161  Log::Info << "DBSCAN clustering on point " << i << "..." << std::endl;
162 
163  // Do the range search for only this point.
164  rangeSearch.Search(data.col(i), math::Range(0.0, epsilon), neighbors,
165  distances);
166 
167  // Union to all neighbors.
168  for (size_t j = 0; j < neighbors[0].size(); ++j)
169  uf.Union(i, neighbors[0][j]);
170  }
171 }
172 
178 template<typename RangeSearchType, typename PointSelectionPolicy>
179 template<typename MatType>
181  const MatType& data,
182  emst::UnionFind& uf)
183 {
184  // For each point, find the points in epsilon-nighborhood and their distances.
185  std::vector<std::vector<size_t>> neighbors;
186  std::vector<std::vector<double>> distances;
187  Log::Info << "Performing range search." << std::endl;
188  rangeSearch.Train(data);
189  rangeSearch.Search(data, math::Range(0.0, epsilon), neighbors, distances);
190  Log::Info << "Range search complete." << std::endl;
191 
192  // Now loop over all points.
193  for (size_t i = 0; i < data.n_cols; ++i)
194  {
195  // Get the next index.
196  const size_t index = pointSelector.Select(i, data);
197  for (size_t j = 0; j < neighbors[index].size(); ++j)
198  uf.Union(index, neighbors[index][j]);
199  }
200 }
201 
202 } // namespace dbscan
203 } // namespace mlpack
204 
205 #endif
A Union-Find data structure.
Definition: union_find.hpp:30
DBSCAN (Density-Based Spatial Clustering of Applications with Noise) is a clustering technique descri...
Definition: dbscan.hpp:52
Linear algebra utility functions, generally performed on matrices or vectors.
Definition: cv.hpp:1
void Union(const size_t x, const size_t y)
Union the components containing x and y.
Definition: union_find.hpp:76
DBSCAN(const double epsilon, const size_t minPoints, const bool batchMode=true, RangeSearchType rangeSearch=RangeSearchType(), PointSelectionPolicy pointSelector=PointSelectionPolicy())
Construct the DBSCAN object with the given parameters.
Definition: dbscan_impl.hpp:24
RangeType< double > Range
3.0.0 TODO: break reverse-compatibility by changing RangeType to Range.
Definition: range.hpp:19
size_t Cluster(const MatType &data, arma::mat &centroids)
Performs DBSCAN clustering on the data, returning number of clusters and also the centroid of each cl...
Definition: dbscan_impl.hpp:45
size_t Find(const size_t x)
Returns the component containing an element.
Definition: union_find.hpp:56
static MLPACK_EXPORT util::PrefixedOutStream Info
Prints informational messages if –verbose is specified, prefixed with [INFO ].
Definition: log.hpp:84