mlpack
em_fit_impl.hpp
Go to the documentation of this file.
1 
13 #ifndef MLPACK_METHODS_GMM_EM_FIT_IMPL_HPP
14 #define MLPACK_METHODS_GMM_EM_FIT_IMPL_HPP
15 
16 // In case it hasn't been included yet.
17 #include "em_fit.hpp"
18 #include "diagonal_constraint.hpp"
20 
21 namespace mlpack {
22 namespace gmm {
23 
25 template<typename InitialClusteringType,
26  typename CovarianceConstraintPolicy,
27  typename Distribution>
29  const size_t maxIterations,
30  const double tolerance,
31  InitialClusteringType clusterer,
32  CovarianceConstraintPolicy constraint) :
33  maxIterations(maxIterations),
34  tolerance(tolerance),
35  clusterer(clusterer),
36  constraint(constraint)
37 { /* Nothing to do. */ }
38 
39 template<typename InitialClusteringType,
40  typename CovarianceConstraintPolicy,
41  typename Distribution>
43 Estimate(const arma::mat& observations,
44  std::vector<Distribution>& dists,
45  arma::vec& weights,
46  const bool useInitialModel)
47 {
48  if (std::is_same<Distribution,
50  {
51  #ifdef _WIN32
52  Log::Warn << "Cannot use arma::gmm_diag on Visual Studio due to OpenMP"
53  << " compilation issues! Using slower EMFit::Estimate() instead..."
54  << std::endl;
55  #else
56  ArmadilloGMMWrapper(observations, dists, weights, useInitialModel);
57  return;
58  #endif
59  }
60  else if (std::is_same<CovarianceConstraintPolicy, DiagonalConstraint>::value
61  && std::is_same<Distribution, distribution::GaussianDistribution>::value)
62  {
63  // EMFit::Estimate() using DiagonalConstraint with GaussianDistribution
64  // makes use of slower implementation.
65  Log::Warn << "EMFit::Estimate() using DiagonalConstraint with "
66  << "GaussianDistribution makes use of slower implementation, so "
67  << "DiagonalGMM is recommended for faster training." << std::endl;
68  }
69 
70  // Only perform initial clustering if the user wanted it.
71  if (!useInitialModel)
72  InitialClustering(observations, dists, weights);
73 
74  double l = LogLikelihood(observations, dists, weights);
75 
76  Log::Debug << "EMFit::Estimate(): initial clustering log-likelihood: "
77  << l << std::endl;
78 
79  double lOld = -DBL_MAX;
80  arma::mat condLogProb(observations.n_cols, dists.size());
81 
82  // Iterate to update the model until no more improvement is found.
83  size_t iteration = 1;
84  while (std::abs(l - lOld) > tolerance && iteration != maxIterations)
85  {
86  Log::Info << "EMFit::Estimate(): iteration " << iteration << ", "
87  << "log-likelihood " << l << "." << std::endl;
88 
89  // Calculate the conditional probabilities of choosing a particular
90  // Gaussian given the observations and the present theta value.
91  for (size_t i = 0; i < dists.size(); ++i)
92  {
93  // Store conditional log probabilities into condLogProb vector for each
94  // Gaussian. First we make an alias of the condLogProb vector.
95  arma::vec condLogProbAlias = condLogProb.unsafe_col(i);
96  dists[i].LogProbability(observations, condLogProbAlias);
97  condLogProbAlias += log(weights[i]);
98  }
99 
100  // Normalize row-wise.
101  for (size_t i = 0; i < condLogProb.n_rows; ++i)
102  {
103  // Avoid dividing by zero; if the probability for everything is 0, we
104  // don't want to make it NaN.
105  const double probSum = mlpack::math::AccuLog(condLogProb.row(i));
106  if (probSum != -std::numeric_limits<double>::infinity())
107  condLogProb.row(i) -= probSum;
108  }
109 
110  // Store the sum of the probability of each state over all the observations.
111  arma::vec probRowSums(dists.size());
112  for (size_t i = 0; i < dists.size(); ++i)
113  {
114  probRowSums(i) = mlpack::math::AccuLog(condLogProb.col(i));
115  }
116 
117  // Calculate the new value of the means using the updated conditional
118  // probabilities.
119  for (size_t i = 0; i < dists.size(); ++i)
120  {
121  // Don't update if there's no probability of the Gaussian having points.
122  if (probRowSums[i] != -std::numeric_limits<double>::infinity())
123  dists[i].Mean() = observations * arma::exp(condLogProb.col(i) -
124  probRowSums[i]);
125  else
126  continue;
127 
128  // Calculate the new value of the covariances using the updated
129  // conditional probabilities and the updated means.
130  arma::mat tmp = observations.each_col() - dists[i].Mean();
131 
132  // If the distribution is DiagonalGaussianDistribution, calculate the
133  // covariance only with diagonal components.
134  if (std::is_same<Distribution,
136  {
137  arma::vec covariance = arma::sum((tmp % tmp) %
138  (arma::ones<arma::vec>(observations.n_rows) *
139  trans(arma::exp(condLogProb.col(i) - probRowSums[i]))), 1);
140 
141  // Apply covariance constraint.
142  constraint.ApplyConstraint(covariance);
143  dists[i].Covariance(std::move(covariance));
144  }
145  else
146  {
147  arma::mat tmpB = tmp.each_row() % trans(arma::exp(condLogProb.col(i) -
148  probRowSums[i]));
149  arma::mat covariance = tmp * trans(tmpB);
150 
151  // Apply covariance constraint.
152  constraint.ApplyConstraint(covariance);
153  dists[i].Covariance(std::move(covariance));
154  }
155  }
156 
157  // Calculate the new values for omega using the updated conditional
158  // probabilities.
159  weights = arma::exp(probRowSums - std::log(observations.n_cols));
160 
161  // Update values of l; calculate new log-likelihood.
162  lOld = l;
163  l = LogLikelihood(observations, dists, weights);
164 
165  iteration++;
166  }
167 }
168 
169 template<typename InitialClusteringType,
170  typename CovarianceConstraintPolicy,
171  typename Distribution>
173 Estimate(const arma::mat& observations,
174  const arma::vec& probabilities,
175  std::vector<Distribution>& dists,
176  arma::vec& weights,
177  const bool useInitialModel)
178 {
179  if (!useInitialModel)
180  InitialClustering(observations, dists, weights);
181 
182  double l = LogLikelihood(observations, dists, weights);
183 
184  Log::Debug << "EMFit::Estimate(): initial clustering log-likelihood: "
185  << l << std::endl;
186 
187  double lOld = -DBL_MAX;
188  arma::mat condLogProb(observations.n_cols, dists.size());
189 
190  // Iterate to update the model until no more improvement is found.
191  size_t iteration = 1;
192  while (std::abs(l - lOld) > tolerance && iteration != maxIterations)
193  {
194  // Calculate the conditional probabilities of choosing a particular
195  // Gaussian given the observations and the present theta value.
196  for (size_t i = 0; i < dists.size(); ++i)
197  {
198  // Store conditional log probabilities into condLogProb vector for each
199  // Gaussian. First we make an alias of the condLogProb vector.
200  arma::vec condLogProbAlias = condLogProb.unsafe_col(i);
201  dists[i].LogProbability(observations, condLogProbAlias);
202  condLogProbAlias += log(weights[i]);
203  }
204 
205  // Normalize row-wise.
206  for (size_t i = 0; i < condLogProb.n_rows; ++i)
207  {
208  // Avoid dividing by zero; if the probability for everything is 0, we
209  // don't want to make it NaN.
210  const double probSum = mlpack::math::AccuLog(condLogProb.row(i));
211  if (probSum != -std::numeric_limits<double>::infinity())
212  condLogProb.row(i) -= probSum;
213  }
214 
215  // This will store the sum of probabilities of each state over all the
216  // observations.
217  arma::vec probRowSums(dists.size());
218 
219  // Calculate the new value of the means using the updated conditional
220  // probabilities.
221  arma::vec logProbabilities = arma::log(probabilities);
222  for (size_t i = 0; i < dists.size(); ++i)
223  {
224  // Calculate the sum of probabilities of points, which is the
225  // conditional probability of each point being from Gaussian i
226  // multiplied by the probability of the point being from this mixture
227  // model.
228  arma::vec tmpProb = condLogProb.col(i) + logProbabilities;
229  probRowSums[i] = mlpack::math::AccuLog(tmpProb);
230 
231  // Don't update if there's no probability of the Gaussian having points.
232  if (probRowSums[i] != -std::numeric_limits<double>::infinity())
233  {
234  dists[i].Mean() = observations *
235  arma::exp(condLogProb.col(i) + logProbabilities - probRowSums[i]);
236  }
237  else
238  continue;
239 
240  // Calculate the new value of the covariances using the updated
241  // conditional probabilities and the updated means.
242  arma::mat tmp = observations.each_col() - dists[i].Mean();
243 
244  // If the distribution is DiagonalGaussianDistribution, calculate the
245  // covariance only with diagonal components.
246  if (std::is_same<Distribution,
248  {
249  arma::vec cov = arma::sum((tmp % tmp) %
250  (arma::ones<arma::vec>(observations.n_rows) *
251  trans(arma::exp(condLogProb.col(i) +
252  logProbabilities - probRowSums[i]))), 1);
253 
254  // Apply covariance constraint.
255  constraint.ApplyConstraint(cov);
256  dists[i].Covariance(std::move(cov));
257  }
258  else
259  {
260  arma::mat tmpB = tmp.each_row() % trans(arma::exp(condLogProb.col(i) +
261  logProbabilities - probRowSums[i]));
262  arma::mat cov = (tmp * trans(tmpB));
263 
264  // Apply covariance constraint.
265  constraint.ApplyConstraint(cov);
266  dists[i].Covariance(std::move(cov));
267  }
268  }
269 
270  // Calculate the new values for omega using the updated conditional
271  // probabilities.
272  weights = arma::exp(probRowSums - mlpack::math::AccuLog(logProbabilities));
273 
274  // Update values of l; calculate new log-likelihood.
275  lOld = l;
276  l = LogLikelihood(observations, dists, weights);
277 
278  iteration++;
279  }
280 }
281 
282 template<typename InitialClusteringType,
283  typename CovarianceConstraintPolicy,
284  typename Distribution>
286 InitialClustering(const arma::mat& observations,
287  std::vector<Distribution>& dists,
288  arma::vec& weights)
289 {
290  // Assignments from clustering.
291  arma::Row<size_t> assignments;
292 
293  // Run clustering algorithm.
294  clusterer.Cluster(observations, dists.size(), assignments);
295 
296  // Check if the type of Distribution is DiagonalGaussianDistribution. If so,
297  // we can get faster performance by using diagonal elements when calculating
298  // the covariance.
299  const bool isDiagGaussDist = std::is_same<Distribution,
301 
302  std::vector<arma::vec> means(dists.size());
303 
304  // Conditional covariance instantiation.
305  std::vector<typename std::conditional<isDiagGaussDist,
306  arma::vec, arma::mat>::type> covs(dists.size());
307 
308  // Now calculate the means, covariances, and weights.
309  weights.zeros();
310  for (size_t i = 0; i < dists.size(); ++i)
311  {
312  means[i].zeros(dists[i].Mean().n_elem);
313  if (isDiagGaussDist)
314  {
315  covs[i].zeros(dists[i].Covariance().n_elem);
316  }
317  else
318  {
319  covs[i].zeros(dists[i].Covariance().n_rows,
320  dists[i].Covariance().n_cols);
321  }
322  }
323 
324  // From the assignments, generate our means, covariances, and weights.
325  for (size_t i = 0; i < observations.n_cols; ++i)
326  {
327  const size_t cluster = assignments[i];
328 
329  // Add this to the relevant mean.
330  means[cluster] += observations.col(i);
331 
332  // Add this to the relevant covariance.
333  if (isDiagGaussDist)
334  covs[cluster] += observations.col(i) % observations.col(i);
335  else
336  covs[cluster] += observations.col(i) * trans(observations.col(i));
337 
338  // Now add one to the weights (we will normalize).
339  weights[cluster]++;
340  }
341 
342  // Now normalize the mean and covariance.
343  for (size_t i = 0; i < dists.size(); ++i)
344  {
345  means[i] /= (weights[i] > 1) ? weights[i] : 1;
346  }
347 
348  for (size_t i = 0; i < observations.n_cols; ++i)
349  {
350  const size_t cluster = assignments[i];
351  const arma::vec normObs = observations.col(i) - means[cluster];
352  if (isDiagGaussDist)
353  covs[cluster] += normObs % normObs;
354  else
355  covs[cluster] += normObs * normObs.t();
356  }
357 
358  for (size_t i = 0; i < dists.size(); ++i)
359  {
360  covs[i] /= (weights[i] > 1) ? weights[i] : 1;
361 
362  // Apply constraints to covariance matrix.
363  if (isDiagGaussDist)
364  covs[i] = arma::clamp(covs[i], 1e-10, DBL_MAX);
365  else
366  constraint.ApplyConstraint(covs[i]);
367 
368  std::swap(dists[i].Mean(), means[i]);
369  dists[i].Covariance(std::move(covs[i]));
370  }
371 
372  // Finally, normalize weights.
373  weights /= accu(weights);
374 }
375 
376 template<typename InitialClusteringType,
377  typename CovarianceConstraintPolicy,
378  typename Distribution>
380 LogLikelihood(const arma::mat& observations,
381  const std::vector<Distribution>& dists,
382  const arma::vec& weights) const
383 {
384  double logLikelihood = 0;
385 
386  arma::vec logPhis;
387  arma::mat logLikelihoods(dists.size(), observations.n_cols);
388 
389  // It has to be LogProbability() otherwise Probability() would overflow easily
390  for (size_t i = 0; i < dists.size(); ++i)
391  {
392  dists[i].LogProbability(observations, logPhis);
393  logLikelihoods.row(i) = log(weights(i)) + trans(logPhis);
394  }
395  // Now sum over every point.
396  for (size_t j = 0; j < observations.n_cols; ++j)
397  {
398  if (mlpack::math::AccuLog(logLikelihoods.col(j)) ==
399  -std::numeric_limits<double>::infinity())
400  {
401  Log::Info << "Likelihood of point " << j << " is 0! It is probably an "
402  << "outlier." << std::endl;
403  }
404  logLikelihood += mlpack::math::AccuLog(logLikelihoods.col(j));
405  }
406 
407  return logLikelihood;
408 }
409 
410 template<typename InitialClusteringType,
411  typename CovarianceConstraintPolicy,
412  typename Distribution>
413 template<typename Archive>
415 serialize(Archive& ar, const uint32_t /* version */)
416 {
417  ar(CEREAL_NVP(maxIterations));
418  ar(CEREAL_NVP(tolerance));
419  ar(CEREAL_NVP(clusterer));
420  ar(CEREAL_NVP(constraint));
421 }
422 
423 template<typename InitialClusteringType,
424  typename CovarianceConstraintPolicy,
425  typename Distribution>
427 ArmadilloGMMWrapper(const arma::mat& observations,
428  std::vector<Distribution>& dists,
429  arma::vec& weights,
430  const bool useInitialModel)
431 {
432  arma::gmm_diag g;
433 
434  // Warn the user that tolerance isn't used for convergence here if they've
435  // specified a non-default value.
436  if (tolerance != EMFit().Tolerance())
437  Log::Warn << "GMM::Train(): tolerance ignored when training GMMs with "
438  << "DiagonalConstraint." << std::endl;
439 
440  // If the initial clustering is the default k-means, we'll just use
441  // Armadillo's implementation. If mlpack ever changes k-means defaults to use
442  // something that is reliably quicker than the Lloyd iteration k-means update,
443  // then this code maybe should be revisited.
444  if (!std::is_same<InitialClusteringType, mlpack::kmeans::KMeans<>>::value ||
445  useInitialModel)
446  {
447  // Use clusterer to get initial values.
448  if (!useInitialModel)
449  InitialClustering(observations, dists, weights);
450 
451  // Assemble matrix of means.
452  arma::mat means(observations.n_rows, dists.size());
453  arma::mat covs(observations.n_rows, dists.size());
454  for (size_t i = 0; i < dists.size(); ++i)
455  {
456  means.col(i) = dists[i].Mean();
457 
458  // DiagonalGaussianDistribution has diagonal covariance as an arma::vec.
459  covs.col(i) = dists[i].Covariance();
460  }
461 
462  g.reset(observations.n_rows, dists.size());
463  g.set_params(std::move(means), std::move(covs), weights.t());
464 
465  g.learn(observations, dists.size(), arma::eucl_dist, arma::keep_existing, 0,
466  maxIterations, 1e-10, false /* no printing */);
467  }
468  else
469  {
470  // Use Armadillo for the initial clustering. We'll try and match mlpack
471  // defaults.
472  g.learn(observations, dists.size(), arma::eucl_dist, arma::static_subset,
473  1000, maxIterations, 1e-10, false /* no printing */);
474  }
475 
476  // Extract means, covariances, and weights.
477  weights = g.hefts.t();
478  for (size_t i = 0; i < dists.size(); ++i)
479  {
480  dists[i].Mean() = g.means.col(i);
481 
482  // Apply covariance constraint.
483  arma::vec covsAlias = g.dcovs.unsafe_col(i);
484  constraint.ApplyConstraint(covsAlias);
485 
486  // DiagonalGaussianDistribution has diagonal covariance as an arma::vec.
487  dists[i].Covariance(g.dcovs.col(i));
488  }
489 }
490 
491 } // namespace gmm
492 } // namespace mlpack
493 
494 #endif
This class contains methods which can fit a GMM to observations using the EM algorithm.
Definition: em_fit.hpp:45
static MLPACK_EXPORT util::NullOutStream Debug
MLPACK_EXPORT is required for global variables, so that they are properly exported by the Windows com...
Definition: log.hpp:79
Linear algebra utility functions, generally performed on matrices or vectors.
Definition: cv.hpp:1
EMFit(const size_t maxIterations=300, const double tolerance=1e-10, InitialClusteringType clusterer=InitialClusteringType(), CovarianceConstraintPolicy constraint=CovarianceConstraintPolicy())
Construct the EMFit object, optionally passing an InitialClusteringType object (just in case it needs...
Definition: em_fit_impl.hpp:28
static MLPACK_EXPORT util::PrefixedOutStream Warn
Prints warning messages prefixed with [WARN ].
Definition: log.hpp:87
void serialize(Archive &ar, const uint32_t version)
Serialize the fitter.
Definition: em_fit_impl.hpp:415
double Tolerance() const
Get the tolerance for the convergence of the EM algorithm.
Definition: em_fit.hpp:126
T::elem_type AccuLog(const T &x)
Log-sum a vector of log values.
Definition: log_add_impl.hpp:63
static MLPACK_EXPORT util::PrefixedOutStream Info
Prints informational messages if –verbose is specified, prefixed with [INFO ].
Definition: log.hpp:84
void Estimate(const arma::mat &observations, std::vector< Distribution > &dists, arma::vec &weights, const bool useInitialModel=false)
Fit the observations to a Gaussian mixture model (GMM) using the EM algorithm.
Definition: em_fit_impl.hpp:43
This class implements K-Means clustering, using a variety of possible implementations of Lloyd&#39;s algo...
Definition: kmeans.hpp:73
A single multivariate Gaussian distribution with diagonal covariance.
Definition: diagonal_gaussian_distribution.hpp:21