mlpack
naive_bayes_classifier_impl.hpp
Go to the documentation of this file.
1 
17 #ifndef MLPACK_METHODS_NAIVE_BAYES_NAIVE_BAYES_CLASSIFIER_IMPL_HPP
18 #define MLPACK_METHODS_NAIVE_BAYES_NAIVE_BAYES_CLASSIFIER_IMPL_HPP
19 
20 #include <mlpack/prereqs.hpp>
21 
22 // In case it hasn't been included already.
24 
25 namespace mlpack {
26 namespace naive_bayes {
27 
28 template<typename ModelMatType>
29 template<typename MatType>
31  const MatType& data,
32  const arma::Row<size_t>& labels,
33  const size_t numClasses,
34  const bool incremental,
35  const double epsilon) :
36  trainingPoints(0), // Set when we call Train().
37  epsilon(epsilon)
38 {
39  static_assert(std::is_same<ElemType, typename MatType::elem_type>::value,
40  "NaiveBayesClassifier: element type of given data must match the element "
41  "type of the model!");
42 
43  // Perform training, after initializing the model to 0 (that is, if Train()
44  // won't do that for us, which it won't if we're using the incremental
45  // algorithm).
46  if (incremental)
47  {
48  probabilities.zeros(numClasses);
49  means.zeros(data.n_rows, numClasses);
50  variances.zeros(data.n_rows, numClasses);
51  }
52  else
53  {
54  probabilities.set_size(numClasses);
55  means.set_size(data.n_rows, numClasses);
56  variances.set_size(data.n_rows, numClasses);
57  }
58 
59  Train(data, labels, numClasses, incremental);
60 }
61 
62 template<typename ModelMatType>
64  const size_t dimensionality,
65  const size_t numClasses,
66  const double epsilon) :
67  trainingPoints(0),
68  epsilon(epsilon)
69 {
70  // Initialize model to 0.
71  probabilities.zeros(numClasses);
72  means.zeros(dimensionality, numClasses);
73  variances.zeros(dimensionality, numClasses);
74 }
75 
76 template<typename ModelMatType>
77 template<typename MatType>
79  const MatType& data,
80  const arma::Row<size_t>& labels,
81  const size_t numClasses,
82  const bool incremental)
83 {
84  static_assert(std::is_same<ElemType, typename MatType::elem_type>::value,
85  "NaiveBayesClassifier: element type of given data must match the element "
86  "type of the model!");
87 
88  // Do we need to resize the model?
89  if (probabilities.n_elem != numClasses)
90  {
91  // Perform training, after initializing the model to 0 (that is, if Train()
92  // won't do that for us, which it won't if we're using the incremental
93  // algorithm).
94  if (incremental)
95  {
96  probabilities.zeros(numClasses);
97  means.zeros(data.n_rows, numClasses);
98  variances.zeros(data.n_rows, numClasses);
99  }
100  else
101  {
102  probabilities.set_size(numClasses);
103  means.set_size(data.n_rows, numClasses);
104  variances.set_size(data.n_rows, numClasses);
105  }
106  }
107 
108  // Calculate the class probabilities as well as the sample mean and variance
109  // for each of the features with respect to each of the labels.
110  if (incremental)
111  {
112  // Use incremental algorithm.
113  // Fist, de-normalize probabilities.
114  probabilities *= trainingPoints;
115 
116  for (size_t j = 0; j < data.n_cols; ++j)
117  {
118  const size_t label = labels[j];
119  ++probabilities[label];
120 
121  arma::vec delta = data.col(j) - means.col(label);
122  means.col(label) += delta / probabilities[label];
123  variances.col(label) += delta % (data.col(j) - means.col(label));
124  }
125 
126  for (size_t i = 0; i < probabilities.n_elem; ++i)
127  {
128  if (probabilities[i] > 2)
129  variances.col(i) /= (probabilities[i] - 1);
130  }
131  }
132  else
133  {
134  // Set all parameters to zero.
135  probabilities.zeros();
136  means.zeros();
137  variances.zeros();
138 
139  // Don't use incremental algorithm. This is a two-pass algorithm. It is
140  // possible to calculate the means and variances using a faster one-pass
141  // algorithm but there are some precision and stability issues. If this is
142  // too slow, it's an option to use the faster algorithm by default and then
143  // have this (and the incremental algorithm) be other options.
144 
145  // Calculate the means.
146  for (size_t j = 0; j < data.n_cols; ++j)
147  {
148  const size_t label = labels[j];
149  ++probabilities[label];
150  means.col(label) += data.col(j);
151  }
152 
153  // Normalize means.
154  for (size_t i = 0; i < probabilities.n_elem; ++i)
155  if (probabilities[i] != 0.0)
156  means.col(i) /= probabilities[i];
157 
158  // Calculate variances.
159  for (size_t j = 0; j < data.n_cols; ++j)
160  {
161  const size_t label = labels[j];
162  variances.col(label) += square(data.col(j) - means.col(label));
163  }
164 
165  // Normalize variances.
166  for (size_t i = 0; i < probabilities.n_elem; ++i)
167  if (probabilities[i] > 1)
168  variances.col(i) /= (probabilities[i] - 1);
169  }
170 
171  // Add epsilon to prevent log of zero.
172  variances += epsilon;
173 
174  probabilities /= data.n_cols;
175  trainingPoints += data.n_cols;
176 }
177 
178 template<typename ModelMatType>
179 template<typename VecType>
181  const size_t label)
182 {
183  static_assert(std::is_same<ElemType, typename VecType::elem_type>::value,
184  "NaiveBayesClassifier: element type of given data must match the element "
185  "type of the model!");
186 
187  // We must use the incremental algorithm here.
188  probabilities *= trainingPoints;
189  probabilities[label]++;
190 
191  arma::vec delta = point - means.col(label);
192  means.col(label) += delta / probabilities[label];
193  if (probabilities[label] > 2)
194  variances.col(label) *= (probabilities[label] - 2);
195  variances.col(label) += (delta % (point - means.col(label)));
196  if (probabilities[label] > 1)
197  variances.col(label) /= probabilities[label] - 1;
198 
199  trainingPoints++;
200  probabilities /= trainingPoints;
201 }
202 
203 template<typename ModelMatType>
204 template<typename MatType>
206  const MatType& data,
207  ModelMatType& logLikelihoods) const
208 {
209  static_assert(std::is_same<ElemType, typename MatType::elem_type>::value,
210  "NaiveBayesClassifier: element type of given data must match the element "
211  "type of the model!");
212 
213  logLikelihoods = arma::log(arma::repmat(probabilities, 1, data.n_cols));
214  ModelMatType invVar = 1.0 / variances;
215 
216  // Calculate the joint log likelihood of point for each of the
217  // means.n_cols.
218 
219  // Loop over every class.
220  for (size_t i = 0; i < means.n_cols; ++i)
221  {
222  // This is an adaptation of gmm::phi() for the case where the covariance is
223  // a diagonal matrix.
224  ModelMatType diffs = data - arma::repmat(means.col(i), 1, data.n_cols);
225  ModelMatType rhs = -0.5 * arma::diagmat(invVar.col(i)) * diffs;
226  arma::Mat<ElemType> exponents = arma::sum(diffs % rhs, 0);
227 
228  logLikelihoods.row(i) += (data.n_rows / -2.0 * log(2 * M_PI) - 0.5 *
229  arma::accu(arma::log(variances.col(i))) + exponents);
230  }
231 }
232 
233 template<typename ModelMatType>
234 template<typename VecType>
235 size_t NaiveBayesClassifier<ModelMatType>::Classify(const VecType& point) const
236 {
237  static_assert(std::is_same<ElemType, typename VecType::elem_type>::value,
238  "NaiveBayesClassifier: element type of given data must match the element "
239  "type of the model!");
240 
241  // Find the label(class) with max log likelihood.
242  ModelMatType logLikelihoods;
243  LogLikelihood(point, logLikelihoods);
244 
245  arma::uword maxIndex = 0;
246  logLikelihoods.max(maxIndex);
247  return maxIndex;
248 }
249 
250 template<typename ModelMatType>
251 template<typename VecType, typename ProbabilitiesVecType>
253  const VecType& point,
254  size_t& prediction,
255  ProbabilitiesVecType& probabilities) const
256 {
257  static_assert(std::is_same<ElemType, typename VecType::elem_type>::value,
258  "NaiveBayesClassifier: element type of given data must match the element "
259  "type of the model!");
260  static_assert(std::is_same<ElemType,
261  typename ProbabilitiesVecType::elem_type>::value,
262  "NaiveBayesClassifier: element type of given data must match the element "
263  "type of the model!");
264 
265  // log(Prob(Y|X)) = Log(Prob(X|Y)) + Log(Prob(Y)) - Log(Prob(X));
266  // But LogLikelihood() gives us the unnormalized log likelihood which is
267  // Log(Prob(X|Y)) + Log(Prob(Y)) so we need to subtract the normalization
268  // term.
269  ModelMatType logLikelihoods;
270  LogLikelihood(point, logLikelihoods);
271 
272  // To prevent underflow in log of sum of exp of x operation (where x is a
273  // small negative value), we use logsumexp(x - max(x)) + max(x).
274  const double maxValue = arma::max(logLikelihoods);
275  const double logProbX = log(arma::accu(exp(logLikelihoods - maxValue))) +
276  maxValue;
277  probabilities = exp(logLikelihoods - logProbX); // log(exp(value)) == value.
278 
279  arma::uword maxIndex = 0;
280  logLikelihoods.max(maxIndex);
281  prediction = (size_t) maxIndex;
282 }
283 
284 template<typename ModelMatType>
285 template<typename MatType>
287  const MatType& data,
288  arma::Row<size_t>& predictions) const
289 {
290  static_assert(std::is_same<ElemType, typename MatType::elem_type>::value,
291  "NaiveBayesClassifier: element type of given data must match the element "
292  "type of the model!");
293 
294  predictions.set_size(data.n_cols);
295 
296  ModelMatType logLikelihoods;
297  LogLikelihood(data, logLikelihoods);
298 
299  for (size_t i = 0; i < data.n_cols; ++i)
300  {
301  arma::uword maxIndex = 0;
302  logLikelihoods.unsafe_col(i).max(maxIndex);
303  predictions[i] = maxIndex;
304  }
305 }
306 
307 template<typename ModelMatType>
308 template<typename MatType, typename ProbabilitiesMatType>
310  const MatType& data,
311  arma::Row<size_t>& predictions,
312  ProbabilitiesMatType& predictionProbs) const
313 {
314  static_assert(std::is_same<ElemType, typename MatType::elem_type>::value,
315  "NaiveBayesClassifier: element type of given data must match the element "
316  "type of the model!");
317  static_assert(std::is_same<ElemType,
318  typename ProbabilitiesMatType::elem_type>::value,
319  "NaiveBayesClassifier: element type of given data must match the element "
320  "type of the model!");
321 
322  predictions.set_size(data.n_cols);
323 
324  ModelMatType logLikelihoods;
325  LogLikelihood(data, logLikelihoods);
326 
327  predictionProbs.set_size(arma::size(logLikelihoods));
328  double maxValue, logProbX;
329  for (size_t j = 0; j < data.n_cols; ++j)
330  {
331  // The LogLikelihood() gives us the unnormalized log likelihood which is
332  // Log(Prob(X|Y)) + Log(Prob(Y)), so we subtract the normalization term.
333  // Besides, to prevent underflow in log of sum of exp of x operation (where
334  // x is a small negative value), we use logsumexp(x - max(x)) + max(x).
335  maxValue = arma::max(logLikelihoods.col(j));
336  logProbX = log(arma::accu(exp(logLikelihoods.col(j) -
337  maxValue))) + maxValue;
338  predictionProbs.col(j) = arma::exp(logLikelihoods.col(j) - logProbX);
339  }
340 
341  // Now calculate maximum probabilities for each point.
342  for (size_t i = 0; i < data.n_cols; ++i)
343  {
344  arma::uword maxIndex = 0;
345  logLikelihoods.unsafe_col(i).max(maxIndex);
346  predictions[i] = maxIndex;
347  }
348 }
349 
350 template<typename ModelMatType>
351 template<typename Archive>
353  Archive& ar,
354  const uint32_t /* version */)
355 {
356  ar(CEREAL_NVP(means));
357  ar(CEREAL_NVP(variances));
358  ar(CEREAL_NVP(probabilities));
359 }
360 
361 } // namespace naive_bayes
362 } // namespace mlpack
363 
364 #endif
void serialize(Archive &ar, const uint32_t)
Serialize the classifier.
Definition: naive_bayes_classifier_impl.hpp:352
Linear algebra utility functions, generally performed on matrices or vectors.
Definition: cv.hpp:1
void Train(const MatType &data, const arma::Row< size_t > &labels, const size_t numClasses, const bool incremental=true)
Train the Naive Bayes classifier on the given dataset.
Definition: naive_bayes_classifier_impl.hpp:78
NaiveBayesClassifier(const MatType &data, const arma::Row< size_t > &labels, const size_t numClasses, const bool incrementalVariance=false, const double epsilon=1e-10)
Initializes the classifier as per the input and then trains it by calculating the sample mean and var...
Definition: naive_bayes_classifier_impl.hpp:30
The core includes that mlpack expects; standard C++ includes and Armadillo.
The simple Naive Bayes classifier.
Definition: naive_bayes_classifier.hpp:58
size_t Classify(const VecType &point) const
Classify the given point, using the trained NaiveBayesClassifier model.
Definition: naive_bayes_classifier_impl.hpp:235