/**
 * @file hoeffding_tree_impl.hpp
 *
 * Implementation of the HoeffdingTree class.
 */
#ifndef MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_TREE_IMPL_HPP
#define MLPACK_METHODS_HOEFFDING_TREES_HOEFFDING_TREE_IMPL_HPP

// In case it hasn't been included yet.
#include "hoeffding_tree.hpp"
#include <stack>

namespace mlpack {
namespace tree {

template<typename FitnessFunction,
         template<typename> class NumericSplitType,
         template<typename> class CategoricalSplitType>
template<typename MatType>
HoeffdingTree<
    FitnessFunction,
    NumericSplitType,
    CategoricalSplitType
>::HoeffdingTree(const MatType& data,
                 const data::DatasetInfo& datasetInfoIn,
                 const arma::Row<size_t>& labels,
                 const size_t numClasses,
                 const bool batchTraining,
                 const double successProbability,
                 const size_t maxSamples,
                 const size_t checkInterval,
                 const size_t minSamples,
                 const CategoricalSplitType<FitnessFunction>&
                     categoricalSplitIn,
                 const NumericSplitType<FitnessFunction>& numericSplitIn) :
    dimensionMappings(NULL),
    ownsMappings(false),
    numSamples(0),
    numClasses(numClasses),
    maxSamples((maxSamples == 0) ? size_t(-1) : maxSamples),
    checkInterval(checkInterval),
    minSamples(minSamples),
    datasetInfo(new data::DatasetInfo(datasetInfoIn)),
    ownsInfo(true),
    successProbability(successProbability),
    splitDimension(size_t(-1)),
    majorityClass(0),
    majorityProbability(0.0),
    categoricalSplit(0),
    numericSplit()
{
  // Reset the tree.
  ResetTree(categoricalSplitIn, numericSplitIn);

  // Now train.
  Train(data, labels, batchTraining);
}
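
// Example (sketch, not part of the implementation): constructing and
// batch-training a tree in one call with the constructor above.  The `data`,
// `info`, `labels`, and `numClasses` objects are hypothetical and assumed to
// be loaded elsewhere; the default template parameters of HoeffdingTree<> are
// assumed to be available.
//
//   using namespace mlpack::tree;
//   HoeffdingTree<> tree(data, info, labels, numClasses,
//       true /* batchTraining */, 0.95 /* successProbability */);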

template<typename FitnessFunction,
         template<typename> class NumericSplitType,
         template<typename> class CategoricalSplitType>
HoeffdingTree<
    FitnessFunction,
    NumericSplitType,
    CategoricalSplitType
>::HoeffdingTree(const data::DatasetInfo& datasetInfo,
                 const size_t numClasses,
                 const double successProbability,
                 const size_t maxSamples,
                 const size_t checkInterval,
                 const size_t minSamples,
                 const CategoricalSplitType<FitnessFunction>&
                     categoricalSplitIn,
                 const NumericSplitType<FitnessFunction>& numericSplitIn,
                 std::unordered_map<size_t, std::pair<size_t, size_t>>*
                     dimensionMappingsIn,
                 const bool copyDatasetInfo) :
    dimensionMappings((dimensionMappingsIn != NULL) ? dimensionMappingsIn :
        new std::unordered_map<size_t, std::pair<size_t, size_t>>()),
    ownsMappings(dimensionMappingsIn == NULL),
    numSamples(0),
    numClasses(numClasses),
    maxSamples((maxSamples == 0) ? size_t(-1) : maxSamples),
    checkInterval(checkInterval),
    minSamples(minSamples),
    datasetInfo(copyDatasetInfo ? new data::DatasetInfo(datasetInfo) :
        &datasetInfo),
    ownsInfo(copyDatasetInfo),
    successProbability(successProbability),
    splitDimension(size_t(-1)),
    majorityClass(0),
    majorityProbability(0.0),
    categoricalSplit(0),
    numericSplit()
{
  // Do we need to generate the mappings too?
  if (ownsMappings)
  {
    ResetTree(categoricalSplitIn, numericSplitIn);
  }
  else
  {
    for (size_t i = 0; i < datasetInfo.Dimensionality(); ++i)
    {
      if (datasetInfo.Type(i) == data::Datatype::categorical)
      {
        categoricalSplits.push_back(CategoricalSplitType<FitnessFunction>(
            datasetInfo.NumMappings(i), numClasses, categoricalSplitIn));
      }
      else
      {
        numericSplits.push_back(NumericSplitType<FitnessFunction>(numClasses,
            numericSplitIn));
      }
    }
  }
}

template<typename FitnessFunction,
         template<typename> class NumericSplitType,
         template<typename> class CategoricalSplitType>
HoeffdingTree<
    FitnessFunction,
    NumericSplitType,
    CategoricalSplitType
>::HoeffdingTree() :
    dimensionMappings(
        new std::unordered_map<size_t, std::pair<size_t, size_t>>()),
    ownsMappings(true),
    numSamples(0),
    numClasses(0),
    maxSamples(size_t(-1)),
    checkInterval(100),
    minSamples(100),
    datasetInfo(new data::DatasetInfo()),
    ownsInfo(true),
    successProbability(0.95),
    splitDimension(size_t(-1)),
    majorityClass(0),
    majorityProbability(0.0),
    categoricalSplit(0),
    numericSplit()
{
  // Nothing to do.
}

// Copy constructor.
template<typename FitnessFunction,
         template<typename> class NumericSplitType,
         template<typename> class CategoricalSplitType>
HoeffdingTree<FitnessFunction, NumericSplitType, CategoricalSplitType>::
    HoeffdingTree(const HoeffdingTree& other) :
    numericSplits(other.numericSplits),
    categoricalSplits(other.categoricalSplits),
    dimensionMappings(new std::unordered_map<size_t,
        std::pair<size_t, size_t>>(*other.dimensionMappings)),
    ownsMappings(true),
    numSamples(other.numSamples),
    numClasses(other.numClasses),
    maxSamples(other.maxSamples),
    checkInterval(other.checkInterval),
    minSamples(other.minSamples),
    datasetInfo(new data::DatasetInfo(*other.datasetInfo)),
    ownsInfo(true),
    successProbability(other.successProbability),
    splitDimension(other.splitDimension),
    majorityClass(other.majorityClass),
    majorityProbability(other.majorityProbability),
    categoricalSplit(other.categoricalSplit),
    numericSplit(other.numericSplit)
{
  // Copy each of the children.
  for (size_t i = 0; i < other.children.size(); ++i)
  {
    children.push_back(new HoeffdingTree(*other.children[i]));

    // Delete copied datasetInfo and dimension mappings.
    delete children[i]->datasetInfo;
    children[i]->datasetInfo = this->datasetInfo;
    children[i]->ownsInfo = false;

    delete children[i]->dimensionMappings;
    children[i]->dimensionMappings = this->dimensionMappings;
    children[i]->ownsMappings = false;
  }
}

// Move constructor.
template<typename FitnessFunction,
         template<typename> class NumericSplitType,
         template<typename> class CategoricalSplitType>
HoeffdingTree<FitnessFunction, NumericSplitType, CategoricalSplitType>::
    HoeffdingTree(HoeffdingTree&& other) :
    numericSplits(std::move(other.numericSplits)),
    categoricalSplits(std::move(other.categoricalSplits)),
    dimensionMappings(other.dimensionMappings),
    ownsMappings(true),
    numSamples(other.numSamples),
    numClasses(other.numClasses),
    maxSamples(other.maxSamples),
    checkInterval(other.checkInterval),
    minSamples(other.minSamples),
    datasetInfo(other.datasetInfo),
    ownsInfo(true),
    successProbability(other.successProbability),
    splitDimension(other.splitDimension),
    majorityClass(other.majorityClass),
    majorityProbability(other.majorityProbability),
    categoricalSplit(std::move(other.categoricalSplit)),
    numericSplit(std::move(other.numericSplit))
{
  // Remove pointers.
  other.dimensionMappings = nullptr;
  other.datasetInfo = nullptr;

  // Reset primitive type variables.
  other.numSamples = 0;
  other.numClasses = 0;
  other.checkInterval = 0;
  other.minSamples = 0;
  other.successProbability = 0.0;
  other.splitDimension = 0;
  other.majorityClass = 0;
  other.majorityProbability = 0.0;
}

// Copy assignment operator.
template<typename FitnessFunction,
         template<typename> class NumericSplitType,
         template<typename> class CategoricalSplitType>
HoeffdingTree<FitnessFunction, NumericSplitType, CategoricalSplitType>&
HoeffdingTree<FitnessFunction, NumericSplitType, CategoricalSplitType>::
    operator=(const HoeffdingTree& other)
{
  if (this != &other)
  {
    numericSplits = other.numericSplits;
    categoricalSplits = other.categoricalSplits;
    dimensionMappings = new std::unordered_map<size_t,
        std::pair<size_t, size_t>>(*other.dimensionMappings);
    ownsMappings = true;
    numSamples = other.numSamples;
    numClasses = other.numClasses;
    maxSamples = other.maxSamples;
    checkInterval = other.checkInterval;
    minSamples = other.minSamples;
    datasetInfo = new data::DatasetInfo(*other.datasetInfo);
    ownsInfo = true;
    successProbability = other.successProbability;
    splitDimension = other.splitDimension;
    majorityClass = other.majorityClass;
    majorityProbability = other.majorityProbability;
    categoricalSplit = other.categoricalSplit;
    numericSplit = other.numericSplit;

    // Copy each of the children.
    for (size_t i = 0; i < other.children.size(); ++i)
    {
      children.push_back(new HoeffdingTree(*other.children[i]));

      // Delete copied datasetInfo and dimension mappings.
      delete children[i]->datasetInfo;
      children[i]->datasetInfo = this->datasetInfo;
      children[i]->ownsInfo = false;

      delete children[i]->dimensionMappings;
      children[i]->dimensionMappings = this->dimensionMappings;
      children[i]->ownsMappings = false;
    }
  }
  return *this;
}

// Move assignment operator.
template<typename FitnessFunction,
         template<typename> class NumericSplitType,
         template<typename> class CategoricalSplitType>
HoeffdingTree<FitnessFunction, NumericSplitType, CategoricalSplitType>&
HoeffdingTree<FitnessFunction, NumericSplitType, CategoricalSplitType>::
    operator=(HoeffdingTree&& other)
{
  if (this != &other)
  {
    numericSplits = std::move(other.numericSplits);
    categoricalSplits = std::move(other.categoricalSplits);
    dimensionMappings = other.dimensionMappings;
    ownsMappings = true;
    numSamples = other.numSamples;
    numClasses = other.numClasses;
    maxSamples = other.maxSamples;
    checkInterval = other.checkInterval;
    minSamples = other.minSamples;
    datasetInfo = other.datasetInfo;
    ownsInfo = true;
    successProbability = other.successProbability;
    splitDimension = other.splitDimension;
    majorityClass = other.majorityClass;
    majorityProbability = other.majorityProbability;
    categoricalSplit = std::move(other.categoricalSplit);
    numericSplit = std::move(other.numericSplit);

    // Remove pointers.
    other.dimensionMappings = nullptr;
    other.datasetInfo = nullptr;

    // Reset primitive type variables.
    other.numSamples = 0;
    other.numClasses = 0;
    other.checkInterval = 0;
    other.minSamples = 0;
    other.successProbability = 0.0;
    other.splitDimension = 0;
    other.majorityClass = 0;
    other.majorityProbability = 0.0;
  }
  return *this;
}

template<typename FitnessFunction,
         template<typename> class NumericSplitType,
         template<typename> class CategoricalSplitType>
HoeffdingTree<FitnessFunction, NumericSplitType, CategoricalSplitType>::
    ~HoeffdingTree()
{
  if (ownsMappings)
    delete dimensionMappings;
  if (ownsInfo)
    delete datasetInfo;
  for (size_t i = 0; i < children.size(); ++i)
    delete children[i];
}

//! Train on a set of points, either in streaming mode or in batch mode.
template<typename FitnessFunction,
         template<typename> class NumericSplitType,
         template<typename> class CategoricalSplitType>
template<typename MatType>
void HoeffdingTree<
    FitnessFunction,
    NumericSplitType,
    CategoricalSplitType
>::Train(const MatType& data,
         const arma::Row<size_t>& labels,
         const bool batchTraining,
         const bool resetTree,
         const size_t numClassesIn)
{
  // We need to reset the tree either if the user asked for it, or if they
  // passed data whose dimensionality differs from our datasetInfo object.
  if (resetTree || data.n_rows != datasetInfo->Dimensionality() ||
      numClassesIn != 0)
  {
    // Create a new datasetInfo, which assumes that all features are numeric.
    if (ownsInfo)
      delete datasetInfo;
    datasetInfo = new data::DatasetInfo(data.n_rows);
    ownsInfo = true;

    // Set the number of classes correctly.
    numClasses = (numClassesIn != 0) ? numClassesIn : arma::max(labels) + 1;

    ResetTree();
  }

  TrainInternal(data, labels, batchTraining);
}

//! Train on a set of points, using the given dataset information.
template<typename FitnessFunction,
         template<typename> class NumericSplitType,
         template<typename> class CategoricalSplitType>
template<typename MatType>
void HoeffdingTree<
    FitnessFunction,
    NumericSplitType,
    CategoricalSplitType
>::Train(const MatType& data,
         const data::DatasetInfo& info,
         const arma::Row<size_t>& labels,
         const bool batchTraining,
         const size_t numClassesIn)
{
  // Take over the new DatasetInfo.
  if (ownsInfo)
    delete datasetInfo;
  datasetInfo = &info;
  ownsInfo = false;

  // Set the number of classes correctly.
  numClasses = (numClassesIn != 0) ? numClassesIn : arma::max(labels) + 1;

  ResetTree();

  // Now train.
  TrainInternal(data, labels, batchTraining);
}

//! Train on a single point, in streaming mode.
template<typename FitnessFunction,
         template<typename> class NumericSplitType,
         template<typename> class CategoricalSplitType>
template<typename VecType>
void HoeffdingTree<
    FitnessFunction,
    NumericSplitType,
    CategoricalSplitType
>::Train(const VecType& point, const size_t label)
{
  if (splitDimension == size_t(-1))
  {
    ++numSamples;
    size_t numericIndex = 0;
    size_t categoricalIndex = 0;
    for (size_t i = 0; i < point.n_rows; ++i)
    {
      if (datasetInfo->Type(i) == data::Datatype::categorical)
        categoricalSplits[categoricalIndex++].Train(point[i], label);
      else if (datasetInfo->Type(i) == data::Datatype::numeric)
        numericSplits[numericIndex++].Train(point[i], label);
    }

    // Grab the majority class from the splits.
    if (categoricalSplits.size() > 0)
    {
      majorityClass = categoricalSplits[0].MajorityClass();
      majorityProbability = categoricalSplits[0].MajorityProbability();
    }
    else
    {
      majorityClass = numericSplits[0].MajorityClass();
      majorityProbability = numericSplits[0].MajorityProbability();
    }

    // Check for a split, if we should.
    if (numSamples % checkInterval == 0)
    {
      const size_t numChildren = SplitCheck();
      if (numChildren > 0)
      {
        // We need to add a bunch of children.
        // Delete children, if we have them.
        children.clear();
        CreateChildren();
      }
    }
  }
  else
  {
    // Already split.  Pass the training point to the relevant child.
    size_t direction = CalculateDirection(point);
    children[direction]->Train(point, label);
  }
}
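
// Example (sketch): incremental, streaming updates with the single-point
// Train() overload above.  The `info`, `numClasses`, `point`, and `label`
// objects, and the GetNextSample() data source, are hypothetical.
//
//   HoeffdingTree<> tree(info, numClasses);
//   while (GetNextSample(point, label))
//     tree.Train(point, label);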

template<typename FitnessFunction,
         template<typename> class NumericSplitType,
         template<typename> class CategoricalSplitType>
size_t HoeffdingTree<
    FitnessFunction,
    NumericSplitType,
    CategoricalSplitType
>::SplitCheck()
{
  // Do nothing if we've already split.
  if (splitDimension != size_t(-1))
    return 0;

  // If not enough points have been seen, we cannot split.
  if (numSamples <= minSamples)
    return 0;

  // Check the fitness of each dimension.  Then we'll use the Hoeffding bound
  // to decide whether a split is justified.

  // Calculate epsilon, the value we need things to be greater than.
  const double rSquared = std::pow(FitnessFunction::Range(numClasses), 2.0);
  const double epsilon = std::sqrt(rSquared *
      std::log(1.0 / (1.0 - successProbability)) / (2 * numSamples));
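
  // For intuition: the Hoeffding bound guarantees that, with probability
  // successProbability, the empirical mean of a random variable with range R
  // lies within epsilon = sqrt(R^2 * ln(1 / (1 - successProbability)) /
  // (2 * numSamples)) of its true mean.  As an illustrative (assumed) example,
  // with R = 1, successProbability = 0.95, and numSamples = 1000, epsilon is
  // roughly sqrt(ln(20) / 2000), or about 0.039.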

  // Find the best and second best possible splits.
  double largest = -DBL_MAX;
  size_t largestIndex = 0;
  double secondLargest = -DBL_MAX;
  for (size_t i = 0; i < categoricalSplits.size() + numericSplits.size(); ++i)
  {
    size_t type = dimensionMappings->at(i).first;
    size_t index = dimensionMappings->at(i).second;

    // Some split procedures can split multiple ways, but we only care about
    // the best two splits that can be done in each dimension.
    double bestGain = 0.0;
    double secondBestGain = 0.0;
    if (type == data::Datatype::categorical)
      categoricalSplits[index].EvaluateFitnessFunction(bestGain,
          secondBestGain);
    else if (type == data::Datatype::numeric)
      numericSplits[index].EvaluateFitnessFunction(bestGain, secondBestGain);

    // See if these gains are better than the previous.
    if (bestGain > largest)
    {
      secondLargest = largest;
      largest = bestGain;
      largestIndex = i;
    }
    else if (bestGain > secondLargest)
    {
      secondLargest = bestGain;
    }

    if (secondBestGain > secondLargest)
    {
      secondLargest = secondBestGain;
    }
  }

  // Are these far enough apart to split?
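  // Concretely (restating the condition below): we split when the best gain
  // is positive and at least one of the following holds: (a) the gap between
  // the best and second-best gain exceeds epsilon, (b) more than maxSamples
  // points have been seen, so a split is forced, or (c) epsilon itself has
  // shrunk to 0.05 or less, in which case the top two candidates are treated
  // as effectively tied.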
  if ((largest > 0.0) &&
      ((largest - secondLargest > epsilon) || (numSamples > maxSamples) ||
       (epsilon <= 0.05)))
  {
    // Split!
    splitDimension = largestIndex;
    const size_t type = dimensionMappings->at(largestIndex).first;
    const size_t index = dimensionMappings->at(largestIndex).second;
    if (type == data::Datatype::categorical)
    {
      // I don't know if this should be here.
      majorityClass = categoricalSplits[index].MajorityClass();
      return categoricalSplits[index].NumChildren();
    }
    else
    {
      majorityClass = numericSplits[index].MajorityClass();
      return numericSplits[index].NumChildren();
    }
  }
  else
  {
    return 0; // Don't split.
  }
}

template<
    typename FitnessFunction,
    template<typename> class NumericSplitType,
    template<typename> class CategoricalSplitType
>
void HoeffdingTree<
    FitnessFunction,
    NumericSplitType,
    CategoricalSplitType
>::SuccessProbability(const double successProbability)
{
  this->successProbability = successProbability;
  for (size_t i = 0; i < children.size(); ++i)
    children[i]->SuccessProbability(successProbability);
}

template<
    typename FitnessFunction,
    template<typename> class NumericSplitType,
    template<typename> class CategoricalSplitType
>
void HoeffdingTree<
    FitnessFunction,
    NumericSplitType,
    CategoricalSplitType
>::MinSamples(const size_t minSamples)
{
  this->minSamples = minSamples;
  for (size_t i = 0; i < children.size(); ++i)
    children[i]->MinSamples(minSamples);
}

template<
    typename FitnessFunction,
    template<typename> class NumericSplitType,
    template<typename> class CategoricalSplitType
>
void HoeffdingTree<
    FitnessFunction,
    NumericSplitType,
    CategoricalSplitType
>::MaxSamples(const size_t maxSamples)
{
  this->maxSamples = maxSamples;
  for (size_t i = 0; i < children.size(); ++i)
    children[i]->MaxSamples(maxSamples);
}

template<
    typename FitnessFunction,
    template<typename> class NumericSplitType,
    template<typename> class CategoricalSplitType
>
void HoeffdingTree<
    FitnessFunction,
    NumericSplitType,
    CategoricalSplitType
>::CheckInterval(const size_t checkInterval)
{
  this->checkInterval = checkInterval;
  for (size_t i = 0; i < children.size(); ++i)
    children[i]->CheckInterval(checkInterval);
}

template<
    typename FitnessFunction,
    template<typename> class NumericSplitType,
    template<typename> class CategoricalSplitType
>
template<typename VecType>
size_t HoeffdingTree<
    FitnessFunction,
    NumericSplitType,
    CategoricalSplitType
>::CalculateDirection(const VecType& point) const
{
  // Don't call this before the node is split...
  if (datasetInfo->Type(splitDimension) == data::Datatype::numeric)
    return numericSplit.CalculateDirection(point[splitDimension]);
  else if (datasetInfo->Type(splitDimension) == data::Datatype::categorical)
    return categoricalSplit.CalculateDirection(point[splitDimension]);
  else
    return 0; // Not sure what to do here...
}

template<typename FitnessFunction,
         template<typename> class NumericSplitType,
         template<typename> class CategoricalSplitType>
size_t HoeffdingTree<
    FitnessFunction,
    NumericSplitType,
    CategoricalSplitType
>::NumDescendants() const
{
  size_t nodes = 0;
  std::stack<const HoeffdingTree*> stack;
  stack.push(this); // Push the current tree.
  while (!stack.empty())
  {
    const HoeffdingTree* node = stack.top();
    stack.pop();
    nodes += node->NumChildren();
    for (size_t i = 0; i < node->NumChildren(); ++i)
      stack.push(&node->Child(i));
  }
  return nodes;
}

template<
    typename FitnessFunction,
    template<typename> class NumericSplitType,
    template<typename> class CategoricalSplitType
>
template<typename VecType>
size_t HoeffdingTree<
    FitnessFunction,
    NumericSplitType,
    CategoricalSplitType
>::Classify(const VecType& point) const
{
  if (children.size() == 0)
  {
    // If we're a leaf (or being considered a leaf), classify based on what we
    // know.
    return majorityClass;
  }
  else
  {
    // Otherwise, pass to the right child and let them classify.
    return children[CalculateDirection(point)]->Classify(point);
  }
}

template<
    typename FitnessFunction,
    template<typename> class NumericSplitType,
    template<typename> class CategoricalSplitType
>
template<typename VecType>
void HoeffdingTree<
    FitnessFunction,
    NumericSplitType,
    CategoricalSplitType
>::Classify(const VecType& point,
            size_t& prediction,
            double& probability) const
{
  if (children.size() == 0)
  {
    // We are a leaf, so classify accordingly.
    prediction = majorityClass;
    probability = majorityProbability;
  }
  else
  {
    // Pass to the right child and let them do the classification.
    children[CalculateDirection(point)]->Classify(point, prediction,
        probability);
  }
}

//! Batch classification.
template<
    typename FitnessFunction,
    template<typename> class NumericSplitType,
    template<typename> class CategoricalSplitType
>
template<typename MatType>
void HoeffdingTree<
    FitnessFunction,
    NumericSplitType,
    CategoricalSplitType
>::Classify(const MatType& data, arma::Row<size_t>& predictions) const
{
  predictions.set_size(data.n_cols);
  for (size_t i = 0; i < data.n_cols; ++i)
    predictions[i] = Classify(data.col(i));
}

//! Batch classification, with class probabilities.
template<
    typename FitnessFunction,
    template<typename> class NumericSplitType,
    template<typename> class CategoricalSplitType
>
template<typename MatType>
void HoeffdingTree<
    FitnessFunction,
    NumericSplitType,
    CategoricalSplitType
>::Classify(const MatType& data,
            arma::Row<size_t>& predictions,
            arma::rowvec& probabilities) const
{
  predictions.set_size(data.n_cols);
  probabilities.set_size(data.n_cols);
  for (size_t i = 0; i < data.n_cols; ++i)
    Classify(data.col(i), predictions[i], probabilities[i]);
}
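
// Example (sketch): batch prediction with the overload above.  The `tree` and
// `testData` objects are hypothetical; points are stored one per column, as
// elsewhere in mlpack.
//
//   arma::Row<size_t> predictions;
//   arma::rowvec probabilities;
//   tree.Classify(testData, predictions, probabilities);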

template<
    typename FitnessFunction,
    template<typename> class NumericSplitType,
    template<typename> class CategoricalSplitType
>
void HoeffdingTree<
    FitnessFunction,
    NumericSplitType,
    CategoricalSplitType
>::CreateChildren()
{
  // Create the children.
  arma::Col<size_t> childMajorities;
  if (dimensionMappings->at(splitDimension).first ==
      data::Datatype::categorical)
  {
    categoricalSplits[dimensionMappings->at(splitDimension).second].Split(
        childMajorities, categoricalSplit);
  }
  else if (dimensionMappings->at(splitDimension).first ==
           data::Datatype::numeric)
  {
    numericSplits[dimensionMappings->at(splitDimension).second].Split(
        childMajorities, numericSplit);
  }

  // We already know what the splitDimension will be.
  for (size_t i = 0; i < childMajorities.n_elem; ++i)
  {
    // We need to also give our split objects to the new children, so that
    // parameters for the splits can be passed down.  But if we have no
    // categorical or numeric features, we can't pass anything but the
    // defaults...
    if (categoricalSplits.size() == 0)
    {
      // Pass a default categorical split.
      children.push_back(new HoeffdingTree(*datasetInfo, numClasses,
          successProbability, maxSamples, checkInterval, minSamples,
          CategoricalSplitType<FitnessFunction>(0, numClasses),
          numericSplits[0], dimensionMappings, false));
    }
    else if (numericSplits.size() == 0)
    {
      // Pass a default numeric split.
      children.push_back(new HoeffdingTree(*datasetInfo, numClasses,
          successProbability, maxSamples, checkInterval, minSamples,
          categoricalSplits[0], NumericSplitType<FitnessFunction>(numClasses),
          dimensionMappings, false));
    }
    else
    {
      // Pass both splits that we already have.
      children.push_back(new HoeffdingTree(*datasetInfo, numClasses,
          successProbability, maxSamples, checkInterval, minSamples,
          categoricalSplits[0], numericSplits[0], dimensionMappings, false));
    }

    children[i]->MajorityClass() = childMajorities[i];
  }
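
  // Note that every child constructed above is given this node's
  // dimensionMappings pointer and its datasetInfo (with copyDatasetInfo ==
  // false), so the children do not own either object; only the node that
  // created them is responsible for deleting them.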

  // Eliminate now-unnecessary split information.
  numericSplits.clear();
  categoricalSplits.clear();
}

template<
    typename FitnessFunction,
    template<typename> class NumericSplitType,
    template<typename> class CategoricalSplitType
>
template<typename Archive>
void HoeffdingTree<
    FitnessFunction,
    NumericSplitType,
    CategoricalSplitType
>::serialize(Archive& ar, const uint32_t /* version */)
{
  ar(CEREAL_NVP(splitDimension));

  // Clear memory for the mappings if necessary.
  if (cereal::is_loading<Archive>() && ownsMappings && dimensionMappings)
    delete dimensionMappings;

  ar(CEREAL_POINTER(dimensionMappings));

  // Special handling for const object.
  data::DatasetInfo* d = NULL;
  if (cereal::is_saving<Archive>())
    d = const_cast<data::DatasetInfo*>(datasetInfo);
  ar(CEREAL_POINTER(d));

  if (cereal::is_loading<Archive>())
  {
    if (datasetInfo && ownsInfo)
      delete datasetInfo;

    datasetInfo = d;
    ownsInfo = true;
    ownsMappings = true; // We also own the mappings we loaded.

    // Clear the children.
    for (size_t i = 0; i < children.size(); ++i)
      delete children[i];
    children.clear();
  }

  ar(CEREAL_NVP(majorityClass));
  ar(CEREAL_NVP(majorityProbability));

  // Depending on whether or not we have split yet, we may need to save
  // different things.
  if (splitDimension == size_t(-1))
  {
    // We have not yet split.  So we have to serialize the splits.
    ar(CEREAL_NVP(numSamples));
    ar(CEREAL_NVP(numClasses));
    ar(CEREAL_NVP(maxSamples));
    ar(CEREAL_NVP(successProbability));

    // Serialize the splits, but not if we haven't seen any samples yet (in
    // which case we can just reinitialize).
    if (cereal::is_loading<Archive>())
    {
      // Re-initialize all of the splits.
      numericSplits.clear();
      categoricalSplits.clear();
      for (size_t i = 0; i < datasetInfo->Dimensionality(); ++i)
      {
        if (datasetInfo->Type(i) == data::Datatype::categorical)
          categoricalSplits.push_back(CategoricalSplitType<FitnessFunction>(
              datasetInfo->NumMappings(i), numClasses));
        else
          numericSplits.push_back(
              NumericSplitType<FitnessFunction>(numClasses));
      }

      // Clear things we don't need.
      categoricalSplit = typename CategoricalSplitType<FitnessFunction>::
          SplitInfo(numClasses);
      numericSplit = typename NumericSplitType<FitnessFunction>::SplitInfo();
    }

    // There's no need to serialize if there's no information contained in the
    // splits.
    if (numSamples == 0)
      return;

    // Serialize numeric splits.
    ar(CEREAL_NVP(numericSplits));

    // Serialize categorical splits.
    ar(CEREAL_NVP(categoricalSplits));
  }
  else
  {
    // We have split, so we only need to save the split and the children.
    if (datasetInfo->Type(splitDimension) == data::Datatype::categorical)
      ar(CEREAL_NVP(categoricalSplit));
    else
      ar(CEREAL_NVP(numericSplit));

    // Serialize the children, because we have split.
    ar(CEREAL_VECTOR_POINTER(children));

    if (cereal::is_loading<Archive>())
    {
      for (size_t i = 0; i < children.size(); ++i)
      {
        // The child doesn't actually own its own DatasetInfo.  We do.  The
        // same applies for the dimension mappings.
        if (children[i]->datasetInfo == datasetInfo)
          children[i]->ownsInfo = false;
        children[i]->ownsMappings = false;
      }

      numericSplits.clear();
      categoricalSplits.clear();

      numSamples = 0;
      numClasses = 0;
      maxSamples = 0;
      successProbability = 0.0;
    }
  }
}
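
// Example (sketch): persisting a trained tree.  serialize() is invoked
// through cereal; with mlpack's data::Save() and data::Load() helpers this
// could look like the following (the filename and object name are
// hypothetical).
//
//   mlpack::data::Save("tree.bin", "hoeffding_tree", tree);
//   HoeffdingTree<> tree2;
//   mlpack::data::Load("tree.bin", "hoeffding_tree", tree2);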

template<
    typename FitnessFunction,
    template<typename> class NumericSplitType,
    template<typename> class CategoricalSplitType
>
template<typename MatType>
void HoeffdingTree<
    FitnessFunction,
    NumericSplitType,
    CategoricalSplitType
>::TrainInternal(const MatType& data,
                 const arma::Row<size_t>& labels,
                 const bool batchTraining)
{
  if (batchTraining)
  {
    // Pass all the points through the nodes, and then split only after that.
    checkInterval = data.n_cols; // Only split on the last sample.
    // Don't split if there are fewer than five points.
    size_t oldMaxSamples = maxSamples;
    maxSamples = std::max(size_t(data.n_cols - 1), size_t(5));
    for (size_t i = 0; i < data.n_cols; ++i)
      Train(data.col(i), labels[i]);
    maxSamples = oldMaxSamples;

    // Now, if we did split, find out which points go to which child, and
    // perform the same batch training.
    if (children.size() > 0)
    {
      // We need to create a vector of indices that represent the points that
      // must go to each child, so we need children.size() vectors, but we
      // don't know how long they will be.  Therefore, we will create vectors
      // each of size data.n_cols, but will probably not use all the memory we
      // allocated, and then pass subvectors to the submat() function.
      std::vector<arma::uvec> indices(children.size(),
          arma::uvec(data.n_cols));
      arma::Col<size_t> counts =
          arma::zeros<arma::Col<size_t>>(children.size());

      for (size_t i = 0; i < data.n_cols; ++i)
      {
        size_t direction = CalculateDirection(data.col(i));
        size_t currentIndex = counts[direction];
        indices[direction][currentIndex] = i;
        counts[direction]++;
      }

      // Now pass each of these submatrices to the children to perform
      // batch-mode training.
      for (size_t i = 0; i < children.size(); ++i)
      {
        // If we don't have any points that go to the child in question, don't
        // train that child.
        if (counts[i] == 0)
          continue;

        // The submatrix here is non-contiguous, but I think this will be
        // faster than copying the points to an ordered state.  We still have
        // to assemble the labels vector, though.
        arma::Row<size_t> childLabels = labels.cols(
            indices[i].subvec(0, counts[i] - 1));

        // Unfortunately, limitations of Armadillo's non-contiguous subviews
        // prohibit us from passing the non-contiguous subview directly to
        // Train(), since the col() function is not provided.  So, instead,
        // we'll just extract the non-contiguous submatrix.
        MatType childData = data.cols(indices[i].subvec(0, counts[i] - 1));
        children[i]->Train(childData, childLabels, true);
      }
    }
  }
  else
  {
    // We aren't training in batch mode; loop through the points.
    for (size_t i = 0; i < data.n_cols; ++i)
      Train(data.col(i), labels[i]);
  }
}

template<
    typename FitnessFunction,
    template<typename> class NumericSplitType,
    template<typename> class CategoricalSplitType
>
void HoeffdingTree<
    FitnessFunction,
    NumericSplitType,
    CategoricalSplitType
>::ResetTree(const CategoricalSplitType<FitnessFunction>& categoricalSplitIn,
             const NumericSplitType<FitnessFunction>& numericSplitIn)
{
  // Generate mappings.
  if (ownsMappings)
    delete dimensionMappings;

  categoricalSplits.clear();
  numericSplits.clear();

  dimensionMappings =
      new std::unordered_map<size_t, std::pair<size_t, size_t>>();
  ownsMappings = true;
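
  // Each entry of dimensionMappings maps a dataset dimension index to a
  // (type, index) pair: `type` is the data::Datatype of the dimension, and
  // `index` is the position of that dimension's split object within
  // categoricalSplits or numericSplits, as filled in by the loop below.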
  for (size_t i = 0; i < datasetInfo->Dimensionality(); ++i)
  {
    if (datasetInfo->Type(i) == data::Datatype::categorical)
    {
      categoricalSplits.push_back(CategoricalSplitType<FitnessFunction>(
          datasetInfo->NumMappings(i), numClasses, categoricalSplitIn));
      (*dimensionMappings)[i] = std::make_pair(data::Datatype::categorical,
          categoricalSplits.size() - 1);
    }
    else
    {
      numericSplits.push_back(NumericSplitType<FitnessFunction>(numClasses,
          numericSplitIn));
      (*dimensionMappings)[i] = std::make_pair(data::Datatype::numeric,
          numericSplits.size() - 1);
    }
  }

  // Clear children.
  for (size_t i = 0; i < children.size(); ++i)
    delete children[i];
  children.clear();

  // Reset statistics.
  numSamples = 0;
  splitDimension = size_t(-1);
  majorityClass = 0;
  majorityProbability = 0.0;
  categoricalSplit =
      typename CategoricalSplitType<FitnessFunction>::SplitInfo(0);
  numericSplit = typename NumericSplitType<FitnessFunction>::SplitInfo();
}

} // namespace tree
} // namespace mlpack

#endif