mlpack
|
#include <mlpack/prereqs.hpp>
#include <mlpack/core/util/io.hpp>
#include <mlpack/core/util/mlpack_main.hpp>
#include <mlpack/methods/hoeffding_trees/hoeffding_tree.hpp>
#include <mlpack/methods/hoeffding_trees/binary_numeric_split.hpp>
#include <mlpack/methods/hoeffding_trees/information_gain.hpp>
#include <mlpack/methods/hoeffding_trees/hoeffding_tree_model.hpp>
#include <queue>
Typedefs | |
typedef tuple< DatasetInfo, arma::mat > | TupleType |
Functions | |
BINDING_NAME ("Hoeffding trees") | |
BINDING_SHORT_DESC ("An implementation of Hoeffding trees, a form of streaming decision tree " "for classification. Given labeled data, a Hoeffding tree can be trained " "and saved for later use, or a pre-trained Hoeffding tree can be used for " "predicting the classifications of new points.") | |
BINDING_LONG_DESC ("This program implements Hoeffding trees, a form of streaming decision tree" " suited best for large (or streaming) datasets. This program supports " "both categorical and numeric data. Given an input dataset, this program " "is able to train the tree with numerous training options, and save the " "model to a file. The program is also able to use a trained model or a " "model from file in order to predict classes for a given test set." "\n\n" "The training file and associated labels are specified with the "+PRINT_PARAM_STRING("training")+" and "+PRINT_PARAM_STRING("labels")+" parameters, respectively. Optionally, if "+PRINT_PARAM_STRING("labels")+" is not specified, the labels are assumed " "to be the last dimension of the training dataset." "\n\n" "The training may be performed in batch mode " "(like a typical decision tree algorithm) by specifying the "+PRINT_PARAM_STRING("batch_mode")+" option, but this may not be the best " "option for large datasets." "\n\n" "When a model is trained, it may be saved via the "+PRINT_PARAM_STRING("output_model")+" output parameter. A model may be " "loaded from file for further training or testing with the "+PRINT_PARAM_STRING("input_model")+" parameter." "\n\n" "Test data may be specified with the "+PRINT_PARAM_STRING("test")+" " "parameter, and if performance statistics are desired for that test set, " "labels may be specified with the "+PRINT_PARAM_STRING("test_labels")+" parameter. Predictions for each test point may be saved with the "+PRINT_PARAM_STRING("predictions")+" output parameter, and class " "probabilities for each prediction may be saved with the "+PRINT_PARAM_STRING("probabilities")+" output parameter.") | |
BINDING_EXAMPLE ("For example, to train a Hoeffding tree with confidence 0.99 with data "+PRINT_DATASET("dataset")+", saving the trained tree to "+PRINT_MODEL("tree")+", the following command may be used:" "\n\n"+PRINT_CALL("hoeffding_tree", "training", "dataset", "confidence", 0.99, "output_model", "tree")+"\n\n" "Then, this tree may be used to make predictions on the test set "+PRINT_DATASET("test_set")+", saving the predictions into "+PRINT_DATASET("predictions")+" and the class probabilities into "+PRINT_DATASET("class_probs")+" with the following command: " "\n\n"+PRINT_CALL("hoeffding_tree", "input_model", "tree", "test", "test_set", "predictions", "predictions", "probabilities", "class_probs")) | |
BINDING_SEE_ALSO ("@decision_tree", "#decision_tree") | |
BINDING_SEE_ALSO ("@random_forest", "#random_forest") | |
BINDING_SEE_ALSO ("Mining High-Speed Data Streams (pdf)", "http://dm.cs.washington.edu/papers/vfdt-kdd00.pdf") | |
BINDING_SEE_ALSO ("mlpack::tree::HoeffdingTree class documentation", "@doxygen/classmlpack_1_1tree_1_1HoeffdingTree.html") | |
PARAM_MATRIX_AND_INFO_IN ("training", "Training dataset (may be categorical).", "t") | |
PARAM_UROW_IN ("labels", "Labels for training dataset.", "l") | |
PARAM_DOUBLE_IN ("confidence", "Confidence before splitting (between 0 and 1).", "c", 0.95) | |
PARAM_INT_IN ("max_samples", "Maximum number of samples before splitting.", "n", 5000) | |
PARAM_INT_IN ("min_samples", "Minimum number of samples before splitting.", "I", 100) | |
PARAM_MODEL_IN (HoeffdingTreeModel, "input_model", "Input trained Hoeffding tree" " model.", "m") | |
PARAM_MODEL_OUT (HoeffdingTreeModel, "output_model", "Output for trained " "Hoeffding tree model.", "M") | |
PARAM_MATRIX_AND_INFO_IN ("test", "Testing dataset (may be categorical).", "T") | |
PARAM_UROW_IN ("test_labels", "Labels of test data.", "L") | |
PARAM_UROW_OUT ("predictions", "Matrix to output label predictions for test " "data into.", "p") | |
PARAM_MATRIX_OUT ("probabilities", "In addition to predicting labels, provide " "prediction probabilities in this matrix.", "P") | |
PARAM_STRING_IN ("numeric_split_strategy", "The splitting strategy to use for " "numeric features: 'domingos' or 'binary'.", "N", "binary") | |
PARAM_FLAG ("batch_mode", "If true, samples will be considered in batch instead " "of as a stream. This generally results in better trees but at the cost of" " memory usage and runtime.", "b") | |
PARAM_FLAG ("info_gain", "If set, information gain is used instead of Gini " "impurity for calculating Hoeffding bounds.", "i") | |
PARAM_INT_IN ("passes", "Number of passes to take over the dataset.", "s", 1) | |
PARAM_INT_IN ("bins", "If the 'domingos' split strategy is used, this specifies " "the number of bins for each numeric split.", "B", 10) | |
PARAM_INT_IN ("observations_before_binning", "If the 'domingos' split strategy " "is used, this specifies the number of samples observed before binning is " "performed.", "o", 100) | |
A command-line executable that can build a streaming decision tree.
mlpack is free software; you may redistribute it and/or modify it under the terms of the 3-clause BSD license. You should have received a copy of the 3-clause BSD license along with mlpack. If not, see http://www.opensource.org/licenses/BSD-3-Clause for more information.