mlpack
Functions
preprocess_split_main.cpp File Reference
#include <mlpack/prereqs.hpp>
#include <mlpack/core/util/io.hpp>
#include <mlpack/core/util/mlpack_main.hpp>
#include <mlpack/core/math/random.hpp>
#include <mlpack/core/data/split_data.hpp>
Include dependency graph for preprocess_split_main.cpp:
This graph shows which files directly or indirectly include this file:

Functions

 BINDING_NAME ("Split Data")
 
 BINDING_SHORT_DESC ("A utility to split data into a training and testing dataset. This can " "also split labels according to the same split.")
 
 BINDING_LONG_DESC ("This utility takes a dataset and optionally labels and splits them into a " "training set and a test set. Before the split, the points in the dataset " "are randomly reordered. The percentage of the dataset to be used as the " "test set can be specified with the "+PRINT_PARAM_STRING("test_ratio")+" parameter; the default is 0.2 (20%)." "\n\n" "The output training and test matrices may be saved with the "+PRINT_PARAM_STRING("training")+" and "+PRINT_PARAM_STRING("test")+" output parameters." "\n\n" "Optionally, labels can also be split along with the data by specifying " "the "+PRINT_PARAM_STRING("input_labels")+" parameter. Splitting " "labels works the same way as splitting the data. The output training and " "test labels may be saved with the "+PRINT_PARAM_STRING("training_labels")+" and "+PRINT_PARAM_STRING("test_labels")+" output parameters, respectively.")
 
 BINDING_EXAMPLE ("So, a simple example where we want to split the dataset "+PRINT_DATASET("X")+" into "+PRINT_DATASET("X_train")+" and "+PRINT_DATASET("X_test")+" with 60% of the data in the training set and " "40% of the dataset in the test set, we could run " "\n\n"+PRINT_CALL("preprocess_split", "input", "X", "training", "X_train", "test", "X_test", "test_ratio", 0.4)+"\n\n" "Also by default the dataset is shuffled and split; you can provide the "+PRINT_PARAM_STRING("no_shuffle")+" option to avoid shuffling the " "data; an example to avoid shuffling of data is:" "\n\n"+PRINT_CALL("preprocess_split", "input", "X", "training", "X_train", "test", "X_test", "test_ratio", 0.4, "no_shuffle", true)+"\n\n" "If we had a dataset "+PRINT_DATASET("X")+" and associated labels "+PRINT_DATASET("y")+", and we wanted to split these into "+PRINT_DATASET("X_train")+", "+PRINT_DATASET("y_train")+", "+PRINT_DATASET("X_test")+", and "+PRINT_DATASET("y_test")+", with 30% " "of the data in the test set, we could run" "\n\n"+PRINT_CALL("preprocess_split", "input", "X", "input_labels", "y", "test_ratio", 0.3, "training", "X_train", "training_labels", "y_train", "test", "X_test", "test_labels", "y_test"))
 
 BINDING_EXAMPLE ("To maintain the ratio of each class in the train and test sets, the "+PRINT_PARAM_STRING("stratify_data")+" option can be used." "\n\n"+PRINT_CALL("preprocess_split", "input", "X", "training", "X_train", "test", "X_test", "test_ratio", 0.4, "stratify_data", true))
 
 BINDING_SEE_ALSO ("@preprocess_binarize", "#preprocess_binarize")
 
 BINDING_SEE_ALSO ("@preprocess_describe", "#preprocess_describe")
 
 BINDING_SEE_ALSO ("@preprocess_imputer", "#preprocess_imputer")
 
 PARAM_MATRIX_IN_REQ ("input", "Matrix containing data.", "i")
 
 PARAM_MATRIX_OUT ("training", "Matrix to save training data to.", "t")
 
 PARAM_MATRIX_OUT ("test", "Matrix to save test data to.", "T")
 
 PARAM_UMATRIX_IN ("input_labels", "Matrix containing labels.", "I")
 
 PARAM_UMATRIX_OUT ("training_labels", "Matrix to save train labels to.", "l")
 
 PARAM_UMATRIX_OUT ("test_labels", "Matrix to save test labels to.", "L")
 
 PARAM_DOUBLE_IN ("test_ratio", "Ratio of test set; if not set, " "the ratio defaults to 0.2", "r", 0.2)
 
 PARAM_INT_IN ("seed", "Random seed (0 for std::time(NULL)).", "s", 0)
 
 PARAM_FLAG ("no_shuffle", "Avoid shuffling the data before splitting.", "S")
 

Detailed Description

Author
Keon Kim

A binding to split a dataset.

mlpack is free software; you may redistribute it and/or modify it under the terms of the 3-clause BSD license. You should have received a copy of the 3-clause BSD license along with mlpack. If not, see http://www.opensource.org/licenses/BSD-3-Clause for more information.