|
| BINDING_NAME ("Split Data") |
|
| BINDING_SHORT_DESC ("A utility to split data into a training and testing dataset. This can " "also split labels according to the same split.") |
|
| BINDING_LONG_DESC ("This utility takes a dataset and optionally labels and splits them into a " "training set and a test set. Before the split, the points in the dataset " "are randomly reordered. The percentage of the dataset to be used as the " "test set can be specified with the "+PRINT_PARAM_STRING("test_ratio")+" parameter; the default is 0.2 (20%)." "\n\n" "The output training and test matrices may be saved with the "+PRINT_PARAM_STRING("training")+" and "+PRINT_PARAM_STRING("test")+" output parameters." "\n\n" "Optionally, labels can also be split along with the data by specifying " "the "+PRINT_PARAM_STRING("input_labels")+" parameter. Splitting " "labels works the same way as splitting the data. The output training and " "test labels may be saved with the "+PRINT_PARAM_STRING("training_labels")+" and "+PRINT_PARAM_STRING("test_labels")+" output parameters, respectively.") |
|
| BINDING_EXAMPLE ("So, a simple example where we want to split the dataset "+PRINT_DATASET("X")+" into "+PRINT_DATASET("X_train")+" and "+PRINT_DATASET("X_test")+" with 60% of the data in the training set and " "40% of the dataset in the test set, we could run " "\n\n"+PRINT_CALL("preprocess_split", "input", "X", "training", "X_train", "test", "X_test", "test_ratio", 0.4)+"\n\n" "Also by default the dataset is shuffled and split; you can provide the "+PRINT_PARAM_STRING("no_shuffle")+" option to avoid shuffling the " "data; an example to avoid shuffling of data is:" "\n\n"+PRINT_CALL("preprocess_split", "input", "X", "training", "X_train", "test", "X_test", "test_ratio", 0.4, "no_shuffle", true)+"\n\n" "If we had a dataset "+PRINT_DATASET("X")+" and associated labels "+PRINT_DATASET("y")+", and we wanted to split these into "+PRINT_DATASET("X_train")+", "+PRINT_DATASET("y_train")+", "+PRINT_DATASET("X_test")+", and "+PRINT_DATASET("y_test")+", with 30% " "of the data in the test set, we could run" "\n\n"+PRINT_CALL("preprocess_split", "input", "X", "input_labels", "y", "test_ratio", 0.3, "training", "X_train", "training_labels", "y_train", "test", "X_test", "test_labels", "y_test")) |
|
| BINDING_EXAMPLE ("To maintain the ratio of each class in the train and test sets, the "+PRINT_PARAM_STRING("stratify_data")+" option can be used." "\n\n"+PRINT_CALL("preprocess_split", "input", "X", "training", "X_train", "test", "X_test", "test_ratio", 0.4, "stratify_data", true)) |
|
| BINDING_SEE_ALSO ("@preprocess_binarize", "#preprocess_binarize") |
|
| BINDING_SEE_ALSO ("@preprocess_describe", "#preprocess_describe") |
|
| BINDING_SEE_ALSO ("@preprocess_imputer", "#preprocess_imputer") |
|
| PARAM_MATRIX_IN_REQ ("input", "Matrix containing data.", "i") |
|
| PARAM_MATRIX_OUT ("training", "Matrix to save training data to.", "t") |
|
| PARAM_MATRIX_OUT ("test", "Matrix to save test data to.", "T") |
|
| PARAM_UMATRIX_IN ("input_labels", "Matrix containing labels.", "I") |
|
| PARAM_UMATRIX_OUT ("training_labels", "Matrix to save train labels to.", "l") |
|
| PARAM_UMATRIX_OUT ("test_labels", "Matrix to save test labels to.", "L") |
|
| PARAM_DOUBLE_IN ("test_ratio", "Ratio of test set; if not set, " "the ratio defaults to 0.2", "r", 0.2) |
|
| PARAM_INT_IN ("seed", "Random seed (0 for std::time(NULL)).", "s", 0) |
|
| PARAM_FLAG ("no_shuffle", "Avoid shuffling the data before splitting.", "S") |
|
- Author
- Keon Kim
A binding to split a dataset.
mlpack is free software; you may redistribute it and/or modify it under the terms of the 3-clause BSD license. You should have received a copy of the 3-clause BSD license along with mlpack. If not, see http://www.opensource.org/licenses/BSD-3-Clause for more information.