mlpack
Functions
kmeans_main.cpp File Reference
#include <mlpack/prereqs.hpp>
#include <mlpack/core/util/io.hpp>
#include <mlpack/core/util/mlpack_main.hpp>
#include "kmeans.hpp"
#include "allow_empty_clusters.hpp"
#include "kill_empty_clusters.hpp"
#include "refined_start.hpp"
#include "kmeans_plus_plus_initialization.hpp"
#include "elkan_kmeans.hpp"
#include "hamerly_kmeans.hpp"
#include "pelleg_moore_kmeans.hpp"
#include "dual_tree_kmeans.hpp"
Include dependency graph for kmeans_main.cpp:
This graph shows which files directly or indirectly include this file:

Functions

 BINDING_NAME ("K-Means Clustering")
 
 BINDING_SHORT_DESC ("An implementation of several strategies for efficient k-means clustering. " "Given a dataset and a value of k, this computes and returns a k-means " "clustering on that data.")
 
 BINDING_LONG_DESC ("This program performs K-Means clustering on the given dataset. It can " "return the learned cluster assignments, and the centroids of the clusters." " Empty clusters are not allowed by default; when a cluster becomes empty," " the point furthest from the centroid of the cluster with maximum variance" " is taken to fill that cluster." "\" "Optionally, the strategy to choose initial centroids can be specified. " "The k-means++ algorithm can be used to choose initial centroids with " "the "+PRINT_PARAM_STRING("kmeans_plus_plus")+" parameter. The " "Bradley and Fayyad approach (\efining initial points for k-means " "clustering\ 1998) can be used to select initial points by specifying " "the "+PRINT_PARAM_STRING("refined_start")+" parameter. This approach " "works by taking random samplings of the dataset; to specify the number of " "samplings, the "+PRINT_PARAM_STRING("samplings")+" parameter is used, " "and to specify the percentage of the dataset to be used in each sample, " "the "+PRINT_PARAM_STRING("percentage")+" parameter is used (it should " "be a value between 0.0 and 1.0)." "\" "There are several options available for the algorithm used for each Lloyd " "iteration, specified with the "+PRINT_PARAM_STRING("algorithm")+" " " option. The standard O(kN) approach can be used ('naive'). Other " "options include the Pelleg-Moore tree-based algorithm ('pelleg-moore'), " "Elkan's triangle-inequality based algorithm ('elkan'), Hamerly's " "modification to Elkan's algorithm ('hamerly'), the dual-tree k-means " "algorithm ('dualtree'), and the dual-tree k-means algorithm using the " "cover tree ('dualtree-covertree')." "\" "The behavior for when an empty cluster is encountered can be modified with" " the "+PRINT_PARAM_STRING("allow_empty_clusters")+" option. When " "this option is specified and there is a cluster owning no points at the " "end of an iteration, that cluster's centroid will simply remain in its " "position from the previous iteration. If the "+PRINT_PARAM_STRING("kill_empty_clusters")+" option is specified, then " "when a cluster owns no points at the end of an iteration, the cluster " "centroid is simply filled with DBL_MAX, killing it and effectively " "reducing k for the rest of the computation. Note that the default option " "when neither empty cluster option is specified can be time-consuming to " "calculate; therefore, specifying either of these parameters will often " "accelerate runtime." "\" "Initial clustering assignments may be specified using the "+PRINT_PARAM_STRING("initial_centroids")+" parameter, and the maximum " "number of iterations may be specified with the "+PRINT_PARAM_STRING("max_iterations")+" parameter.")
 
 BINDING_EXAMPLE ("As an example, to use Hamerly's algorithm to perform k-means clustering " "with k=10 on the dataset "+PRINT_DATASET("data")+", saving the " "centroids to "+PRINT_DATASET("centroids")+" and the assignments for " "each point to "+PRINT_DATASET("assignments")+", the following " "command could be used:" "\"+PRINT_CALL("kmeans", "input", "data", "clusters", 10, "output", "assignments", "centroid", "centroids")+"\" "To run k-means on that same dataset with initial centroids specified in "+PRINT_DATASET("initial")+" with a maximum of 500 iterations, " "storing the output centroids in "+PRINT_DATASET("final")+" the " "following command may be used:" "\"+PRINT_CALL("kmeans", "input", "data", "initial_centroids", "initial", "clusters", 10, "max_iterations", 500, "centroid", "final"))
 
 BINDING_SEE_ALSO ("K-Means tutorial", "@doxygen/kmtutorial.html")
 
 BINDING_SEE_ALSO ("@dbscan", "#dbscan")
 
 BINDING_SEE_ALSO ("k-means++", "https://en.wikipedia.org/wiki/K-means%2B%2B")
 
 BINDING_SEE_ALSO ("Using the triangle inequality to accelerate k-means (pdf)", "http://www.aaai.org/Papers/ICML/2003/ICML03-022.pdf")
 
 BINDING_SEE_ALSO ("Making k-means even faster (pdf)", "http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.586.2554" "&rep=rep1&type=pdf")
 
 BINDING_SEE_ALSO ("Accelerating exact k-means algorithms with geometric" " reasoning (pdf)", "http://reports-archive.adm.cs.cmu.edu/anon/anon" "/usr/ftp/usr0/ftp/2000/CMU-CS-00-105.pdf")
 
 BINDING_SEE_ALSO ("A dual-tree algorithm for fast k-means clustering with large " "k (pdf)", "http://www.ratml.org/pub/pdf/2017dual.pdf")
 
 BINDING_SEE_ALSO ("mlpack::kmeans::KMeans class documentation", "@doxygen/classmlpack_1_1kmeans_1_1KMeans.html")
 
 PARAM_MATRIX_IN_REQ ("input", "Input dataset to perform clustering on.", "i")
 
 PARAM_INT_IN_REQ ("clusters", "Number of clusters to find (0 autodetects from " "initial centroids).", "c")
 
 PARAM_FLAG ("in_place", "If specified, a column containing the learned cluster " "assignments will be added to the input dataset file. In this case, " "--output_file is overridden. (Do not use in Python.)", "P")
 
 PARAM_MATRIX_OUT ("output", "Matrix to store output labels or labeled data to.", "o")
 
 PARAM_MATRIX_OUT ("centroid", "If specified, the centroids of each cluster will " " be written to the given file.", "C")
 
 PARAM_FLAG ("allow_empty_clusters", "Allow empty clusters to be persist.", "e")
 
 PARAM_FLAG ("kill_empty_clusters", "Remove empty clusters when they occur.", "E")
 
 PARAM_FLAG ("labels_only", "Only output labels into output file.", "l")
 
 PARAM_INT_IN ("max_iterations", "Maximum number of iterations before k-means " "terminates.", "m", 1000)
 
 PARAM_INT_IN ("seed", "Random seed. If 0, 'std::time(NULL)' is used.", "s", 0)
 
 PARAM_MATRIX_IN ("initial_centroids", "Start with the specified initial " "centroids.", "I")
 
 PARAM_FLAG ("refined_start", "Use the refined initial point strategy by Bradley " "and Fayyad to choose initial points.", "r")
 
 PARAM_INT_IN ("samplings", "Number of samplings to perform for refined start " "(use when --refined_start is specified).", "S", 100)
 
 PARAM_DOUBLE_IN ("percentage", "Percentage of dataset to use for each refined " "start sampling (use when --refined_start is specified).", "p", 0.02)
 
 PARAM_FLAG ("kmeans_plus_plus", "Use the k-means++ initialization strategy to " "choose initial points.", "K")
 
 PARAM_STRING_IN ("algorithm", "Algorithm to use for the Lloyd iteration " "('naive', 'pelleg-moore', 'elkan', 'hamerly', 'dualtree', or " "'dualtree-covertree').", "a", "naive")
 
template<typename InitialPartitionPolicy >
void FindEmptyClusterPolicy (const InitialPartitionPolicy &ipp)
 
template<typename InitialPartitionPolicy , typename EmptyClusterPolicy >
void FindLloydStepType (const InitialPartitionPolicy &ipp)
 
template<typename InitialPartitionPolicy , typename EmptyClusterPolicy , template< class, class > class LloydStepType>
void RunKMeans (const InitialPartitionPolicy &ipp)
 

Detailed Description

Author
Ryan Curtin

Executable for running K-Means.

mlpack is free software; you may redistribute it and/or modify it under the terms of the 3-clause BSD license. You should have received a copy of the 3-clause BSD license along with mlpack. If not, see http://www.opensource.org/licenses/BSD-3-Clause for more information.