mlpack
one_hot_encoding_impl.hpp
Go to the documentation of this file.
1 
13 #ifndef MLPACK_CORE_DATA_ONE_HOT_ENCODING_IMPL_HPP
14 #define MLPACK_CORE_DATA_ONE_HOT_ENCODING_IMPL_HPP
15 
// In case it hasn't been included yet.
#include "one_hot_encoding.hpp"

// Standard library facilities used directly by the implementations below.
#include <stdexcept>
#include <unordered_map>
#include <vector>
18 
19 namespace mlpack {
20 namespace data {
21 
/**
 * One-hot encode a sequence of labels.
 *
 * Every distinct value in `labelsIn` is assigned a class index in order of
 * first appearance: the first distinct value gets row 0, the next previously
 * unseen value gets row 1, and so on.  `output` is then resized to
 * (number of distinct labels) x (number of labels) and zero-filled, except
 * that column i holds a 1 in the row of the class that labelsIn[i] belongs to.
 *
 * @tparam RowType Label container; must provide `elem_type`, `n_elem` and
 *     `operator[]`.
 * @tparam MatType Output matrix type; must provide `zeros(rows, cols)` and
 *     `operator()(row, col)`.
 * @param labelsIn Labels to encode.
 * @param output Matrix that receives the encoding; its previous size and
 *     contents are discarded.
 */
template<typename RowType, typename MatType>
void OneHotEncoding(const RowType& labelsIn,
                    MatType& output)
{
  // Class index of every input label, in input order.
  std::vector<size_t> labels(labelsIn.n_elem);

  // The map is keyed on the *input* element type.  Keying it on the output
  // element type would convert each label before hashing, so distinct labels
  // could silently collapse into one class (e.g. 1.2 and 1.7 when the output
  // matrix holds integers).
  std::unordered_map<typename RowType::elem_type, size_t> labelMap;
  for (size_t i = 0; i < labelsIn.n_elem; ++i)
  {
    // emplace() only inserts when the label has not been seen yet.  A new
    // label takes the next free class index, which is the number of classes
    // registered so far; either way the returned iterator points at the
    // label's class index.
    labels[i] = labelMap.emplace(labelsIn[i], labelMap.size()).first->second;
  }

  // Resize the output matrix to the necessary size, and fill it with zeros.
  output.zeros(labelMap.size(), labelsIn.n_elem);

  // Fill ones in at the required places.
  for (size_t i = 0; i < labelsIn.n_elem; ++i)
    output(labels[i], i) = 1;
}
67 
77 template<typename eT>
78 void OneHotEncoding(const arma::Mat<eT>& input,
79  const arma::Col<size_t>& indices,
80  arma::Mat<eT>& output)
81 {
82  // Handle the edge case where there is nothing to encode.
83  if (indices.n_elem == 0)
84  {
85  output = input;
86  return;
87  }
88 
89  // First, we need to compute the size of the output matrix.
90 
91  // This vector will eventually hold the offsets for each dimension in the
92  // one-hot encoded matrix, but first it will just hold the counts of
93  // dimensions for each dimension.
94  arma::Col<size_t> dimensionOffsets(input.n_rows, arma::fill::ones);
95  // This will hold the mappings from a value that should be one-hot encoded to
96  // the index of the dimension it should take.
97  std::unordered_map<size_t, std::unordered_map<eT, size_t>> mappings;
98  for (size_t i = 0; i < indices.n_elem; ++i)
99  {
100  dimensionOffsets[indices[i]] = 0;
101  mappings.insert(
102  std::make_pair(indices[i], std::unordered_map<eT, size_t>()));
103  }
104 
105  for (size_t col = 0; col < input.n_cols; ++col)
106  {
107  for (size_t row = 0; row < input.n_rows; ++row)
108  {
109  if (mappings.count(row) != 0)
110  {
111  // We have to one-hot encode this point.
112  if (mappings[row].count(input(row, col)) == 0)
113  mappings[row][input(row, col)] = dimensionOffsets[row]++;
114  }
115  }
116  }
117 
118  // Turn the dimension counts into offsets. Note that the last element is the
119  // total number of dimensions, and the first element is the offset for
120  // dimension *2* (not 1).
121  for (size_t i = 1; i < dimensionOffsets.n_elem; ++i)
122  dimensionOffsets[i] += dimensionOffsets[i - 1];
123 
124  // Now, initialize the output matrix to the right size.
125  output.zeros(dimensionOffsets[dimensionOffsets.n_elem - 1], input.n_cols);
126 
127  // Finally, one-hot encode the matrix.
128  for (size_t col = 0; col < input.n_cols; ++col)
129  {
130  for (size_t row = 0; row < input.n_rows; ++row)
131  {
132  const size_t dimOffset = (row == 0) ? 0 : dimensionOffsets[row - 1];
133  if (mappings.count(row) != 0)
134  {
135  output(dimOffset + mappings[row][input(row, col)], col) = eT(1);
136  }
137  else
138  {
139  // No need for one-hot encoding.
140  output(dimOffset, col) = input(row, col);
141  }
142  }
143  }
144 }
145 
156 template<typename eT>
157 void OneHotEncoding(const arma::Mat<eT>& input,
158  arma::Mat<eT>& output,
159  const data::DatasetInfo& datasetInfo)
160 {
161  std::vector<size_t> indices;
162  for (size_t i = 0; i < datasetInfo.Dimensionality(); ++i)
163  {
164  if (datasetInfo.Type(i) == data::Datatype::categorical)
165  {
166  indices.push_back(i);
167  }
168  }
169  OneHotEncoding(input, arma::Col<size_t>(indices), output);
170 }
171 
172 } // namespace data
173 } // namespace mlpack
174 
175 #endif
Auxiliary information for a dataset, including mappings to/from strings (or other types) and the data...
Definition: dataset_mapper.hpp:41
Linear algebra utility functions, generally performed on matrices or vectors.
Definition: cv.hpp:1
void OneHotEncoding(const RowType &labelsIn, MatType &output)
Given a set of labels of a particular datatype, convert them to binary vector.
Definition: one_hot_encoding_impl.hpp:33
size_t Dimensionality() const
Get the dimensionality of the DatasetMapper object (that is, how many dimensions it has information f...
Definition: dataset_mapper_impl.hpp:228
Datatype Type(const size_t dimension) const
Return the type of a given dimension (numeric or categorical).
Definition: dataset_mapper_impl.hpp:196