mlpack
string_encoding_dictionary.hpp
Go to the documentation of this file.
1 
13 #ifndef MLPACK_CORE_DATA_STRING_ENCODING_DICTIONARY_HPP
14 #define MLPACK_CORE_DATA_STRING_ENCODING_DICTIONARY_HPP
15 
16 #include <mlpack/prereqs.hpp>
18 #include <unordered_map>
19 #include <deque>
20 #include <array>
21 
22 namespace mlpack {
23 namespace data {
24 
31 template<typename Token>
33 {
34  public:
36  using MapType = std::unordered_map<Token, size_t>;
37 
39  using TokenType = Token;
40 
/**
 * Check whether the dictionary contains the given token.
 *
 * @param token Token to look up.
 * @return true if `token` is a key of `mapping`, false otherwise.
 */
46  bool HasToken(const Token& token) const
47  {
48  return mapping.find(token) != mapping.end();
49  }
50 
/**
 * Add the given token to the dictionary and assign it a label.
 *
 * The label is the number of entries in `mapping` before the call, plus one.
 * NOTE(review): if `token` is already a key, operator[] overwrites its label
 * instead of inserting a new entry; presumably callers check HasToken()
 * first -- confirm.
 *
 * @tparam T Deduced type of the forwarded token.
 * @param token Token to store; it is perfectly forwarded into the map key.
 * @return The label assigned to the token.
 */
58  template<typename T>
59  size_t AddToken(T&& token)
60  {
61  size_t size = mapping.size();
62 
63  mapping[std::forward<T>(token)] = ++size;
64 
65  return size;
66  }
67 
/**
 * Return the label assigned to the given token.
 *
 * @param token Token to look up.
 * @return The label stored in `mapping` for `token`.
 * @throws std::out_of_range (from `mapping.at()`) if the token is absent.
 */
74  size_t Value(const Token& token) const
75  {
76  return mapping.at(token);
77  }
78 
//! Get the size of the dictionary, i.e. the number of entries in `mapping`.
80  size_t Size() const { return mapping.size(); }
81 
//! Clear the dictionary by removing every entry from `mapping`.
83  void Clear()
84  {
85  mapping.clear();
86  }
87 
//! Get the mapping (read-only reference to the token -> label map).
89  const MapType& Mapping() const { return mapping; }
//! Modify the mapping (mutable reference to the token -> label map).
91  MapType& Mapping() { return mapping; }
92 
/**
 * Serialize the class to the given archive.
 *
 * Only `mapping` is archived, as a name-value pair; the version argument is
 * not used.
 *
 * @param ar Archive to read from or write to.
 */
96  template<typename Archive>
97  void serialize(Archive& ar, const uint32_t /* version */)
98  {
99  ar(CEREAL_NVP(mapping));
100  }
101 
102  private:
104  MapType mapping;
105 };
106 
107 /*
108  * Specialization of the StringEncodingDictionary class for boost::string_view.
109  */
110 template<>
111 class StringEncodingDictionary<boost::string_view>
112 {
113  public:
115  using MapType = std::unordered_map<
117  size_t,
119 
121  using TokenType = boost::string_view;
122 
124  StringEncodingDictionary() = default;
125 
128  tokens(other.tokens)
129  {
// The mapping is rebuilt entry by entry rather than copied: each key is
// created from a string in this object's own `tokens`, and the label is
// looked up in `other.mapping`.
130  for (const std::string& token : tokens)
131  mapping[token] = other.mapping.at(token);
132  }
133 
136 
139  {
     // Self-assignment guard.  Without it, mapping.clear() below would also
     // empty other.mapping (the same object) and the at() lookups in the loop
     // would throw std::out_of_range.
     if (this == &other)
       return *this;
140  tokens = other.tokens;
141  mapping.clear();
142 
     // Rebuild the mapping from this object's own copy of the tokens, taking
     // each label from other.mapping.
143  for (const std::string& token : tokens)
144  mapping[token] = other.mapping.at(token);
145 
146  return *this;
147  }
148 
//! Move assignment operator; defaulted, so `tokens` and `mapping` are
//! move-assigned member by member.
//! NOTE(review): the keys of `mapping` are views of strings owned by
//! `tokens`; this presumably relies on the moved deque keeping its elements
//! at the same addresses -- confirm.
150  StringEncodingDictionary& operator=(
151  StringEncodingDictionary&& other) = default;
152 
/**
 * Check whether the dictionary contains the given token.
 *
 * @param token Token to look up.
 * @return true if `token` is a key of `mapping`, false otherwise.
 */
158  bool HasToken(const boost::string_view token) const
159  {
160  return mapping.find(token) != mapping.end();
161  }
162 
/**
 * Add the given token to the dictionary and assign it a label.
 *
 * A std::string built from `token` is appended to `tokens`, and the map entry
 * is keyed on that stored string (`tokens.back()`), not on the caller's view.
 * The label is the number of entries in `mapping` before the call, plus one.
 * NOTE(review): if an equal token is already present, a duplicate string is
 * still appended to `tokens` and the existing label is overwritten;
 * presumably callers check HasToken() first -- confirm.
 *
 * @param token Token to add.
 * @return The label assigned to the token.
 */
170  size_t AddToken(const boost::string_view token)
171  {
172  tokens.emplace_back(token);
173 
174  size_t size = mapping.size();
175 
176  mapping[tokens.back()] = ++size;
177 
178  return size;
179  }
180 
/**
 * Return the label assigned to the given token.
 *
 * @param token Token to look up.
 * @return The label stored in `mapping` for `token`.
 * @throws std::out_of_range (from `mapping.at()`) if the token is absent.
 */
187  size_t Value(const boost::string_view token) const
188  {
189  return mapping.at(token);
190  }
191 
//! Get the size of the dictionary, i.e. the number of entries in `mapping`.
193  size_t Size() const { return mapping.size(); }
194 
//! Clear the dictionary: empties `mapping` first, then the `tokens` storage
//! from which its keys were created.
196  void Clear()
197  {
198  mapping.clear();
199  tokens.clear();
200  }
201 
//! Get the tokens (read-only reference to the stored token strings).
203  const std::deque<std::string>& Tokens() const { return tokens; }
//! Modify the tokens (mutable reference to the stored token strings).
205  std::deque<std::string>& Tokens() { return tokens; }
206 
//! Get the mapping (read-only reference to the token -> label map).
208  const MapType& Mapping() const { return mapping; }
//! Modify the mapping (mutable reference to the token -> label map).
210  MapType& Mapping() { return mapping; }
211 
/**
 * Serialize the class to or from the given archive.
 *
 * The number of tokens is archived first, followed by one
 * (token, tokenValue) pair per token, in the order of `tokens`.  The version
 * argument is not used.
 *
 * @param ar Archive to read from or write to.
 */
215  template<typename Archive>
216  void serialize(Archive& ar, const uint32_t /* version */)
217  {
218  size_t numTokens = tokens.size();
219 
220  ar(CEREAL_NVP(numTokens));
221 
222  if (cereal::is_loading<Archive>())
223  {
     // Start from an empty dictionary.  The keys of `mapping` are created
     // from the strings held in `tokens`, so entries left over from a previous
     // state must be dropped before those strings are resized or overwritten;
     // otherwise stale keys would survive the load.
     mapping.clear();
     tokens.clear();
224  tokens.resize(numTokens);
225 
226  for (std::string& token : tokens)
227  {
228  ar(CEREAL_NVP(token));
229 
230  size_t tokenValue = 0;
231  ar(CEREAL_NVP(tokenValue));
232  mapping[token] = tokenValue;
233  }
234  }
235  if (cereal::is_saving<Archive>())
236  {
     // Write each stored token followed by its label.
237  for (std::string& token : tokens)
238  {
239  ar(CEREAL_NVP(token));
240 
241  size_t tokenValue = mapping.at(token);
242  ar(CEREAL_NVP(tokenValue));
243  }
244  }
245  }
246 
247  private:
249  std::deque<std::string> tokens;
250 
252  MapType mapping;
253 };
254 
255 template<>
257 {
258  public:
260  using MapType = std::array<size_t, 1 << CHAR_BIT>;
261 
263  using TokenType = int;
264 
267  size(0)
268  {
// Every slot starts at 0, the value HasToken() treats as "no label stored".
269  mapping.fill(0);
270  }
271 
/**
 * Check whether the dictionary contains the given token.
 *
 * The token is used directly as an index into `mapping`; no bounds check is
 * performed.  NOTE(review): assumes 0 <= token < mapping.size() -- confirm
 * against callers.
 *
 * @param token Token to look up.
 * @return true if the slot for `token` holds a non-zero label.
 */
278  bool HasToken(const int token) const
279  {
280  return mapping[token] > 0;
281  }
282 
/**
 * Add the given token to the dictionary and assign it a label.
 *
 * The `size` counter is incremented and its new value is stored in the slot
 * indexed by `token`.  No bounds check is performed.  NOTE(review): assumes
 * 0 <= token < mapping.size() -- confirm against callers.
 *
 * @param token Token to add.
 * @return The label assigned to the token.
 */
291  size_t AddToken(const int token)
292  {
293  mapping[token] = ++size;
294 
295  return size;
296  }
297 
/**
 * Return the label assigned to the given token.
 *
 * The slot indexed by `token` is returned as-is, without a bounds check.  A
 * slot that has never been written by AddToken() holds 0.
 *
 * @param token Token to look up.
 * @return The value stored in `mapping[token]`.
 */
305  size_t Value(const int token) const
306  {
307  return mapping[token];
308  }
309 
//! Get the size of the dictionary, as tracked by the `size` counter.
311  size_t Size() const
312  {
313  return size;
314  }
315 
//! Clear the dictionary: zero every slot of `mapping` and reset the label
//! counter, so that Size() reports 0 and the next AddToken() returns 1.
317  void Clear()
318  {
319  mapping.fill(0);
     // The counter must be reset together with the slots; otherwise Size()
     // keeps reporting the old count and new labels continue from it.
     size = 0;
320  }
321 
//! Get the mapping (read-only reference to the label array).
323  const MapType& Mapping() const { return mapping; }
//! Modify the mapping (mutable reference to the label array).
325  MapType& Mapping() { return mapping; }
326 
/**
 * Serialize the class to the given archive.
 *
 * `mapping` is archived first, then `size`, each as a name-value pair; the
 * version argument is not used.
 *
 * @param ar Archive to read from or write to.
 */
330  template<typename Archive>
331  void serialize(Archive& ar, const uint32_t /* version */)
332  {
333  ar(CEREAL_NVP(mapping));
334  ar(CEREAL_NVP(size));
335  }
336 
337  private:
339  MapType mapping;
340 
342  size_t size;
343 };
344 
345 } // namespace data
346 } // namespace mlpack
347 
348 #endif
size_t AddToken(const boost::string_view token)
The function adds the given token to the dictionary and assigns a label to the token.
Definition: string_encoding_dictionary.hpp:170
MapType & Mapping()
Modify the mapping.
Definition: string_encoding_dictionary.hpp:325
const std::deque< std::string > & Tokens() const
Get the tokens.
Definition: string_encoding_dictionary.hpp:203
bool HasToken(const boost::string_view token) const
The function returns true if the dictionary contains the given token.
Definition: string_encoding_dictionary.hpp:158
Definition: bernoulli.hpp:17
Linear algebra utility functions, generally performed on matrices or vectors.
Definition: cv.hpp:1
Definition: boost_backport_string_view.hpp:33
std::unordered_map< Token, size_t > MapType
A convenient alias for the internal type of the map.
Definition: string_encoding_dictionary.hpp:36
const MapType & Mapping() const
Get the mapping.
Definition: string_encoding_dictionary.hpp:89
The core includes that mlpack expects; standard C++ includes and Armadillo.
void serialize(Archive &ar, const uint32_t)
Serialize the class to the given archive.
Definition: string_encoding_dictionary.hpp:97
const MapType & Mapping() const
Get the mapping.
Definition: string_encoding_dictionary.hpp:208
int TokenType
The type of the token that the dictionary stores.
Definition: string_encoding_dictionary.hpp:263
size_t AddToken(T &&token)
The function adds the given token to the dictionary and assigns a label to the token.
Definition: string_encoding_dictionary.hpp:59
StringEncodingDictionary(const StringEncodingDictionary &other)
Copy-construct the dictionary from the given object.
Definition: string_encoding_dictionary.hpp:127
StringEncodingDictionary & operator=(const StringEncodingDictionary &other)
Copy-assign the dictionary from the given object.
Definition: string_encoding_dictionary.hpp:138
This class provides a dictionary interface for the purpose of string encoding.
Definition: string_encoding_dictionary.hpp:32
size_t Size() const
Get the size of the dictionary.
Definition: string_encoding_dictionary.hpp:311
MapType & Mapping()
Modify the mapping.
Definition: string_encoding_dictionary.hpp:91
Token TokenType
The type of the token that the dictionary stores.
Definition: string_encoding_dictionary.hpp:39
bool HasToken(const int token) const
The function returns true if the dictionary contains the given token.
Definition: string_encoding_dictionary.hpp:278
std::array< size_t, 1<< CHAR_BIT > MapType
A convenient alias for the internal type of the map.
Definition: string_encoding_dictionary.hpp:260
size_t AddToken(const int token)
The function adds the given token to the dictionary and assigns a label to the token.
Definition: string_encoding_dictionary.hpp:291
std::unordered_map< boost::string_view, size_t, boost::hash< boost::string_view > > MapType
A convenient alias for the internal type of the map.
Definition: string_encoding_dictionary.hpp:118
size_t Value(const Token &token) const
The function returns the label assigned to the given token.
Definition: string_encoding_dictionary.hpp:74
MapType & Mapping()
Modify the mapping.
Definition: string_encoding_dictionary.hpp:210
size_t Value(const int token) const
The function returns the label assigned to the given token.
Definition: string_encoding_dictionary.hpp:305
void serialize(Archive &ar, const uint32_t)
Serialize the class to the given archive.
Definition: string_encoding_dictionary.hpp:331
void Clear()
Clear the dictionary.
Definition: string_encoding_dictionary.hpp:196
Definition: string_view.hpp:60
std::deque< std::string > & Tokens()
Modify the tokens.
Definition: string_encoding_dictionary.hpp:205
const MapType & Mapping() const
Get the mapping.
Definition: string_encoding_dictionary.hpp:323
size_t Size() const
Get the size of the dictionary.
Definition: string_encoding_dictionary.hpp:193
void Clear()
Clear the dictionary.
Definition: string_encoding_dictionary.hpp:83
size_t Value(const boost::string_view token) const
The function returns the label assigned to the given token.
Definition: string_encoding_dictionary.hpp:187
void Clear()
Clear the dictionary.
Definition: string_encoding_dictionary.hpp:317
size_t Size() const
Get the size of the dictionary.
Definition: string_encoding_dictionary.hpp:80
bool HasToken(const Token &token) const
The function returns true if the dictionary contains the given token.
Definition: string_encoding_dictionary.hpp:46
void serialize(Archive &ar, const uint32_t)
Serialize the class to the given archive.
Definition: string_encoding_dictionary.hpp:216
StringEncodingDictionary()
Construct an empty dictionary.
Definition: string_encoding_dictionary.hpp:266