mlpack
string_encoding_impl.hpp
Go to the documentation of this file.
1 
13 #ifndef MLPACK_CORE_DATA_STRING_ENCODING_IMPL_HPP
14 #define MLPACK_CORE_DATA_STRING_ENCODING_IMPL_HPP
15 
16 // In case it hasn't been included yet.
17 #include "string_encoding.hpp"
18 #include <type_traits>
19 
20 namespace mlpack {
21 namespace data {
22 
// Variadic constructor: every argument is perfectly forwarded to the
// constructor of the encodingPolicy member. The dictionary member is not
// named in the initializer list.
// NOTE(review): the declarator line (listing line 25) is absent from this
// listing; only the parameter list and the initializer list are visible.
23 template<typename EncodingPolicyType, typename DictionaryType>
24 template<typename ... ArgTypes>
26  ArgTypes&& ... args) :
27  encodingPolicy(std::forward<ArgTypes>(args)...)
28 { }
29 
// Constructor taking an encoding policy by value; the parameter is moved into
// the encodingPolicy member. The dictionary member is not named in the
// initializer list.
// NOTE(review): the declarator line (listing line 31) is absent from this
// listing.
30 template<typename EncodingPolicyType, typename DictionaryType>
32  EncodingPolicyType encodingPolicy) :
33  encodingPolicy(std::move(encodingPolicy))
34 { }
35 
// Constructor from a non-const StringEncoding lvalue: copies the policy and
// the dictionary of `other`.
// NOTE(review): presumably provided so that copying a non-const object does
// not select the variadic forwarding constructor -- confirm against the class
// declaration. The declarator line (listing line 37) is absent from this
// listing.
36 template<typename EncodingPolicyType, typename DictionaryType>
38  StringEncoding& other) :
39  encodingPolicy(other.encodingPolicy),
40  dictionary(other.dictionary)
41 { }
42 
// Copy constructor: copies the policy and the dictionary of `other`.
// NOTE(review): the declarator line (listing line 44) is absent from this
// listing.
43 template<typename EncodingPolicyType, typename DictionaryType>
45  const StringEncoding& other) :
46  encodingPolicy(other.encodingPolicy),
47  dictionary(other.dictionary)
48 { }
49 
// Move constructor: moves the policy and the dictionary out of `other`.
// NOTE(review): the declarator line (listing line 51) is absent from this
// listing.
50 template<typename EncodingPolicyType, typename DictionaryType>
52  StringEncoding&& other) :
53  encodingPolicy(std::move(other.encodingPolicy)),
54  dictionary(std::move(other.dictionary))
55 { }
56 
// Clear the dictionary by delegating to dictionary.Clear(). The encoding
// policy is not touched here.
// NOTE(review): the declarator line (listing line 58) is absent from this
// listing.
57 template<typename EncodingPolicyType, typename DictionaryType>
59 {
60  dictionary.Clear();
61 }
62 
63 template<typename EncodingPolicyType, typename DictionaryType>
64 template<typename TokenizerType>
66  const std::string& input,
67  const TokenizerType& tokenizer)
68 {
69  boost::string_view strView(input);
70  auto token = tokenizer(strView);
71 
72  static_assert(
73  std::is_same<typename std::remove_reference<decltype(token)>::type,
74  typename std::remove_reference<typename DictionaryType::
75  TokenType>::type>::value,
76  "The dictionary token type doesn't match the return value type "
77  "of the tokenizer.");
78 
79  // The loop below adds the extracted tokens to the dictionary.
80  while (!tokenizer.IsTokenEmpty(token))
81  {
82  if (!dictionary.HasToken(token))
83  dictionary.AddToken(std::move(token));
84 
85  token = tokenizer(strView);
86  }
87 }
88 
// Encode the given strings and write the result to `output`.
// The call is forwarded unchanged to EncodeHelper, with the stored
// encodingPolicy member passed as the policy argument.
// NOTE(review): the declarator line (listing line 91) is absent from this
// listing.
89 template<typename EncodingPolicyType, typename DictionaryType>
90 template<typename OutputType, typename TokenizerType>
92  const std::vector<std::string>& input,
93  OutputType& output,
94  const TokenizerType& tokenizer)
95 {
96  EncodeHelper(input, output, tokenizer, encodingPolicy);
97 }
98 
99 
100 template<typename EncodingPolicyType, typename DictionaryType>
101 template<typename MatType, typename TokenizerType, typename PolicyType>
103 EncodeHelper(const std::vector<std::string>& input,
104  MatType& output,
105  const TokenizerType& tokenizer,
106  PolicyType& policy)
107 {
108  size_t numColumns = 0;
109 
110  policy.Reset();
111 
112  // The first pass adds the extracted tokens to the dictionary.
113  for (size_t i = 0; i < input.size(); ++i)
114  {
115  boost::string_view strView(input[i]);
116  auto token = tokenizer(strView);
117 
118  static_assert(
119  std::is_same<typename std::remove_reference<decltype(token)>::type,
120  typename std::remove_reference<typename DictionaryType::
121  TokenType>::type>::value,
122  "The dictionary token type doesn't match the return value type "
123  "of the tokenizer.");
124 
125  size_t numTokens = 0;
126 
127  while (!tokenizer.IsTokenEmpty(token))
128  {
129  if (!dictionary.HasToken(token))
130  dictionary.AddToken(std::move(token));
131 
132  policy.PreprocessToken(i, numTokens, dictionary.Value(token));
133 
134  token = tokenizer(strView);
135  numTokens++;
136  }
137 
138  numColumns = std::max(numColumns, numTokens);
139  }
140 
141  policy.InitMatrix(output, input.size(), numColumns, dictionary.Size());
142 
143  // The second pass writes the encoded values to the output.
144  for (size_t i = 0; i < input.size(); ++i)
145  {
146  boost::string_view strView(input[i]);
147  auto token = tokenizer(strView);
148  size_t numTokens = 0;
149 
150  while (!tokenizer.IsTokenEmpty(token))
151  {
152  policy.Encode(output, dictionary.Value(token), i, numTokens);
153  token = tokenizer(strView);
154  numTokens++;
155  }
156  }
157 }
158 
159 template<typename EncodingPolicyType, typename DictionaryType>
160 template<typename TokenizerType, typename PolicyType, typename ElemType>
162 EncodeHelper(const std::vector<std::string>& input,
163  std::vector<std::vector<ElemType>>& output,
164  const TokenizerType& tokenizer,
165  PolicyType& policy,
166  typename std::enable_if<StringEncodingPolicyTraits<
167  PolicyType>::onePassEncoding>::type*)
168 {
169  policy.Reset();
170 
171  // The loop below extracts the tokens and writes the encoded values
172  // at once.
173  for (size_t i = 0; i < input.size(); ++i)
174  {
175  boost::string_view strView(input[i]);
176  auto token = tokenizer(strView);
177 
178  static_assert(
179  std::is_same<typename std::remove_reference<decltype(token)>::type,
180  typename std::remove_reference<typename DictionaryType::
181  TokenType>::type>::value,
182  "The dictionary token type doesn't match the return value type "
183  "of the tokenizer.");
184 
185  output.emplace_back();
186 
187  while (!tokenizer.IsTokenEmpty(token))
188  {
189  if (dictionary.HasToken(token))
190  policy.Encode(output[i], dictionary.Value(token));
191  else
192  policy.Encode(output[i], dictionary.AddToken(std::move(token)));
193 
194  token = tokenizer(strView);
195  }
196  }
197 }
198 
// Serialize the object with the given archive: the encodingPolicy member and
// then the dictionary member are passed to the archive as name-value pairs
// (CEREAL_NVP). The version argument is unused.
// NOTE(review): the declarator line (listing line 201) is absent from this
// listing.
199 template<typename EncodingPolicyType, typename DictionaryType>
200 template<typename Archive>
202  Archive& ar, const uint32_t /* version */)
203 {
204  ar(CEREAL_NVP(encodingPolicy));
205  ar(CEREAL_NVP(dictionary));
206 }
207 
208 } // namespace data
209 } // namespace mlpack
210 
211 #endif
This is a template struct that provides some information about various encoding policies.
Definition: policy_traits.hpp:27
void CreateMap(const std::string &input, const TokenizerType &tokenizer)
Initialize the dictionary using the given corpus.
Definition: string_encoding_impl.hpp:65
Linear algebra utility functions, generally performed on matrices or vectors.
Definition: cv.hpp:1
Definition: pointer_wrapper.hpp:23
The class translates a set of strings into numbers using various encoding algorithms.
Definition: string_encoding.hpp:35
void serialize(Archive &ar, const uint32_t)
Serialize the class to the given archive.
Definition: string_encoding_impl.hpp:201
Definition: string_view.hpp:60
StringEncoding(ArgTypes &&... args)
Pass the given arguments to the policy constructor and create the StringEncoding object using the policy.
Definition: string_encoding_impl.hpp:25
void Clear()
Clear the dictionary.
Definition: string_encoding_impl.hpp:58
void Encode(const std::vector< std::string > &input, OutputType &output, const TokenizerType &tokenizer)
Encode the given text and write the result to the given output.
Definition: string_encoding_impl.hpp:91