// Opens the include guard for this implementation header and pulls in
// <type_traits>.
// NOTE(review): this text looks like a lossy copy of a source listing -- the
// leading numbers appear to be the original file's line numbers, and several
// original lines (including each definition's name line and braces) are not
// present here. Confirm against the original header before relying on it.
//
// Variadic constructor (listed in the index at the end of this file as
// StringEncoding(ArgTypes&&... args)): forwards every argument, via
// std::forward, to the constructor of the encodingPolicy member.
13 #ifndef MLPACK_CORE_DATA_STRING_ENCODING_IMPL_HPP 14 #define MLPACK_CORE_DATA_STRING_ENCODING_IMPL_HPP 18 #include <type_traits> 23 template<
typename EncodingPolicyType,
typename DictionaryType>
24 template<
typename ... ArgTypes>
26 ArgTypes&& ... args) :
27 encodingPolicy(
std::forward<ArgTypes>(args)...)
// Constructor taking an already-built policy: the EncodingPolicyType argument
// is received by value and then moved into the encodingPolicy member, so the
// caller decides whether a copy or a move happens at the call site.
// (The name line of this definition is not visible in this listing.)
30 template<
typename EncodingPolicyType,
typename DictionaryType>
32 EncodingPolicyType encodingPolicy) :
33 encodingPolicy(
std::move(encodingPolicy))
// Constructor that copies both members (encodingPolicy and dictionary) from
// `other`.
// NOTE(review): the parameter declaration for `other` is not visible here. The
// next definition has an identical initializer list, so the two presumably
// differ only in the const-ness of `other` -- confirm against the class
// declaration.
36 template<
typename EncodingPolicyType,
typename DictionaryType>
39 encodingPolicy(other.encodingPolicy),
40 dictionary(other.dictionary)
// Second constructor that copies encodingPolicy and dictionary from `other`.
// NOTE(review): its visible initializer list is identical to the preceding
// definition's; the parameter declaration that distinguishes the two overloads
// is not visible in this listing -- confirm against the class declaration.
43 template<
typename EncodingPolicyType,
typename DictionaryType>
46 encodingPolicy(other.encodingPolicy),
47 dictionary(other.dictionary)
// Constructor that takes over the state of `other`: both encodingPolicy and
// dictionary are initialized from std::move(other.<member>).
// (The parameter declaration for `other` is not visible in this listing;
// presumably an rvalue reference -- confirm.)
50 template<
typename EncodingPolicyType,
typename DictionaryType>
53 encodingPolicy(
std::move(other.encodingPolicy)),
54 dictionary(
std::move(other.dictionary))
// Template header of a member definition whose name and body are not visible
// in this listing.
// NOTE(review): the index at the end of this file attributes
// string_encoding_impl.hpp:58 to `void Clear()` ("Clear the dictionary."), so
// this is presumably the definition of Clear() -- confirm against the original.
57 template<
typename EncodingPolicyType,
typename DictionaryType>
// CreateMap (per the index at the end of this file): fills the dictionary with
// the tokens of a single input string.
//
// input     - the text to tokenize.
// tokenizer - callable invoked as tokenizer(strView) to obtain the next token;
//             it must also provide IsTokenEmpty(token) to signal the end.
//
// NOTE(review): the declaration of `strView` is not visible in this listing;
// presumably a string view over `input` -- confirm.
63 template<
typename EncodingPolicyType,
typename DictionaryType>
64 template<
typename TokenizerType>
66 const std::string& input,
67 const TokenizerType& tokenizer)
70 auto token = tokenizer(strView);
// Compile-time check: the tokenizer's return type must equal
// DictionaryType::TokenType once references are stripped from both.
73 std::is_same<
typename std::remove_reference<decltype(token)>::type,
74 typename std::remove_reference<
typename DictionaryType::
75 TokenType>::type>::value,
76 "The dictionary token type doesn't match the return value type " 80 while (!tokenizer.IsTokenEmpty(token))
// Only tokens the dictionary does not hold yet are added. The moved-from token
// is not read again: it is overwritten by the next tokenizer call.
82 if (!dictionary.HasToken(token))
83 dictionary.AddToken(std::move(token));
85 token = tokenizer(strView);
// Encode (per the index at the end of this file): encodes a collection of
// strings into `output`.
//
// input     - the strings to encode.
// output    - destination of type OutputType (its declaration line is not
//             visible in this listing; it is the second argument forwarded
//             below).
// tokenizer - tokenizer forwarded unchanged to the helper.
//
// All work is delegated to EncodeHelper, which additionally receives this
// object's encodingPolicy member.
89 template<
typename EncodingPolicyType,
typename DictionaryType>
90 template<
typename OutputType,
typename TokenizerType>
92 const std::vector<std::string>& input,
94 const TokenizerType& tokenizer)
96 EncodeHelper(input, output, tokenizer, encodingPolicy);
// Two-pass encoding helper writing into a matrix-like output (MatType).
// Presumably the EncodeHelper overload called from Encode -- the name line and
// most of the parameter list are not visible in this listing.
//
// Pass 1 walks every input string, grows the dictionary and lets the policy
// observe each token; pass 2 re-tokenizes every string and writes the encoded
// values once the output has been initialized by the policy.
//
// NOTE(review): `numTokens` is declared as 0 in both passes, but no increment is
// visible in this listing (some original lines are missing) -- confirm against
// the original that it is advanced once per token.
100 template<
typename EncodingPolicyType,
typename DictionaryType>
101 template<
typename MatType,
typename TokenizerType,
typename PolicyType>
105 const TokenizerType& tokenizer,
// Largest per-input token count seen in pass 1; handed to policy.InitMatrix.
108 size_t numColumns = 0;
// Pass 1: build the dictionary and let the policy preprocess each token.
113 for (
size_t i = 0; i < input.size(); ++i)
116 auto token = tokenizer(strView);
// Compile-time check: the tokenizer's return type must equal
// DictionaryType::TokenType once references are stripped from both.
119 std::is_same<
typename std::remove_reference<decltype(token)>::type,
120 typename std::remove_reference<
typename DictionaryType::
121 TokenType>::type>::value,
122 "The dictionary token type doesn't match the return value type " 123 "of the tokenizer.");
125 size_t numTokens = 0;
127 while (!tokenizer.IsTokenEmpty(token))
// NOTE(review): when the token is new it is handed to AddToken via std::move
// and then read again by dictionary.Value(token) in the following statement.
// That is only safe if moving a TokenType leaves it unchanged (e.g. a view or
// an integral type) -- confirm. The one-pass EncodeHelper overload avoids the
// re-read by using the value returned from AddToken.
129 if (!dictionary.HasToken(token))
130 dictionary.AddToken(std::move(token));
132 policy.PreprocessToken(i, numTokens, dictionary.Value(token));
134 token = tokenizer(strView);
138 numColumns = std::max(numColumns, numTokens);
// The policy initializes `output` from the number of inputs, the widest token
// count and the dictionary size gathered in pass 1.
141 policy.InitMatrix(output, input.size(), numColumns, dictionary.Size());
// Pass 2: every token is already in the dictionary, so only look up and encode.
144 for (
size_t i = 0; i < input.size(); ++i)
147 auto token = tokenizer(strView);
148 size_t numTokens = 0;
150 while (!tokenizer.IsTokenEmpty(token))
152 policy.Encode(output, dictionary.Value(token), i, numTokens);
153 token = tokenizer(strView);
// One-pass encoding helper writing into std::vector<std::vector<ElemType>>.
// Each input string becomes one row of `output`; the dictionary is extended on
// the fly while encoding.
//
// NOTE(review): the trailing unnamed pointer parameter ending in
// `PolicyType>::onePassEncoding>::type*` looks like an overload-selection
// (enable_if-style) argument keyed on the policy's onePassEncoding trait; the
// start of that expression is not visible in this listing -- confirm.
159 template<
typename EncodingPolicyType,
typename DictionaryType>
160 template<
typename TokenizerType,
typename PolicyType,
typename ElemType>
163 std::vector<std::vector<ElemType>>& output,
164 const TokenizerType& tokenizer,
167 PolicyType>::onePassEncoding>::type*)
173 for (
size_t i = 0; i < input.size(); ++i)
176 auto token = tokenizer(strView);
// Compile-time check: the tokenizer's return type must equal
// DictionaryType::TokenType once references are stripped from both.
179 std::is_same<
typename std::remove_reference<decltype(token)>::type,
180 typename std::remove_reference<
typename DictionaryType::
181 TokenType>::type>::value,
182 "The dictionary token type doesn't match the return value type " 183 "of the tokenizer.");
// Append the (initially empty) row for input i.
// NOTE(review): indexing output[i] below assumes `output` held no rows before
// the loop started; any clearing of `output` is not visible here -- confirm.
185 output.emplace_back();
187 while (!tokenizer.IsTokenEmpty(token))
// Known token: encode its stored value. Unknown token: add it and encode the
// value returned by AddToken, so the moved-from token is never read again.
189 if (dictionary.HasToken(token))
190 policy.Encode(output[i], dictionary.Value(token));
192 policy.Encode(output[i], dictionary.AddToken(std::move(token)));
194 token = tokenizer(strView);
// serialize (per the index at the end of this file): passes both members to
// the archive.
//
// ar       - the archive to use; each member is wrapped in CEREAL_NVP, which
//            pairs the value with its variable name.
// (unnamed uint32_t) - second parameter, unused in the visible body.
199 template<
typename EncodingPolicyType,
typename DictionaryType>
200 template<
typename Archive>
202 Archive& ar,
const uint32_t )
204 ar(CEREAL_NVP(encodingPolicy));
205 ar(CEREAL_NVP(dictionary));
This is a template struct that provides some information about various encoding policies.
Definition: policy_traits.hpp:27
void CreateMap(const std::string &input, const TokenizerType &tokenizer)
Initialize the dictionary using the given corpus.
Definition: string_encoding_impl.hpp:65
Linear algebra utility functions, generally performed on matrices or vectors.
Definition: cv.hpp:1
Definition: pointer_wrapper.hpp:23
The class translates a set of strings into numbers using various encoding algorithms.
Definition: string_encoding.hpp:35
void serialize(Archive &ar, const uint32_t)
Serialize the class to the given archive.
Definition: string_encoding_impl.hpp:201
Definition: string_view.hpp:60
StringEncoding(ArgTypes &&... args)
Pass the given arguments to the policy constructor and create the StringEncoding object using the policy.
Definition: string_encoding_impl.hpp:25
void Clear()
Clear the dictionary.
Definition: string_encoding_impl.hpp:58
void Encode(const std::vector< std::string > &input, OutputType &output, const TokenizerType &tokenizer)
Encode the given text and write the result to the given output.
Definition: string_encoding_impl.hpp:91