13 #ifndef MLPACK_CORE_DATA_STRING_ENCODING_POLICIES_TF_IDF_ENCODING_POLICY_HPP 14 #define MLPACK_CORE_DATA_STRING_ENCODING_POLICIES_TF_IDF_ENCODING_POLICY_HPP 76 const bool smoothIdf =
true) :
86 tokensFrequences.clear();
87 numContainingStrings.clear();
103 template<
typename MatType>
105 const size_t datasetSize,
107 const size_t dictionarySize)
109 output.zeros(dictionarySize, datasetSize);
/**
 * Initialize the output matrix for the std::vector<std::vector<ElemType>>
 * representation: one row per line of the dataset, one column per dictionary
 * entry.
 *
 * @tparam ElemType Type of the output values.
 * @param output Container to initialize; any previous contents are discarded.
 * @param datasetSize Number of rows to create.
 * @param dictionarySize Number of elements in each row.
 *
 * The third parameter is unnamed and unused by this overload.
 */
template<typename ElemType>
static void InitMatrix(std::vector<std::vector<ElemType>>& output,
                       const size_t datasetSize,
                       const size_t /* unused */,
                       const size_t dictionarySize)
{
  // assign() instead of resize(): resize() keeps rows that are already in
  // `output`, so a reused container would retain stale values and possibly
  // rows of the wrong length.  assign() always yields exactly datasetSize
  // rows of dictionarySize value-initialized elements (zero for arithmetic
  // types), matching the full reset done by the matrix overload.
  output.assign(datasetSize, std::vector<ElemType>(dictionarySize));
}
147 template<
typename MatType>
153 const typename MatType::elem_type tf =
154 TermFrequency<typename MatType::elem_type>(
155 tokensFrequences[line][value], linesSizes[line]);
157 const typename MatType::elem_type idf =
158 InverseDocumentFrequency<typename MatType::elem_type>(
159 output.n_cols, numContainingStrings[value]);
161 output(value - 1, line) = tf * idf;
179 template<
typename ElemType>
180 void Encode(std::vector<std::vector<ElemType>>& output,
185 const ElemType tf = TermFrequency<ElemType>(
186 tokensFrequences[line][value], linesSizes[line]);
188 const ElemType idf = InverseDocumentFrequency<ElemType>(
189 output.size(), numContainingStrings[value]);
191 output[line][value - 1] = tf * idf;
202 void PreprocessToken(
const size_t line,
206 if (line >= tokensFrequences.size())
208 linesSizes.resize(line + 1);
209 tokensFrequences.resize(line + 1);
212 tokensFrequences[line][value]++;
214 if (tokensFrequences[line][value] == 1)
215 numContainingStrings[value]++;
221 const std::vector<std::unordered_map<size_t, size_t>>&
226 return tokensFrequences;
232 return numContainingStrings;
238 return numContainingStrings;
242 const std::vector<size_t>&
LinesSizes()
const {
return linesSizes; }
259 template<
typename Archive>
262 ar(CEREAL_NVP(tfType));
263 ar(CEREAL_NVP(smoothIdf));
276 template<
typename ValueType>
277 ValueType TermFrequency(
const size_t numOccurrences,
278 const size_t numTokens)
282 case TfTypes::BINARY:
283 return numOccurrences > 0;
284 case TfTypes::RAW_COUNT:
285 return numOccurrences;
286 case TfTypes::TERM_FREQUENCY:
287 return static_cast<ValueType
>(numOccurrences) / numTokens;
288 case TfTypes::SUBLINEAR_TF:
289 return std::log(static_cast<ValueType>(numOccurrences)) + 1;
291 Log::Fatal <<
"Incorrect term frequency type!";
305 template<
typename ValueType>
306 ValueType InverseDocumentFrequency(
const size_t totalNumLines,
307 const size_t numOccurrences)
311 return std::log(static_cast<ValueType>(totalNumLines + 1) /
312 (1 + numOccurrences)) + 1.0;
316 return std::log(static_cast<ValueType>(totalNumLines) /
317 numOccurrences) + 1.0;
323 std::vector<std::unordered_map<size_t, size_t>> tokensFrequences;
328 std::unordered_map<size_t, size_t> numContainingStrings;
330 std::vector<size_t> linesSizes;
343 template<
typename TokenType>
TfIdfEncodingPolicy(const TfTypes tfType=TfTypes::RAW_COUNT, const bool smoothIdf=true)
Construct this using the term frequency type and the inverse document frequency type.
Definition: tf_idf_encoding_policy.hpp:75
bool & SmoothIdf()
Modify the idf algorithm type (whether it's smooth or not).
Definition: tf_idf_encoding_policy.hpp:254
const std::vector< size_t > & LinesSizes() const
Return the size (number of tokens) of each line.
Definition: tf_idf_encoding_policy.hpp:242
static MLPACK_EXPORT util::PrefixedOutStream Fatal
Prints fatal messages prefixed with [FATAL], then terminates the program.
Definition: log.hpp:90
Linear algebra utility functions, generally performed on matrices or vectors.
Definition: cv.hpp:1
static void InitMatrix(MatType &output, const size_t datasetSize, const size_t, const size_t dictionarySize)
The function initializes the output matrix.
Definition: tf_idf_encoding_policy.hpp:104
void Encode(std::vector< std::vector< ElemType >> &output, const size_t value, const size_t line, const size_t)
The function performs the TfIdf encoding algorithm, i.e. it writes the tf-idf value of the given token into the output row of the given line.
Definition: tf_idf_encoding_policy.hpp:180
TfTypes
Enum class used to identify the type of the term frequency statistics.
Definition: tf_idf_encoding_policy.hpp:53
The core includes that mlpack expects; standard C++ includes and Armadillo.
bool SmoothIdf() const
Determine the idf algorithm type (whether it's smooth or not).
Definition: tf_idf_encoding_policy.hpp:252
static void InitMatrix(std::vector< std::vector< ElemType >> &output, const size_t datasetSize, const size_t, const size_t dictionarySize)
The function initializes the output matrix.
Definition: tf_idf_encoding_policy.hpp:127
void serialize(Archive &ar, const uint32_t)
Serialize the class to the given archive.
Definition: tf_idf_encoding_policy.hpp:260
void Reset()
Clear the necessary internal variables.
Definition: tf_idf_encoding_policy.hpp:84
void Encode(MatType &output, const size_t value, const size_t line, const size_t)
The function performs the TfIdf encoding algorithm, i.e. it writes the tf-idf value of the given token into the output column of the given line.
Definition: tf_idf_encoding_policy.hpp:148
The class translates a set of strings into numbers using various encoding algorithms.
Definition: string_encoding.hpp:35
This class provides a dictionary interface for the purpose of string encoding.
Definition: string_encoding_dictionary.hpp:32
std::vector< size_t > & LinesSizes()
Modify the size (number of tokens) of each line.
Definition: tf_idf_encoding_policy.hpp:244
TfTypes & TfType()
Modify the term frequency type.
Definition: tf_idf_encoding_policy.hpp:249
Definition of the TfIdfEncodingPolicy class.
Definition: tf_idf_encoding_policy.hpp:35
const std::unordered_map< size_t, size_t > & NumContainingStrings() const
Get, for each token, the number of strings (lines) that contain it.
Definition: tf_idf_encoding_policy.hpp:230
TfTypes TfType() const
Return the term frequency type.
Definition: tf_idf_encoding_policy.hpp:247
const std::vector< std::unordered_map< size_t, size_t > > & TokensFrequences() const
Return token frequencies.
Definition: tf_idf_encoding_policy.hpp:222
std::unordered_map< size_t, size_t > & NumContainingStrings()
Modify, for each token, the number of strings (lines) that contain it.
Definition: tf_idf_encoding_policy.hpp:236
std::vector< std::unordered_map< size_t, size_t > > & TokensFrequences()
Modify token frequencies.
Definition: tf_idf_encoding_policy.hpp:224