crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
TopicModelling.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2022 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * TopicModelling.hpp
24  *
25  * Topic modelling using the Hierarchical Dirichlet Process (HDP) and
26  * Latent Dirichlet Allocation (LDA) algorithms.
27  *
28  * The former will be used if no fixed number of topics is given,
29  * the latter will be used if a fixed number of topics is given.
30  *
31  * Using tomoto, the underlying C++ API of tomotopy, see:
32  * https://bab2min.github.io/tomotopy/
33  *
34  * If you use the HDP topic modelling algorithm, please cite:
35  *
36  * Teh, Y. W., Jordan, M. I., Beal, M. J., & Blei, D. M. (2005). Sharing
37  * clusters among related groups: Hierarchical Dirichlet processes.
38  * In Advances in neural information processing systems, 1385–1392.
39  *
40  * Newman, D., Asuncion, A., Smyth, P., & Welling, M. (2009). Distributed
41  * algorithms for topic models. Journal of Machine Learning Research,
42  * 10 (Aug), 1801–1828.
43  *
44  * If you use the LDA topic modelling algorithm, please cite:
45  *
46  * Blei, D.M., Ng, A.Y., & Jordan, M.I. (2003). Latent Dirichlet
47  * allocation. Journal of Machine Learning Research, 3(Jan), 993–1022.
48  *
49  * Newman, D., Asuncion, A., Smyth, P., & Welling, M. (2009). Distributed
50  * algorithms for topic models. Journal of Machine Learning Research,
51  * 10 (Aug), 1801–1828.
52  *
53  * If you use automated topic labeling, please cite:
54  *
55  * Mei, Q., Shen, X., & Zhai, C. (2007). Automatic labeling of multinomial
56  * topic models. In Proceedings of the 13th ACM SIGKDD International
57  * Conference on Knowledge Discovery and Data Mining, 490–499.
58  *
59  * Created on: Feb 5, 2021
60  * Author: ans
61  */
62 
63 #ifndef MODULE_ANALYZER_ALGO_TOPICMODELLING_HPP_
64 #define MODULE_ANALYZER_ALGO_TOPICMODELLING_HPP_
65 
66 #include "../Thread.hpp"
67 
68 #include "../../../Data/Corpus.hpp"
69 #include "../../../Data/Data.hpp"
70 #include "../../../Data/TopicModel.hpp"
71 #include "../../../Helper/CommaLocale.hpp"
72 #include "../../../Helper/DotLocale.hpp"
73 #include "../../../Helper/FileSystem.hpp"
74 #include "../../../Helper/Math.hpp"
75 #include "../../../Helper/Memory.hpp"
76 #include "../../../Helper/Queue.hpp"
77 #include "../../../Main/Database.hpp"
78 #include "../../../Struct/StatusSetter.hpp"
79 #include "../../../Struct/TableColumn.hpp"
80 #include "../../../Struct/TextMap.hpp"
81 #include "../../../Struct/ThreadOptions.hpp"
82 #include "../../../Struct/ThreadStatus.hpp"
83 #include "../../../Timer/Simple.hpp"
84 
85 #include <algorithm> // std::count_if, std::max_element, std::min, std::sort
86 #include <cstddef> // std::size_t
87 #include <cstdint> // std::uint8_t, std::uint16_t, std::uint64_t
88 #include <iomanip> // std::setprecision
89 #include <ios> // std::fixed
90 #include <limits> // std::numeric_limits
91 #include <queue> // std::queue
92 #include <random> // std::random_device
93 #include <sstream> // std::ostringstream
94 #include <string> // std::string, std::to_string
95 #include <string_view> // std::string_view, std::string_view_literals
96 #include <unordered_map> // std::unordered_map
97 #include <unordered_set> // std::unordered_set
98 #include <utility> // std::pair
99 #include <vector> // std::vector
100 
102 
103  using std::string_view_literals::operator""sv;
104 
105  /*
106  * CONSTANTS
107  */
108 
111 
113  inline constexpr auto topicModellingDirectory{"mdl"sv}; // The directory for model files.
114 
116 
121  inline constexpr auto topicModellingDefaultNumberOfTopics{2}; // The default number of initial topics.
122 
124 
130 // NOTE(review): topicModellingDefaultNumberOfTopicTokens (used by Entries::numberOfTopicTokens) has no visible definition in this listing - presumably it belongs here; confirm.
132 
136  inline constexpr auto topicModellingDefaultBurnIn{100}; // The default number of burn-in iterations.
137 
139  inline constexpr auto topicModellingDefaultIterations{1000}; // The default number of iterations to train the model.
140 
142  inline constexpr auto topicModellingDefaultIterationsAtOnce{25}; // The default number of iterations to train the model at once.
143 
145  inline constexpr auto topicModellingDefaultMinCf{1}; // The default number of a token's minimum frequency in the corpus.
146 
148  inline constexpr auto topicModellingDefaultMinDf{1}; // The default number of a token's minimum document frequency.
149 
151  inline constexpr auto topicModellingDefaultOptimizeEvery{10}; // The default optimization interval for the model parameters, in training iterations.
152 
154  inline constexpr auto topicModellingDefaultRemoveTopN{0}; // The default number of most-common tokens to ignore.
155 
157  inline constexpr auto topicModellingDefaultNumberOfThreads{1}; // The default number of threads for training the model.
158 
160  inline constexpr auto topicModellingDefaultAlpha{0.1F}; // The default initial hyperparameter for the Dirichlet distribution for document-table.
161 
163  inline constexpr auto topicModellingDefaultConversionThreshold{0.F}; // The default threshold for topics to be included when converting a HDP to a LDA model.
164 
166  inline constexpr auto topicModellingDefaultEta{0.01F}; // The default initial hyperparameter for the Dirichlet distribution for topic-token.
167 
169 
174  inline constexpr auto topicModellingDefaultGamma{0.1F}; // The default initial concentration coefficient of the Dirichlet Process for table-topic.
175 
177  inline constexpr auto topicModellingDefaultDocIterations{100}; // The default number of maximum iterations to classify a document.
178 
180  inline constexpr auto topicModellingDefaultNumberOfWorkers{0}; // The default number of worker threads for inferring the topics of articles.
181 
183  inline constexpr auto topicModellingDefaultMinLabelCf{1}; // The default number of a topic label's minimum frequency in the corpus.
184 
186  inline constexpr auto topicModellingDefaultMinLabelDf{1}; // The default number of a topic label's minimum document frequency.
187 
189  inline constexpr auto topicModellingDefaultMinLabelLength{2}; // The default minimum length of topic labels, in tokens.
190 
192  inline constexpr auto topicModellingDefaultMaxLabelLength{5}; // The default maximum length of topic labels, in tokens.
193 
195  inline constexpr auto topicModellingDefaultMaxLabelCandidates{10000}; // The default maximum number of topic label candidates to be extracted from the training data.
196 
198  inline constexpr auto topicModellingDefaultLabelSmoothing{.1F}; // The default Laplace smoothing for the automated detection of topic labels.
199 
201  inline constexpr auto topicModellingDefaultLabelMu{.25F}; // The default discriminative coefficient for the automated detection of topic labels.
202 
204  inline constexpr auto topicModellingUpdateProgressEvery{1000}; // The number of added/saved articles after which the progress will be updated.
205 
207  inline constexpr auto topicModellingUpdateProgressEveryDocs{25}; // The number of classified documents after which the progress will be updated.
208 
210  inline constexpr auto topicModellingPrecisionLL{6}; // The number of digits of the log-likelihood to be logged.
211 
213  inline constexpr auto topicModellingTargetColumns{2}; // The number of additional columns in the target table.
214 
216  inline constexpr auto topicModellingTopicColumns{2}; // The number of additional columns in the topic table.
217 
219  inline constexpr auto topicModellingColumnsPerLabel{2}; // The number of columns per top label.
220 
222  inline constexpr auto topicModellingColumnsPerToken{2}; // The number of columns per top token.
223 
225  inline constexpr auto topicModellingPrecisionUlp{5}; // Precision used when testing topic probabilities for equality, in ULPs (units in the last place).
226 
228 
229  /*
230  * DECLARATION
231  */
232 
234 
264  // for convenience
266 
267  using Corpus = Data::Corpus;
269 
272  using TextMap = Struct::TextMap;
276 
277  using StringString = std::pair<std::string, std::string>;
278 
279  public:
282 
284  Main::Database& dbBase,
285  const ThreadOptions& threadOptions,
286  const ThreadStatus& threadStatus
287  );
289  Main::Database& dbBase,
290  const ThreadOptions& threadOptions
291  );
292 
296 
297  std::string_view getName() const override; // Returns the name of the algorithm.
298 
302 
303  void onAlgoInitTarget() override; // Initializes the target table for the algorithm.
304  void onAlgoInit() override; // Initializes the algorithm and processes its input.
305  void onAlgoTick() override; // Performs a number of training iterations, if necessary.
306  void onAlgoPause() override; // Does nothing.
307  void onAlgoUnpause() override; // Does nothing.
308  void onAlgoClear() override; // Does nothing.
309 
313 
314  void parseAlgoOption() override; // Parses a configuration option for the algorithm.
315  void checkAlgoOptions() override; // Checks the configuration options for the algorithm.
316  void resetAlgo() override; // Resets the algorithm.
317 
319 
320  private:
321  // algorithm options
322 struct Entries { // Configuration values of the algorithm, grouped by purpose; each default comes from a topicModellingDefault* constant where one exists.
323 // general
324 std::uint16_t initialNumberOfTopics{topicModellingDefaultNumberOfTopics}; // number of initial topics
325 bool isNumberOfTopicsFixed{false}; // NOTE(review): presumably selects LDA (fixed number of topics) over HDP, as described in the file header - confirm in TopicModelling.cpp
326 std::string topicTable; // NOTE(review): presumably the name of the topic table - confirm
327 std::uint16_t numberOfTopicTokens{topicModellingDefaultNumberOfTopicTokens}; // number of most-probable tokens for each detected topic
328 
329 // training
330 std::uint64_t burnIn{topicModellingDefaultBurnIn}; // number of burn-in iterations
331 bool idf{false}; // NOTE(review): presumably toggles IDF term weighting - confirm
332 std::uint16_t iterations{topicModellingDefaultIterations}; // number of iterations to train the model
333 std::uint16_t iterationsAtOnce{topicModellingDefaultIterationsAtOnce}; // number of iterations to train the model at once
334 std::uint16_t minCf{topicModellingDefaultMinCf}; // a token's minimum frequency in the corpus
335 std::uint16_t minDf{topicModellingDefaultMinDf}; // a token's minimum document frequency
336 std::uint16_t optimizeEvery{topicModellingDefaultOptimizeEvery}; // optimization interval for the model parameters, in training iterations
337 std::size_t removeTopN{topicModellingDefaultRemoveTopN}; // number of most-common tokens to ignore
338 std::uint16_t threads{topicModellingDefaultNumberOfThreads}; // number of threads for training the model
339 
340 // model
341 float alpha{topicModellingDefaultAlpha}; // initial hyperparameter for the Dirichlet distribution for document-table
342 float conversionThreshold{topicModellingDefaultConversionThreshold}; // threshold for topics to be included when converting a HDP to a LDA model
343 bool isContinue{false}; // NOTE(review): meaning not visible in this header - presumably continues training a loaded model; confirm
344 float eta{topicModellingDefaultEta}; // initial hyperparameter for the Dirichlet distribution for topic-token
345 float gamma{topicModellingDefaultGamma}; // initial concentration coefficient of the Dirichlet Process for table-topic
346 std::uint16_t docIterations{topicModellingDefaultDocIterations}; // maximum number of iterations to classify a document
347 std::string load; // NOTE(review): presumably the name of a model to load - confirm
348 std::string save; // NOTE(review): presumably the name under which the model is saved - confirm
349 bool saveFull{false}; // NOTE(review): presumably requests a full model save (cf. logSaving(name, full)) - confirm
350 std::size_t seed{}; // NOTE(review): presumably the random seed; value-initialized to zero here - confirm how zero is treated
351 std::uint16_t workers{topicModellingDefaultNumberOfWorkers}; // number of worker threads for inferring the topics of articles
352 
353 // labeling
354 std::size_t labelNumber{}; // NOTE(review): presumably the number of labels per topic, zero by default - confirm
355 std::uint16_t labelMinCf{topicModellingDefaultMinLabelCf}; // a topic label's minimum frequency in the corpus
356 std::uint16_t labelMinDf{topicModellingDefaultMinLabelDf}; // a topic label's minimum document frequency
357 std::uint8_t labelMinLength{topicModellingDefaultMinLabelLength}; // minimum length of topic labels, in tokens
358 std::uint8_t labelMaxLength{topicModellingDefaultMaxLabelLength}; // maximum length of topic labels, in tokens
359 std::uint64_t labelMaxCandidates{topicModellingDefaultMaxLabelCandidates}; // maximum number of topic label candidates to be extracted from the training data
360 float labelSmoothing{topicModellingDefaultLabelSmoothing}; // Laplace smoothing for the automated detection of topic labels
361 float labelMu{topicModellingDefaultLabelMu}; // discriminative coefficient for the automated detection of topic labels
362 std::uint64_t labelWindowSize{}; // NOTE(review): meaning not visible in this header; value-initialized to zero - confirm
363 } algoConfig;
364 
365  // topic model
366  TopicModel model;
367 
368  // algorithm state
369  Timer::Simple timer;
370 
371  bool firstTick{true};
372  bool isTrained{false};
373 
374  std::size_t iteration{};
375 
376  // second target table
377  std::size_t topicTable{};
378 
379  // results
380  std::unordered_set<std::string> articlesDone;
381  std::vector<std::pair<std::string, std::vector<float>>> results;
382  std::unordered_map<
383  std::size_t,
384  std::vector<std::pair<std::string, float>>
385  > labels;
386 
387  // algorithm functions
388  void initModel();
389  void getCorpus(StatusSetter& statusSetter);
390  void loadModel(StatusSetter& statusSetter);
391  void addArticles(StatusSetter& statusSetter);
392  void startTraining(StatusSetter& statusSetter);
393  void trainModel();
394  void saveModel(StatusSetter& statusSetter);
395  void classifyArticles(StatusSetter& statusSetter);
396  void labelTopics(StatusSetter& statusSetter);
397  void saveData(StatusSetter& statusSetter);
398 
399  // internal helper functions
400  void initTopicTable();
401  void logLoading(const std::string& name);
402  void logLoad(const std::string& name, const std::string& time, std::size_t size);
403  void logModelInfo();
404  void updateTrainingStatus(float ll, std::size_t k);
405  void logTrainingTick(float ll, std::size_t k);
406  void logTrainingTime();
407  void logSaving(const std::string& name, bool full);
408  void logSave(const std::string& name, const std::string& time, std::size_t size);
409  void finishUp();
410  void classifyQueue(std::queue<std::string>& toClassify, StatusSetter& statusSetter);
411  void saveArticleData(StatusSetter& statusSetter);
412  void saveTopicData(StatusSetter& statusSetter);
413 
414  [[nodiscard]] Data::InsertFieldsMixed getTopicData(
415  const std::string& tableName,
416  const std::pair<std::size_t, std::size_t>& topic
417  ) const;
418  [[nodiscard]] std::string getArticleTopDescription(
419  const std::vector<float>& probabilities,
420  const std::vector<std::size_t>& topics
421  ) const;
422  [[nodiscard]] std::string getTopicDescription(
423  std::size_t topicId
424  ) const;
425 
426  // static internal helper functions
427  static void initKnownTopics(
428  std::vector<StringString>& fieldsTo,
429  std::uint16_t numberOfTopics
430  );
431  static void initUnknownTopics(std::vector<StringString>& fieldsTo);
432  static void initArticleColumns(std::vector<StringString>& fieldsTo);
433 
434  [[nodiscard]] static std::string modelFile(const std::string& name);
435 
436  static void addTopicColumns(
437  Database& db,
438  const std::string& targetTableName,
439  std::uint16_t numberOfTopics
440  );
441 
442  [[nodiscard]] static std::queue<std::string> getArticlesToClassify(
443  const TextMap& articleMap,
444  std::unordered_set<std::string>& done
445  );
446  [[nodiscard]] static Data::InsertFieldsMixed getArticleData(
447  const std::string& tableName,
448  std::size_t numberOfColumns,
449  const std::pair<std::string, std::vector<float>>& articleClassification,
450  const std::string& top
451  );
452 
453  static void getNArticlesFromQueue(
454  std::size_t n,
455  std::queue<std::string>& from,
456  const Corpus& corpus,
457  std::vector<std::string>& namesTo,
458  std::vector<std::vector<std::string>>& tokensTo
459  );
460  static void topicsToResults(
461  std::size_t n,
462  const std::vector<std::string>& names,
463  const std::vector<std::vector<float>>& topics,
464  std::vector<std::pair<std::string, std::vector<float>>>& to
465  );
466  };
467 
468 } /* namespace crawlservpp::Module::Analyzer::Algo */
469 
470 #endif /* MODULE_ANALYZER_ALGO_TOPICMODELLING_HPP_ */
constexpr auto topicModellingTargetColumns
The number of additional columns in the target table.
Definition: TopicModelling.hpp:213
Topic Modeller.
Definition: TopicModelling.hpp:263
void checkAlgoOptions() override
Checks the configuration options for the algorithm.
Definition: TopicModelling.cpp:282
constexpr auto topicModellingDefaultMinLabelDf
The default number of a topic label's minimum document frequency.
Definition: TopicModelling.hpp:186
constexpr auto topicModellingDefaultNumberOfWorkers
The default number of worker threads for inferring the topics of articles.
Definition: TopicModelling.hpp:180
std::string_view getName() const override
Returns the name of the algorithm.
Definition: TopicModelling.cpp:106
Namespace for algorithm classes.
Definition: All.cpp:52
constexpr auto topicModellingDefaultIterations
The default number of iterations to train the model.
Definition: TopicModelling.hpp:139
TopicModelling(Main::Database &dbBase, const ThreadOptions &threadOptions, const ThreadStatus &threadStatus)
Continues a previously interrupted algorithm run.
Definition: TopicModelling.cpp:75
constexpr auto topicModellingDefaultNumberOfTopicTokens
The default number of most-probable tokens for each detected topic.
Definition: TopicModelling.hpp:129
constexpr auto topicModellingPrecisionLL
The number of digits of the log-likelihood to be logged.
Definition: TopicModelling.hpp:210
Abstract class providing thread functionality to algorithm (child) classes.
Definition: Thread.hpp:84
Thread status containing its ID, status message, pause state, and progress.
Definition: ThreadStatus.hpp:54
void resetAlgo() override
Resets the algorithm.
Definition: TopicModelling.cpp:306
constexpr auto topicModellingDefaultMinCf
The default number of a token's minimum frequency in the corpus.
Definition: TopicModelling.hpp:145
void onAlgoTick() override
Performs a number of training iterations, if necessary.
Definition: TopicModelling.cpp:191
void onAlgoInit() override
Initializes the algorithm and processes its input.
Definition: TopicModelling.cpp:151
constexpr auto topicModellingDefaultRemoveTopN
The default number of most-common tokens to ignore.
Definition: TopicModelling.hpp:154
Text map entry.
Definition: TextMap.hpp:49
Thread options containing the name of the module run, as well as the IDs of the website, URL list, and configuration used.
Definition: ThreadOptions.hpp:40
constexpr auto topicModellingDefaultGamma
The default initial concentration coefficient of the Dirichlet Process for table–topic.
Definition: TopicModelling.hpp:174
constexpr auto topicModellingUpdateProgressEvery
The number of added/saved articles after which the progress will be updated.
Definition: TopicModelling.hpp:204
constexpr auto topicModellingDefaultMinLabelLength
The default minimum length of topic labels, in tokens.
Definition: TopicModelling.hpp:189
Class representing a text corpus.
Definition: Corpus.hpp:165
constexpr auto topicModellingColumnsPerLabel
The number of columns per top label.
Definition: TopicModelling.hpp:219
Class handling database access for the command-and-control and its threads.
Definition: Database.hpp:366
Class for analyzer exceptions to be used by algorithms.
Definition: Thread.hpp:242
constexpr auto topicModellingDefaultMaxLabelCandidates
The default maximum number of topic label candidates to be extracted from the training data...
Definition: TopicModelling.hpp:195
Structure for inserting multiple values of different types into a row.
Definition: Data.hpp:360
Structure for table columns containing its name, type, reference, and indexing.
Definition: TableColumn.hpp:39
constexpr auto topicModellingDefaultEta
The default initial hyperparameter for the Dirichlet distribution for topic–token.
Definition: TopicModelling.hpp:166
constexpr auto topicModellingDefaultConversionThreshold
The default threshold for topics to be included when converting a HDP to a LDA model.
Definition: TopicModelling.hpp:163
constexpr auto topicModellingDefaultMaxLabelLength
The default maximum length of topic labels, in tokens.
Definition: TopicModelling.hpp:192
constexpr auto topicModellingUpdateProgressEveryDocs
The number of classified documents after which the progress will be updated.
Definition: TopicModelling.hpp:207
Structure containing all the data needed to keep the status of a thread updated.
Definition: StatusSetter.hpp:57
std::vector< TextMapEntry > TextMap
A text map is defined as a vector of text map entries.
Definition: TextMap.hpp:280
A simple timer.
Definition: Simple.hpp:53
constexpr auto topicModellingDefaultOptimizeEvery
The default optimization interval for the model parameters, in training iterations.
Definition: TopicModelling.hpp:151
constexpr auto topicModellingDefaultNumberOfThreads
The default number of threads for training the model.
Definition: TopicModelling.hpp:157
Class providing database functionality for analyzer threads by implementing Wrapper::Database.
Definition: Database.hpp:188
constexpr auto topicModellingDefaultMinLabelCf
The default number of a topic label's minimum frequency in the corpus.
Definition: TopicModelling.hpp:183
void onAlgoInitTarget() override
Initializes the target table for the algorithm.
Definition: TopicModelling.cpp:120
constexpr auto topicModellingDefaultBurnIn
The default number of burn-in iterations.
Definition: TopicModelling.hpp:136
constexpr auto topicModellingDefaultIterationsAtOnce
The default number of iterations to train the model at once.
Definition: TopicModelling.hpp:142
constexpr auto topicModellingDefaultNumberOfTopics
The default number of initial topics.
Definition: TopicModelling.hpp:121
constexpr auto topicModellingTopicColumns
The number of additional columns in the topic table.
Definition: TopicModelling.hpp:216
constexpr auto topicModellingDefaultLabelMu
The default discriminative coefficient for the automated detection of topic labels.
Definition: TopicModelling.hpp:201
constexpr auto topicModellingDefaultAlpha
The default initial hyperparameter for the Dirichlet distribution for document–table.
Definition: TopicModelling.hpp:160
void onAlgoUnpause() override
Does nothing.
Definition: TopicModelling.cpp:221
Topic modeller.
Definition: TopicModel.hpp:257
constexpr auto topicModellingDefaultLabelSmoothing
The default Laplace smoothing for the automated detection of topic labels.
Definition: TopicModelling.hpp:198
constexpr auto topicModellingDirectory
The directory for model files.
Definition: TopicModelling.hpp:113
constexpr auto topicModellingPrecisionUlp
Precision used when testing topic probabilities for equality, in ULPs (units in the last place)...
Definition: TopicModelling.hpp:225
constexpr auto topicModellingDefaultDocIterations
The default number of maximum iterations to classify a document.
Definition: TopicModelling.hpp:177
void parseAlgoOption() override
Parses a configuration option for the algorithm.
Definition: TopicModelling.cpp:231
constexpr auto topicModellingDefaultMinDf
The default number of a token's minimum document frequency.
Definition: TopicModelling.hpp:148
void onAlgoClear() override
Does nothing.
Definition: TopicModelling.cpp:224
void onAlgoPause() override
Does nothing.
Definition: TopicModelling.cpp:218
constexpr auto topicModellingColumnsPerToken
The number of columns per top token.
Definition: TopicModelling.hpp:222