63 #ifndef MODULE_ANALYZER_ALGO_TOPICMODELLING_HPP_
64 #define MODULE_ANALYZER_ALGO_TOPICMODELLING_HPP_
66 #include "../Thread.hpp"
68 #include "../../../Data/Corpus.hpp"
69 #include "../../../Data/Data.hpp"
70 #include "../../../Data/TopicModel.hpp"
71 #include "../../../Helper/CommaLocale.hpp"
72 #include "../../../Helper/DotLocale.hpp"
73 #include "../../../Helper/FileSystem.hpp"
74 #include "../../../Helper/Math.hpp"
75 #include "../../../Helper/Memory.hpp"
76 #include "../../../Helper/Queue.hpp"
77 #include "../../../Main/Database.hpp"
78 #include "../../../Struct/StatusSetter.hpp"
79 #include "../../../Struct/TableColumn.hpp"
80 #include "../../../Struct/TextMap.hpp"
81 #include "../../../Struct/ThreadOptions.hpp"
82 #include "../../../Struct/ThreadStatus.hpp"
83 #include "../../../Timer/Simple.hpp"
95 #include <string_view>
96 #include <unordered_map>
97 #include <unordered_set>
103 using std::string_view_literals::operator""sv;
277 using StringString = std::pair<std::string, std::string>;
297 std::string_view
getName()
const override;
325 bool isNumberOfTopicsFixed{
false};
326 std::string topicTable;
343 bool isContinue{
false};
349 bool saveFull{
false};
354 std::size_t labelNumber{};
362 std::uint64_t labelWindowSize{};
371 bool firstTick{
true};
372 bool isTrained{
false};
374 std::size_t iteration{};
377 std::size_t topicTable{};
380 std::unordered_set<std::string> articlesDone;
381 std::vector<std::pair<std::string, std::vector<float>>> results;
384 std::vector<std::pair<std::string, float>>
400 void initTopicTable();
401 void logLoading(
const std::string& name);
402 void logLoad(
const std::string& name,
const std::string& time, std::size_t size);
404 void updateTrainingStatus(
float ll, std::size_t k);
405 void logTrainingTick(
float ll, std::size_t k);
406 void logTrainingTime();
407 void logSaving(
const std::string& name,
bool full);
408 void logSave(
const std::string& name,
const std::string& time, std::size_t size);
410 void classifyQueue(std::queue<std::string>& toClassify,
StatusSetter& statusSetter);
415 const std::string& tableName,
416 const std::pair<std::size_t, std::size_t>& topic
418 [[nodiscard]] std::string getArticleTopDescription(
419 const std::vector<float>& probabilities,
420 const std::vector<std::size_t>& topics
422 [[nodiscard]] std::string getTopicDescription(
427 static void initKnownTopics(
428 std::vector<StringString>& fieldsTo,
429 std::uint16_t numberOfTopics
431 static void initUnknownTopics(std::vector<StringString>& fieldsTo);
432 static void initArticleColumns(std::vector<StringString>& fieldsTo);
434 [[nodiscard]]
static std::string modelFile(
const std::string& name);
436 static void addTopicColumns(
438 const std::string& targetTableName,
439 std::uint16_t numberOfTopics
442 [[nodiscard]]
static std::queue<std::string> getArticlesToClassify(
443 const TextMap& articleMap,
444 std::unordered_set<std::string>& done
447 const std::string& tableName,
448 std::size_t numberOfColumns,
449 const std::pair<std::string, std::vector<float>>& articleClassification,
450 const std::string& top
453 static void getNArticlesFromQueue(
455 std::queue<std::string>& from,
457 std::vector<std::string>& namesTo,
458 std::vector<std::vector<std::string>>& tokensTo
460 static void topicsToResults(
462 const std::vector<std::string>& names,
463 const std::vector<std::vector<float>>& topics,
464 std::vector<std::pair<std::string, std::vector<float>>>& to
constexpr auto topicModellingTargetColumns
The number of additional columns in the target table.
Definition: TopicModelling.hpp:213
Topic Modeller.
Definition: TopicModelling.hpp:263
void checkAlgoOptions() override
Checks the configuration options for the algorithm.
Definition: TopicModelling.cpp:282
constexpr auto topicModellingDefaultMinLabelDf
The default minimum document frequency of a topic label.
Definition: TopicModelling.hpp:186
constexpr auto topicModellingDefaultNumberOfWorkers
The default number of worker threads for inferring the topics of articles.
Definition: TopicModelling.hpp:180
std::string_view getName() const override
Returns the name of the algorithm.
Definition: TopicModelling.cpp:106
Namespace for algorithm classes.
Definition: All.cpp:52
constexpr auto topicModellingDefaultIterations
The default number of iterations to train the model.
Definition: TopicModelling.hpp:139
TopicModelling(Main::Database &dbBase, const ThreadOptions &threadOptions, const ThreadStatus &threadStatus)
Continues a previously interrupted algorithm run.
Definition: TopicModelling.cpp:75
constexpr auto topicModellingDefaultNumberOfTopicTokens
The default number of most-probable tokens for each detected topic.
Definition: TopicModelling.hpp:129
constexpr auto topicModellingPrecisionLL
The number of digits of the log-likelihood to be logged.
Definition: TopicModelling.hpp:210
Abstract class providing thread functionality to algorithm (child) classes.
Definition: Thread.hpp:84
Thread status containing its ID, status message, pause state, and progress.
Definition: ThreadStatus.hpp:54
void resetAlgo() override
Resets the algorithm.
Definition: TopicModelling.cpp:306
constexpr auto topicModellingDefaultMinCf
The default minimum frequency of a token in the corpus.
Definition: TopicModelling.hpp:145
void onAlgoTick() override
Performs a number of training iterations, if necessary.
Definition: TopicModelling.cpp:191
void onAlgoInit() override
Initializes the algorithm and processes its input.
Definition: TopicModelling.cpp:151
constexpr auto topicModellingDefaultRemoveTopN
The default number of most-common tokens to ignore.
Definition: TopicModelling.hpp:154
Text map entry.
Definition: TextMap.hpp:49
Thread options containing the name of the module run, as well as the IDs of the website, URL list, and configuration used.
Definition: ThreadOptions.hpp:40
constexpr auto topicModellingDefaultGamma
The default initial concentration coefficient of the Dirichlet Process for table–topic.
Definition: TopicModelling.hpp:174
constexpr auto topicModellingUpdateProgressEvery
The number of added/saved articles after which the progress will be updated.
Definition: TopicModelling.hpp:204
constexpr auto topicModellingDefaultMinLabelLength
The default minimum length of topic labels, in tokens.
Definition: TopicModelling.hpp:189
Class representing a text corpus.
Definition: Corpus.hpp:165
constexpr auto topicModellingColumnsPerLabel
The number of columns per top label.
Definition: TopicModelling.hpp:219
Class handling database access for the command-and-control and its threads.
Definition: Database.hpp:366
Class for analyzer exceptions to be used by algorithms.
Definition: Thread.hpp:242
constexpr auto topicModellingDefaultMaxLabelCandidates
The default maximum number of topic label candidates to be extracted from the training data...
Definition: TopicModelling.hpp:195
Structure for inserting multiple values of different types into a row.
Definition: Data.hpp:360
Structure for table columns containing its name, type, reference, and indexing.
Definition: TableColumn.hpp:39
constexpr auto topicModellingDefaultEta
The default initial hyperparameter for the Dirichlet distribution for topic–token.
Definition: TopicModelling.hpp:166
constexpr auto topicModellingDefaultConversionThreshold
The default threshold for topics to be included when converting a HDP to a LDA model.
Definition: TopicModelling.hpp:163
constexpr auto topicModellingDefaultMaxLabelLength
The default maximum length of topic labels, in tokens.
Definition: TopicModelling.hpp:192
constexpr auto topicModellingUpdateProgressEveryDocs
The number of classified documents after which the progress will be updated.
Definition: TopicModelling.hpp:207
Structure containing all the data needed to keep the status of a thread updated.
Definition: StatusSetter.hpp:57
std::vector< TextMapEntry > TextMap
A text map is defined as a vector of text map entries.
Definition: TextMap.hpp:280
A simple timer.
Definition: Simple.hpp:53
constexpr auto topicModellingDefaultOptimizeEvery
The default optimization interval for the model parameters, in training iterations.
Definition: TopicModelling.hpp:151
constexpr auto topicModellingDefaultNumberOfThreads
The default number of threads for training the model.
Definition: TopicModelling.hpp:157
Class providing database functionality for analyzer threads by implementing Wrapper::Database.
Definition: Database.hpp:188
constexpr auto topicModellingDefaultMinLabelCf
The default minimum frequency of a topic label in the corpus.
Definition: TopicModelling.hpp:183
void onAlgoInitTarget() override
Initializes the target table for the algorithm.
Definition: TopicModelling.cpp:120
constexpr auto topicModellingDefaultBurnIn
The default number of burn-in iterations.
Definition: TopicModelling.hpp:136
constexpr auto topicModellingDefaultIterationsAtOnce
The default number of iterations to train the model at once.
Definition: TopicModelling.hpp:142
constexpr auto topicModellingDefaultNumberOfTopics
The default number of initial topics.
Definition: TopicModelling.hpp:121
constexpr auto topicModellingTopicColumns
The number of additional columns in the topic table.
Definition: TopicModelling.hpp:216
constexpr auto topicModellingDefaultLabelMu
The default discriminative coefficient for the automated detection of topic labels.
Definition: TopicModelling.hpp:201
constexpr auto topicModellingDefaultAlpha
The default initial hyperparameter for the Dirichlet distribution for document–table.
Definition: TopicModelling.hpp:160
void onAlgoUnpause() override
Does nothing.
Definition: TopicModelling.cpp:221
Topic modeller.
Definition: TopicModel.hpp:257
constexpr auto topicModellingDefaultLabelSmoothing
The default Laplace smoothing for the automated detection of topic labels.
Definition: TopicModelling.hpp:198
constexpr auto topicModellingDirectory
The directory for model files.
Definition: TopicModelling.hpp:113
constexpr auto topicModellingPrecisionUlp
Precision used when testing topic probabilities for equality, in ULPs (units in the last place)...
Definition: TopicModelling.hpp:225
constexpr auto topicModellingDefaultDocIterations
The default maximum number of iterations to classify a document.
Definition: TopicModelling.hpp:177
void parseAlgoOption() override
Parses a configuration option for the algorithm.
Definition: TopicModelling.cpp:231
constexpr auto topicModellingDefaultMinDf
The default minimum document frequency of a token.
Definition: TopicModelling.hpp:148
void onAlgoClear() override
Does nothing.
Definition: TopicModelling.cpp:224
void onAlgoPause() override
Does nothing.
Definition: TopicModelling.cpp:218
constexpr auto topicModellingColumnsPerToken
The number of columns per top token.
Definition: TopicModelling.hpp:222