crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
TopicModelling.hpp File Reference
#include "../Thread.hpp"
#include "../../../Data/Corpus.hpp"
#include "../../../Data/Data.hpp"
#include "../../../Data/TopicModel.hpp"
#include "../../../Helper/CommaLocale.hpp"
#include "../../../Helper/DotLocale.hpp"
#include "../../../Helper/FileSystem.hpp"
#include "../../../Helper/Math.hpp"
#include "../../../Helper/Memory.hpp"
#include "../../../Helper/Queue.hpp"
#include "../../../Main/Database.hpp"
#include "../../../Struct/StatusSetter.hpp"
#include "../../../Struct/TableColumn.hpp"
#include "../../../Struct/TextMap.hpp"
#include "../../../Struct/ThreadOptions.hpp"
#include "../../../Struct/ThreadStatus.hpp"
#include "../../../Timer/Simple.hpp"
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iomanip>
#include <ios>
#include <limits>
#include <queue>
#include <random>
#include <sstream>
#include <string>
#include <string_view>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
Include dependency graph for TopicModelling.hpp:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Classes

class  crawlservpp::Module::Analyzer::Algo::TopicModelling
 Topic Modeller. More...
 

Namespaces

 crawlservpp::Module::Analyzer::Algo
 Namespace for algorithm classes.
 

Constants

constexpr auto crawlservpp::Module::Analyzer::Algo::topicModellingDirectory {"mdl"sv}
 The directory for model files. More...
 
constexpr auto crawlservpp::Module::Analyzer::Algo::topicModellingDefaultNumberOfTopics {2}
 The default number of initial topics. More...
 
constexpr auto crawlservpp::Module::Analyzer::Algo::topicModellingDefaultNumberOfTopicTokens {5}
 The default number of most-probable tokens for each detected topic. More...
 
constexpr auto crawlservpp::Module::Analyzer::Algo::topicModellingDefaultBurnIn {100}
 The default number of burn-in iterations. More...
 
constexpr auto crawlservpp::Module::Analyzer::Algo::topicModellingDefaultIterations {1000}
 The default number of iterations to train the model. More...
 
constexpr auto crawlservpp::Module::Analyzer::Algo::topicModellingDefaultIterationsAtOnce {25}
 The default number of iterations to train the model at once. More...
 
constexpr auto crawlservpp::Module::Analyzer::Algo::topicModellingDefaultMinCf {1}
 The default minimum frequency of a token in the corpus. More...
 
constexpr auto crawlservpp::Module::Analyzer::Algo::topicModellingDefaultMinDf {1}
 The default minimum document frequency of a token. More...
 
constexpr auto crawlservpp::Module::Analyzer::Algo::topicModellingDefaultOptimizeEvery {10}
 The default optimization interval for the model parameters, in training iterations. More...
 
constexpr auto crawlservpp::Module::Analyzer::Algo::topicModellingDefaultRemoveTopN {0}
 The default number of most-common tokens to ignore. More...
 
constexpr auto crawlservpp::Module::Analyzer::Algo::topicModellingDefaultNumberOfThreads {1}
 The default number of threads for training the model. More...
 
constexpr auto crawlservpp::Module::Analyzer::Algo::topicModellingDefaultAlpha {0.1F}
 The default initial hyperparameter for the Dirichlet distribution for document–table. More...
 
constexpr auto crawlservpp::Module::Analyzer::Algo::topicModellingDefaultConversionThreshold {0.F}
 The default threshold for topics to be included when converting a HDP to a LDA model. More...
 
constexpr auto crawlservpp::Module::Analyzer::Algo::topicModellingDefaultEta {0.01F}
 The default initial hyperparameter for the Dirichlet distribution for topic–token. More...
 
constexpr auto crawlservpp::Module::Analyzer::Algo::topicModellingDefaultGamma {0.1F}
 The default initial concentration coefficient of the Dirichlet Process for table–topic. More...
 
constexpr auto crawlservpp::Module::Analyzer::Algo::topicModellingDefaultDocIterations {100}
 The default number of maximum iterations to classify a document. More...
 
constexpr auto crawlservpp::Module::Analyzer::Algo::topicModellingDefaultNumberOfWorkers {0}
 The default number of worker threads for inferring the topics of articles. More...
 
constexpr auto crawlservpp::Module::Analyzer::Algo::topicModellingDefaultMinLabelCf {1}
 The default minimum frequency of a topic label in the corpus. More...
 
constexpr auto crawlservpp::Module::Analyzer::Algo::topicModellingDefaultMinLabelDf {1}
 The default minimum document frequency of a topic label. More...
 
constexpr auto crawlservpp::Module::Analyzer::Algo::topicModellingDefaultMinLabelLength {2}
 The default minimum length of topic labels, in tokens. More...
 
constexpr auto crawlservpp::Module::Analyzer::Algo::topicModellingDefaultMaxLabelLength {5}
 The default maximum length of topic labels, in tokens. More...
 
constexpr auto crawlservpp::Module::Analyzer::Algo::topicModellingDefaultMaxLabelCandidates {10000}
 The default maximum number of topic label candidates to be extracted from the training data. More...
 
constexpr auto crawlservpp::Module::Analyzer::Algo::topicModellingDefaultLabelSmoothing {.1F}
 The default Laplace smoothing for the automated detection of topic labels. More...
 
constexpr auto crawlservpp::Module::Analyzer::Algo::topicModellingDefaultLabelMu {.25F}
 The default discriminative coefficient for the automated detection of topic labels. More...
 
constexpr auto crawlservpp::Module::Analyzer::Algo::topicModellingUpdateProgressEvery {1000}
 The number of added/saved articles after which the progress will be updated. More...
 
constexpr auto crawlservpp::Module::Analyzer::Algo::topicModellingUpdateProgressEveryDocs {25}
 The number of classified documents after which the progress will be updated. More...
 
constexpr auto crawlservpp::Module::Analyzer::Algo::topicModellingPrecisionLL {6}
 The number of digits of the log-likelihood to be logged. More...
 
constexpr auto crawlservpp::Module::Analyzer::Algo::topicModellingTargetColumns {2}
 The number of additional columns in the target table. More...
 
constexpr auto crawlservpp::Module::Analyzer::Algo::topicModellingTopicColumns {2}
 The number of additional columns in the topic table. More...
 
constexpr auto crawlservpp::Module::Analyzer::Algo::topicModellingColumnsPerLabel {2}
 The number of columns per top label. More...
 
constexpr auto crawlservpp::Module::Analyzer::Algo::topicModellingColumnsPerToken {2}
 The number of columns per top token. More...
 
constexpr auto crawlservpp::Module::Analyzer::Algo::topicModellingPrecisionUlp {5}
 Precision used when testing topic probabilities for equality, in ULPs (units in the last place). More...