|
crawlserv++
[under development]
Application for crawling and analyzing textual content of websites.
|
#include "../Thread.hpp"
#include "../../../Data/Corpus.hpp"
#include "../../../Data/Data.hpp"
#include "../../../Data/TopicModel.hpp"
#include "../../../Helper/CommaLocale.hpp"
#include "../../../Helper/DotLocale.hpp"
#include "../../../Helper/FileSystem.hpp"
#include "../../../Helper/Math.hpp"
#include "../../../Helper/Memory.hpp"
#include "../../../Helper/Queue.hpp"
#include "../../../Main/Database.hpp"
#include "../../../Struct/StatusSetter.hpp"
#include "../../../Struct/TableColumn.hpp"
#include "../../../Struct/TextMap.hpp"
#include "../../../Struct/ThreadOptions.hpp"
#include "../../../Struct/ThreadStatus.hpp"
#include "../../../Timer/Simple.hpp"
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iomanip>
#include <ios>
#include <limits>
#include <queue>
#include <random>
#include <sstream>
#include <string>
#include <string_view>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

Go to the source code of this file.
Classes | |
| class | crawlservpp::Module::Analyzer::Algo::TopicModelling |
| Topic Modeller. More... | |
Namespaces | |
| crawlservpp::Module::Analyzer::Algo | |
| Namespace for algorithm classes. | |
Constants | |
| constexpr auto | crawlservpp::Module::Analyzer::Algo::topicModellingDirectory {"mdl"sv} |
| The directory for model files. More... | |
| constexpr auto | crawlservpp::Module::Analyzer::Algo::topicModellingDefaultNumberOfTopics {2} |
| The default number of initial topics. More... | |
| constexpr auto | crawlservpp::Module::Analyzer::Algo::topicModellingDefaultNumberOfTopicTokens {5} |
| The default number of most-probable tokens for each detected topic. More... | |
| constexpr auto | crawlservpp::Module::Analyzer::Algo::topicModellingDefaultBurnIn {100} |
| The default number of burn-in iterations. More... | |
| constexpr auto | crawlservpp::Module::Analyzer::Algo::topicModellingDefaultIterations {1000} |
| The default number of iterations to train the model. More... | |
| constexpr auto | crawlservpp::Module::Analyzer::Algo::topicModellingDefaultIterationsAtOnce {25} |
| The default number of iterations to train the model at once. More... | |
| constexpr auto | crawlservpp::Module::Analyzer::Algo::topicModellingDefaultMinCf {1} |
| The default minimum frequency of a token in the corpus. More... | |
| constexpr auto | crawlservpp::Module::Analyzer::Algo::topicModellingDefaultMinDf {1} |
| The default minimum document frequency of a token. More... | |
| constexpr auto | crawlservpp::Module::Analyzer::Algo::topicModellingDefaultOptimizeEvery {10} |
| The default optimization interval for the model parameters, in training iterations. More... | |
| constexpr auto | crawlservpp::Module::Analyzer::Algo::topicModellingDefaultRemoveTopN {0} |
| The default number of most-common tokens to ignore. More... | |
| constexpr auto | crawlservpp::Module::Analyzer::Algo::topicModellingDefaultNumberOfThreads {1} |
| The default number of threads for training the model. More... | |
| constexpr auto | crawlservpp::Module::Analyzer::Algo::topicModellingDefaultAlpha {0.1F} |
| The default initial hyperparameter for the Dirichlet distribution for document–table. More... | |
| constexpr auto | crawlservpp::Module::Analyzer::Algo::topicModellingDefaultConversionThreshold {0.F} |
| The default threshold for topics to be included when converting a HDP to a LDA model. More... | |
| constexpr auto | crawlservpp::Module::Analyzer::Algo::topicModellingDefaultEta {0.01F} |
| The default initial hyperparameter for the Dirichlet distribution for topic–token. More... | |
| constexpr auto | crawlservpp::Module::Analyzer::Algo::topicModellingDefaultGamma {0.1F} |
| The default initial concentration coefficient of the Dirichlet Process for table–topic. More... | |
| constexpr auto | crawlservpp::Module::Analyzer::Algo::topicModellingDefaultDocIterations {100} |
| The default maximum number of iterations to classify a document. More... | |
| constexpr auto | crawlservpp::Module::Analyzer::Algo::topicModellingDefaultNumberOfWorkers {0} |
| The default number of worker threads for inferring the topics of articles. More... | |
| constexpr auto | crawlservpp::Module::Analyzer::Algo::topicModellingDefaultMinLabelCf {1} |
| The default minimum frequency of a topic label in the corpus. More... | |
| constexpr auto | crawlservpp::Module::Analyzer::Algo::topicModellingDefaultMinLabelDf {1} |
| The default minimum document frequency of a topic label. More... | |
| constexpr auto | crawlservpp::Module::Analyzer::Algo::topicModellingDefaultMinLabelLength {2} |
| The default minimum length of topic labels, in tokens. More... | |
| constexpr auto | crawlservpp::Module::Analyzer::Algo::topicModellingDefaultMaxLabelLength {5} |
| The default maximum length of topic labels, in tokens. More... | |
| constexpr auto | crawlservpp::Module::Analyzer::Algo::topicModellingDefaultMaxLabelCandidates {10000} |
| The default maximum number of topic label candidates to be extracted from the training data. More... | |
| constexpr auto | crawlservpp::Module::Analyzer::Algo::topicModellingDefaultLabelSmoothing {.1F} |
| The default Laplace smoothing for the automated detection of topic labels. More... | |
| constexpr auto | crawlservpp::Module::Analyzer::Algo::topicModellingDefaultLabelMu {.25F} |
| The default discriminative coefficient for the automated detection of topic labels. More... | |
| constexpr auto | crawlservpp::Module::Analyzer::Algo::topicModellingUpdateProgressEvery {1000} |
| The number of added/saved articles after which the progress will be updated. More... | |
| constexpr auto | crawlservpp::Module::Analyzer::Algo::topicModellingUpdateProgressEveryDocs {25} |
| The number of classified documents after which the progress will be updated. More... | |
| constexpr auto | crawlservpp::Module::Analyzer::Algo::topicModellingPrecisionLL {6} |
| The number of digits of the log-likelihood to be logged. More... | |
| constexpr auto | crawlservpp::Module::Analyzer::Algo::topicModellingTargetColumns {2} |
| The number of additional columns in the target table. More... | |
| constexpr auto | crawlservpp::Module::Analyzer::Algo::topicModellingTopicColumns {2} |
| The number of additional columns in the topic table. More... | |
| constexpr auto | crawlservpp::Module::Analyzer::Algo::topicModellingColumnsPerLabel {2} |
| The number of columns per top label. More... | |
| constexpr auto | crawlservpp::Module::Analyzer::Algo::topicModellingColumnsPerToken {2} |
| The number of columns per top token. More... | |
| constexpr auto | crawlservpp::Module::Analyzer::Algo::topicModellingPrecisionUlp {5} |
| Precision used when testing topic probabilities for equality, in ULPs (units in the last place). More... | |