|
crawlserv++
[under development]
Application for crawling and analyzing textual content of websites.
|
#include "PickleDict.hpp"
#include "../Helper/FileSystem.hpp"
#include "../Helper/Memory.hpp"
#include "../Helper/SilentInclude/EigenRand.h"
#include "../Helper/SilentInclude/tomoto.h"
#include "../Helper/Versions.hpp"
#include "../Main/Exception.hpp"
#include "../Struct/TopicModelInfo.hpp"
#include <algorithm>
#include <array>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <ios>
#include <limits>
#include <memory>
#include <numeric>
#include <random>
#include <string>
#include <string_view>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

Go to the source code of this file.
Classes | |
| class | crawlservpp::Data::TopicModel |
| Topic modeller. More... | |
| class | crawlservpp::Data::TopicModel::Exception |
| Class for topic modelling-specific exceptions. More... | |
Namespaces | |
| crawlservpp::Data | |
| Namespace for different types of data. | |
Macros | |
| #define | DATA_TOPICMODEL_CALL(isHdp, isIdf, function, ...) |
| #define | DATA_TOPICMODEL_RETRIEVE_NOARGS(x, isHdp, isIdf, function) |
| #define | DATA_TOPICMODEL_RETRIEVE(x, isHdp, isIdf, function, ...) |
| #define | DATA_TOPICMODEL_RETURN(isHdp, isIdf, function) |
Constants | |
| constexpr auto | crawlservpp::Data::hdpModelName {"HDPModel"sv} |
| The name of the HDP model. More... | |
| constexpr auto | crawlservpp::Data::ldaModelName {"LDAModel"sv} |
| The name of the LDA model. More... | |
| constexpr auto | crawlservpp::Data::defaultNumberOfInitialTopics {2} |
| The initial number of topics by default. More... | |
| constexpr auto | crawlservpp::Data::defaultAlpha {0.1F} |
| The default concentration coefficient of the Dirichlet Process for document-table. More... | |
| constexpr auto | crawlservpp::Data::defaultEta {0.01F} |
| The default hyperparameter for the Dirichlet distribution for topic-token. More... | |
| constexpr auto | crawlservpp::Data::defaultGamma {0.1F} |
| The default concentration coefficient of the Dirichlet Process for table-topic. More... | |
| constexpr auto | crawlservpp::Data::defaultOptimizationInterval {10} |
| The default interval for optimizing the parameters, in iterations. More... | |
| constexpr auto | crawlservpp::Data::modelFileHead {"LDA\0\0"sv} |
| The beginning of a valid model file containing an LDA (or HDP) model. More... | |
| constexpr auto | crawlservpp::Data::modelFileTermWeightingLen {5} |
| The number of bytes determining the term weighting scheme in a model file. More... | |
| constexpr auto | crawlservpp::Data::modelFileTermWeightingOne {"one\0\0"sv} |
| The term weighting scheme ONE as saved in a model file. More... | |
| constexpr auto | crawlservpp::Data::modelFileTermWeightingIdf {"idf\0\0"sv} |
| The term weighting scheme IDF (tf-idf) as saved in a model file. More... | |
| constexpr auto | crawlservpp::Data::modelFileType {"TPTK"sv} |
| The tomoto file format as saved in a model file (after model head and term weighting scheme). More... | |
| #define DATA_TOPICMODEL_CALL(isHdp, isIdf, function, ...) |
Referenced by crawlservpp::Data::TopicModel::addDocument(), crawlservpp::Data::TopicModel::clear(), crawlservpp::Data::TopicModel::getDocumentsTopics(), crawlservpp::Data::TopicModel::load(), crawlservpp::Data::TopicModel::save(), and crawlservpp::Data::TopicModel::setBurnInIteration().
| #define DATA_TOPICMODEL_RETRIEVE(x, isHdp, isIdf, function, ...) |
Referenced by crawlservpp::Data::TopicModel::getDocumentId(), crawlservpp::Data::TopicModel::getDocuments(), crawlservpp::Data::TopicModel::getDocumentsTopics(), and crawlservpp::Data::TopicModel::getTopicTopNTokens().
| #define DATA_TOPICMODEL_RETRIEVE_NOARGS(x, isHdp, isIdf, function) |
Referenced by crawlservpp::Data::TopicModel::clear(), crawlservpp::Data::TopicModel::getModelInfo(), crawlservpp::Data::TopicModel::getTokenEntropy(), and crawlservpp::Data::TopicModel::getTopicsSorted().
| #define DATA_TOPICMODEL_RETURN(isHdp, isIdf, function) |
Referenced by crawlservpp::Data::TopicModel::clear(), crawlservpp::Data::TopicModel::getBurnInIterations(), crawlservpp::Data::TopicModel::getIterations(), crawlservpp::Data::TopicModel::getLogLikelihoodPerToken(), crawlservpp::Data::TopicModel::getNumberOfDocuments(), crawlservpp::Data::TopicModel::getNumberOfTokens(), crawlservpp::Data::TopicModel::getParameterOptimizationInterval(), and crawlservpp::Data::TopicModel::getVocabularySize().