crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
TopicModel.hpp File Reference
#include "PickleDict.hpp"
#include "../Helper/FileSystem.hpp"
#include "../Helper/Memory.hpp"
#include "../Helper/SilentInclude/EigenRand.h"
#include "../Helper/SilentInclude/tomoto.h"
#include "../Helper/Versions.hpp"
#include "../Main/Exception.hpp"
#include "../Struct/TopicModelInfo.hpp"
#include <algorithm>
#include <array>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <ios>
#include <limits>
#include <memory>
#include <numeric>
#include <random>
#include <string>
#include <string_view>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
Include dependency graph for TopicModel.hpp:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Classes

class  crawlservpp::Data::TopicModel
 Topic modeller. More...
 
class  crawlservpp::Data::TopicModel::Exception
 Class for topic modelling-specific exceptions. More...
 

Namespaces

 crawlservpp::Data
 Namespace for different types of data.
 

Macros

#define DATA_TOPICMODEL_CALL(isHdp, isIdf, function, ...)
 
#define DATA_TOPICMODEL_RETRIEVE_NOARGS(x, isHdp, isIdf, function)
 
#define DATA_TOPICMODEL_RETRIEVE(x, isHdp, isIdf, function, ...)
 
#define DATA_TOPICMODEL_RETURN(isHdp, isIdf, function)
 

Constants

constexpr auto crawlservpp::Data::hdpModelName {"HDPModel"sv}
 The name of the HDP model. More...
 
constexpr auto crawlservpp::Data::ldaModelName {"LDAModel"sv}
 The name of the LDA model. More...
 
constexpr auto crawlservpp::Data::defaultNumberOfInitialTopics {2}
 The initial number of topics by default. More...
 
constexpr auto crawlservpp::Data::defaultAlpha {0.1F}
 The default concentration coefficient of the Dirichlet Process for document-table. More...
 
constexpr auto crawlservpp::Data::defaultEta {0.01F}
 The default hyperparameter for the Dirichlet distribution for topic-token. More...
 
constexpr auto crawlservpp::Data::defaultGamma {0.1F}
 The default concentration coefficient of the Dirichlet Process for table-topic. More...
 
constexpr auto crawlservpp::Data::defaultOptimizationInterval {10}
 The default interval for optimizing the parameters, in iterations. More...
 
constexpr auto crawlservpp::Data::modelFileHead {"LDA\0\0"sv}
 The beginning of a valid model file containing an LDA (or HDP) model. More...
 
constexpr auto crawlservpp::Data::modelFileTermWeightingLen {5}
 The number of bytes determining the term weighting scheme in a model file. More...
 
constexpr auto crawlservpp::Data::modelFileTermWeightingOne {"one\0\0"sv}
 The term weighting scheme ONE as saved in a model file. More...
 
constexpr auto crawlservpp::Data::modelFileTermWeightingIdf {"idf\0\0"sv}
 The term weighting scheme IDF (tf-idf) as saved in a model file. More...
 
constexpr auto crawlservpp::Data::modelFileType {"TPTK"sv}
 The tomoto file format as saved in a model file (after model head and term weighting scheme). More...
 

Macro Definition Documentation

◆ DATA_TOPICMODEL_CALL

#define DATA_TOPICMODEL_CALL (   isHdp,
  isIdf,
  function,
  ... 
)
Value:
if(isHdp) { \
if(isIdf) { \
this->hdpModelIdf->function(__VA_ARGS__); \
} \
else { \
this->hdpModel->function(__VA_ARGS__); \
} \
} \
else { \
if(isIdf) { \
this->ldaModelIdf->function(__VA_ARGS__); \
} \
else { \
this->ldaModel->function(__VA_ARGS__); \
} \
}

Referenced by crawlservpp::Data::TopicModel::addDocument(), crawlservpp::Data::TopicModel::clear(), crawlservpp::Data::TopicModel::getDocumentsTopics(), crawlservpp::Data::TopicModel::load(), crawlservpp::Data::TopicModel::save(), and crawlservpp::Data::TopicModel::setBurnInIteration().

◆ DATA_TOPICMODEL_RETRIEVE

#define DATA_TOPICMODEL_RETRIEVE (   x,
  isHdp,
  isIdf,
  function,
  ... 
)
Value:
if(isHdp) { \
if(isIdf) { \
(x) = this->hdpModelIdf->function(__VA_ARGS__); \
} \
else { \
(x) = this->hdpModel->function(__VA_ARGS__); \
} \
} \
else { \
if(isIdf) { \
(x) = this->ldaModelIdf->function(__VA_ARGS__); \
} \
else { \
(x) = this->ldaModel->function(__VA_ARGS__); \
} \
}

Referenced by crawlservpp::Data::TopicModel::getDocumentId(), crawlservpp::Data::TopicModel::getDocuments(), crawlservpp::Data::TopicModel::getDocumentsTopics(), and crawlservpp::Data::TopicModel::getTopicTopNTokens().

◆ DATA_TOPICMODEL_RETRIEVE_NOARGS

#define DATA_TOPICMODEL_RETRIEVE_NOARGS (   x,
  isHdp,
  isIdf,
  function 
)
Value:
if(isHdp) { \
if(isIdf) { \
(x) = this->hdpModelIdf->function(); \
} \
else { \
(x) = this->hdpModel->function(); \
} \
} \
else { \
if(isIdf) { \
(x) = this->ldaModelIdf->function(); \
} \
else { \
(x) = this->ldaModel->function(); \
} \
}

Referenced by crawlservpp::Data::TopicModel::clear(), crawlservpp::Data::TopicModel::getModelInfo(), crawlservpp::Data::TopicModel::getTokenEntropy(), and crawlservpp::Data::TopicModel::getTopicsSorted().

◆ DATA_TOPICMODEL_RETURN

#define DATA_TOPICMODEL_RETURN (   isHdp,
  isIdf,
  function 
)