60 #ifndef STRUCT_HDPMODELINFO_HPP_ 61 #define STRUCT_HDPMODELINFO_HPP_ 186 std::queue<std::string> result;
188 result.emplace(
"<Basic Info>");
192 +
" (current version: " 211 "| Entropy of tokens: " 215 std::string removed{
"| Removed Vocabs:"};
217 if(this->removedTokens.empty()) {
221 for(
const auto& token : this->removedTokens) {
222 removed.push_back(
' ');
228 result.emplace(removed);
230 result.emplace(
"<Training Info>");
234 +
", Burn-in steps: " 238 "| Optimization Interval: " 242 "| Log-likelihood per token: " 246 result.emplace(
"<Initial Parameters>");
247 result.emplace(
"| tw: " + this->weighting);
251 +
" (minimum collection frequency of tokens)" 256 +
" (minimum document frequency of tokens)" 261 +
" (the number of top tokens to be removed)" 267 +
" (the initial number of topics between 2 ~ 32767," 268 " which will be adjusted for data during training)" 275 +
" (the number of topics between 1 ~ 32767)" 281 +
" (concentration coeficient of Dirichlet Process for document-topic)" 286 +
" (hyperparameter of Dirichlet distribution for topic-token)" 293 +
" (concentration coeficient of Dirichlet Process for table-topic)" 299 + std::to_string(this->
seed)
308 result.emplace(
"<Parameters>");
309 if(this->alphas.empty()) {
310 result.emplace(
"| alpha (concentration coeficient of Dirichlet Process for document-table)");
311 result.emplace(
"| " + std::to_string(this->
alpha));
314 result.emplace(
"| alpha (Dirichlet prior on the per-document topic distributions)");
316 constexpr uint8_t lineBreakAfter{6};
317 std::string line{
"| ["};
318 std::uint8_t lineN{};
320 for(
const auto a : this->alphas) {
321 if(lineN == lineBreakAfter) {
325 result.emplace(line);
332 line += std::to_string(a) +
" ";
339 result.emplace(line);
341 result.emplace(
"| eta (Dirichlet prior on the per-topic token distribution)");
342 result.emplace(
"| " + std::to_string(this->
eta));
345 result.emplace(
"| gamma (concentration coeficient of Dirichlet Process for table-topic)");
346 result.emplace(
"| " + std::to_string(this->
gamma));
350 result.emplace(
"| Number of Topics: " + std::to_string(this->
numberOfTopics));
353 result.emplace(
"| Number of Tables: " + std::to_string(this->
numberOfTables));
std::size_t minCollectionFrequency
Minimum collection frequency of tokens.
Definition: TopicModelInfo.hpp:124
std::queue< std::string > toQueueOfStrings() const
Return queue with strings describing the information contained in the structure.
Definition: TopicModelInfo.hpp:185
std::size_t numberOfBurnInSteps
The number of initially skipped, i.e. burn-in, steps.
Definition: TopicModelInfo.hpp:108
std::size_t numberOfTopics
The number of topics.
Definition: TopicModelInfo.hpp:171
double tokenEntropy
The entropy of tokens in the model.
Definition: TopicModelInfo.hpp:95
std::size_t sizeOfVocabulary
Definition: TopicModelInfo.hpp:89
Structure containing information about the currently trained Hierarchical Dirichlet Process (HDP) model.
Definition: TopicModelInfo.hpp:72
std::size_t numberOfTopTokensToBeRemoved
The number of top tokens to be removed.
Definition: TopicModelInfo.hpp:130
std::size_t sizeOfVocabularyUsed
Definition: TopicModelInfo.hpp:92
float initialAlpha
The initial concentration coefficient of the Dirichlet Process for document–table.
Definition: TopicModelInfo.hpp:136
std::vector< std::string > removedTokens
The top tokens removed before training.
Definition: TopicModelInfo.hpp:98
float alpha
The concentration coefficient of the Dirichlet Process for document-table (HDP only).
Definition: TopicModelInfo.hpp:155
std::size_t numberOfTokens
The number of tokens in the model.
Definition: TopicModelInfo.hpp:86
std::size_t numberOfIterations
The number of iterations performed.
Definition: TopicModelInfo.hpp:105
std::size_t minDocumentFrequency
Minimum document frequency of tokens.
Definition: TopicModelInfo.hpp:127
std::string modelName
The name of the model.
Definition: TopicModelInfo.hpp:77
std::size_t numberOfTables
The number of tables.
Definition: TopicModelInfo.hpp:178
float initialEta
The initial hyperparameter for the Dirichlet distribution for topic–token.
Definition: TopicModelInfo.hpp:139
std::string weighting
Term weighting mode as string.
Definition: TopicModelInfo.hpp:121
std::vector< float > alphas
The Dirichlet priors on the per-document topic distributions (LDA only).
Definition: TopicModelInfo.hpp:158
std::size_t seed
The initial seed for random number generation.
Definition: TopicModelInfo.hpp:145
std::size_t numberOfDocuments
The number of documents in the model.
Definition: TopicModelInfo.hpp:83
std::string trainedWithVersion
The version of the modeller the model has been trained with.
Definition: TopicModelInfo.hpp:148
float initialGamma
The initial concentration coefficient of the Dirichlet Process for table–topic.
Definition: TopicModelInfo.hpp:142
double logLikelihoodPerToken
The log-likelihood per token.
Definition: TopicModelInfo.hpp:114
Namespace for data structures.
Definition: AlgoThreadProperties.hpp:43
float eta
The Dirichlet prior on the per-topic token distribution (HDP only).
Definition: TopicModelInfo.hpp:161
std::string modelVersion
The version of the model (as string).
Definition: TopicModelInfo.hpp:80
float gamma
The concentration coefficient of the Dirichlet Process for table-topic.
Definition: TopicModelInfo.hpp:168
std::size_t numberOfInitialTopics
The initial number of topics, which will be adjusted for the data during training.
Definition: TopicModelInfo.hpp:133
std::size_t optimizationInterval
The optimization interval.
Definition: TopicModelInfo.hpp:111