crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
TopicModelInfo.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2021 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * TopicModelInfo.hpp
24  *
25  * Structure with information about
26  * Hierarchical Dirichlet Process (HDP) models.
27  *
28  * Using tomoto, the underlying C++ API of tomotopy, see:
29  * https://bab2min.github.io/tomotopy/
30  *
31  * If you use the HDP topic modelling algorithm, please cite:
32  *
33  * Teh, Y. W., Jordan, M. I., Beal, M. J., & Blei, D. M. (2005). Sharing
34  * clusters among related groups: Hierarchical Dirichlet processes.
35  * In Advances in neural information processing systems, 1385–1392.
36  *
37  * Newman, D., Asuncion, A., Smyth, P., & Welling, M. (2009). Distributed
38  * algorithms for topic models. Journal of Machine Learning Research,
39  * 10 (Aug), 1801–1828.
40  *
41  * If you use the LDA topic modelling algorithm, please cite:
42  *
43  * Blei, D. M., Ng, A. Y., & Jordan, M. I. (2003). Latent dirichlet
44  * allocation. Journal of machine Learning research, 3(Jan), 993–1022.
45  *
46  * Newman, D., Asuncion, A., Smyth, P., & Welling, M. (2009). Distributed
47  * algorithms for topic models. Journal of Machine Learning Research,
48  * 10 (Aug), 1801–1828.
49  *
50  * If you use automated topic labeling, please cite:
51  *
52  * Mei, Q., Shen, X., & Zhai, C. (2007). Automatic labeling of multinomial
53  * topic models. In Proceedings of the 13th ACM SIGKDD International
54  * Conference on Knowledge Discovery and Data Mining, 490–499.
55  *
56  * Created on: Feb 2, 2021
57  * Author: ans
58  */
59 
60 #ifndef STRUCT_HDPMODELINFO_HPP_
61 #define STRUCT_HDPMODELINFO_HPP_
62 
#include <cstddef>	// std::size_t
#include <cstdint>	// std::uint8_t
#include <queue>	// std::queue
#include <sstream>	// std::ostringstream
#include <string>	// std::string, std::to_string
#include <vector>	// std::vector
68 
69 namespace crawlservpp::Struct {
70 
72  struct TopicModelInfo {
75 
77  std::string modelName;
78 
80  std::string modelVersion;
81 
83  std::size_t numberOfDocuments{};
84 
86  std::size_t numberOfTokens{};
87 
88  // The number of unique tokens in the model.
89  std::size_t sizeOfVocabulary{};
90 
91  // The number of unique tokens used in the model.
92  std::size_t sizeOfVocabularyUsed{};
93 
95  double tokenEntropy{};
96 
98  std::vector<std::string> removedTokens;
99 
103 
105  std::size_t numberOfIterations{};
106 
108  std::size_t numberOfBurnInSteps{};
109 
111  std::size_t optimizationInterval{};
112 
115 
119 
121  std::string weighting;
122 
124  std::size_t minCollectionFrequency{};
125 
127  std::size_t minDocumentFrequency{};
128 
131 
133  std::size_t numberOfInitialTopics{};
134 
136  float initialAlpha{};
137 
139  float initialEta{};
140 
142  float initialGamma{};
143 
145  std::size_t seed{};
146 
148  std::string trainedWithVersion{};
149 
153 
155  float alpha{};
156 
158  std::vector<float> alphas;
159 
161  float eta{};
162 
164 
168  float gamma{};
169 
171  std::size_t numberOfTopics{};
172 
174 
178  std::size_t numberOfTables{};
179 
183 
185  [[nodiscard]] std::queue<std::string> toQueueOfStrings() const {
186  std::queue<std::string> result;
187 
188  result.emplace("<Basic Info>");
189  result.emplace(
190  "| "
191  + this->modelName
192  + " (current version: "
193  + this->modelVersion
194  + ")"
195  );
196  result.emplace(
197  "| "
198  +
199  std::to_string(this->numberOfDocuments)
200  + " docs, "
201  + std::to_string(this->numberOfTokens)
202  + " tokens"
203  );
204  result.emplace(
205  "| Total Vocabs: "
206  + std::to_string(this->sizeOfVocabulary)
207  + ", Used Vocabs: "
208  + std::to_string(this->sizeOfVocabularyUsed)
209  );
210  result.emplace(
211  "| Entropy of tokens: "
212  + std::to_string(this->tokenEntropy)
213  );
214 
215  std::string removed{"| Removed Vocabs:"};
216 
217  if(this->removedTokens.empty()) {
218  removed += " <NA>";
219  }
220  else {
221  for(const auto& token : this->removedTokens) {
222  removed.push_back(' ');
223 
224  removed += token;
225  }
226  }
227 
228  result.emplace(removed);
229  result.emplace("|");
230  result.emplace("<Training Info>");
231  result.emplace(
232  "| Iterations: "
233  + std::to_string(this->numberOfIterations)
234  + ", Burn-in steps: "
235  + std::to_string(this->numberOfBurnInSteps)
236  );
237  result.emplace(
238  "| Optimization Interval: "
239  + std::to_string(this->optimizationInterval)
240  );
241  result.emplace(
242  "| Log-likelihood per token: "
243  + std::to_string(this->logLikelihoodPerToken)
244  );
245  result.emplace("|");
246  result.emplace("<Initial Parameters>");
247  result.emplace("| tw: " + this->weighting);
248  result.emplace(
249  "| min_cf: "
250  + std::to_string(this->minCollectionFrequency)
251  + " (minimum collection frequency of tokens)"
252  );
253  result.emplace(
254  "| min_df: "
255  + std::to_string(this->minDocumentFrequency)
256  + " (minimum document frequency of tokens)"
257  );
258  result.emplace(
259  "| rm_top: "
260  + std::to_string(this->numberOfTopTokensToBeRemoved)
261  + " (the number of top tokens to be removed)"
262  );
263  if(this->numberOfInitialTopics > 0) {
264  result.emplace(
265  "| initial_k: "
266  + std::to_string(this->numberOfInitialTopics)
267  + " (the initial number of topics between 2 ~ 32767,"
268  " which will be adjusted for data during training)"
269  );
270  }
271  else {
272  result.emplace(
273  "| k: "
274  + std::to_string(this->numberOfTopics)
275  + " (the number of topics between 1 ~ 32767)"
276  );
277  }
278  result.emplace(
279  "| alpha: "
280  + std::to_string(this->initialAlpha)
281  + " (concentration coeficient of Dirichlet Process for document-topic)"
282  );
283  result.emplace(
284  "| eta: "
285  + std::to_string(this->initialEta)
286  + " (hyperparameter of Dirichlet distribution for topic-token)"
287  );
288 
289  if(this->initialGamma > 0.) { /* only used by HDP */
290  result.emplace(
291  "| gamma: "
292  + std::to_string(this->initialGamma)
293  + " (concentration coeficient of Dirichlet Process for table-topic)"
294  );
295  }
296 
297  result.emplace(
298  "| seed: "
299  + std::to_string(this->seed)
300  + " (random seed)"
301  );
302  if(!(this->trainedWithVersion.empty())) {
303  result.emplace(
304  "| trained in version " + this->trainedWithVersion
305  );
306  }
307  result.emplace("|");
308  result.emplace("<Parameters>");
309  if(this->alphas.empty()) {
310  result.emplace("| alpha (concentration coeficient of Dirichlet Process for document-table)");
311  result.emplace("| " + std::to_string(this->alpha));
312  }
313  else { /* only used by LDA */
314  result.emplace("| alpha (Dirichlet prior on the per-document topic distributions)");
315 
316  constexpr uint8_t lineBreakAfter{6};
317  std::string line{"| ["};
318  std::uint8_t lineN{};
319 
320  for(const auto a : this->alphas) {
321  if(lineN == lineBreakAfter) {
322  // remove last space and add line
323  line.pop_back();
324 
325  result.emplace(line);
326 
327  line = "| ";
328 
329  lineN = 0;
330  }
331 
332  line += std::to_string(a) + " ";
333 
334  ++lineN;
335  }
336 
337  line.back() = ']';
338 
339  result.emplace(line);
340  }
341  result.emplace("| eta (Dirichlet prior on the per-topic token distribution)");
342  result.emplace("| " + std::to_string(this->eta));
343 
344  if(gamma > 0.) { /* only used by HDP */
345  result.emplace("| gamma (concentration coeficient of Dirichlet Process for table-topic)");
346  result.emplace("| " + std::to_string(this->gamma));
347  }
348 
349  result.emplace("|");
350  result.emplace("| Number of Topics: " + std::to_string(this->numberOfTopics));
351 
352  if(this->numberOfTables > 0) { /* only used by HDP */
353  result.emplace("| Number of Tables: " + std::to_string(this->numberOfTables));
354  }
355 
356  return result;
357  }
358  };
359 
360 } /* namespace crawlservpp::Struct */
361 
362 #endif /* STRUCT_HDPMODELINFO_HPP_ */
std::size_t minCollectionFrequency
Minimum collection frequency of tokens.
Definition: TopicModelInfo.hpp:124
std::queue< std::string > toQueueOfStrings() const
Return queue with strings describing the information contained in the structure.
Definition: TopicModelInfo.hpp:185
std::size_t numberOfBurnInSteps
The number of initially skipped, i.e. burn-in, steps.
Definition: TopicModelInfo.hpp:108
std::size_t numberOfTopics
The number of topics.
Definition: TopicModelInfo.hpp:171
double tokenEntropy
The entropy of tokens in the model.
Definition: TopicModelInfo.hpp:95
std::size_t sizeOfVocabulary
Definition: TopicModelInfo.hpp:89
Structure containing information about the currently trained Hierarchical Dirichlet Process (HDP) model.
Definition: TopicModelInfo.hpp:72
std::size_t numberOfTopTokensToBeRemoved
The number of top tokens to be removed.
Definition: TopicModelInfo.hpp:130
std::size_t sizeOfVocabularyUsed
Definition: TopicModelInfo.hpp:92
float initialAlpha
The initial concentration coefficient of the Dirichlet Process for document–table.
Definition: TopicModelInfo.hpp:136
std::vector< std::string > removedTokens
The top tokens removed before training.
Definition: TopicModelInfo.hpp:98
float alpha
The concentration coefficient of the Dirichlet Process for document-table (HDP only).
Definition: TopicModelInfo.hpp:155
std::size_t numberOfTokens
The number of tokens in the model.
Definition: TopicModelInfo.hpp:86
std::size_t numberOfIterations
The number of iterations performed.
Definition: TopicModelInfo.hpp:105
std::size_t minDocumentFrequency
Minimum document frequency of tokens.
Definition: TopicModelInfo.hpp:127
std::string modelName
The name of the model.
Definition: TopicModelInfo.hpp:77
std::size_t numberOfTables
The number of tables.
Definition: TopicModelInfo.hpp:178
float initialEta
The initial hyperparameter for the Dirichlet distribution for topic–token.
Definition: TopicModelInfo.hpp:139
std::string weighting
Term weighting mode as string.
Definition: TopicModelInfo.hpp:121
std::vector< float > alphas
The Dirichlet priors on the per-document topic distributions (LDA only).
Definition: TopicModelInfo.hpp:158
std::size_t seed
The initial seed for random number generation.
Definition: TopicModelInfo.hpp:145
std::size_t numberOfDocuments
The number of documents in the model.
Definition: TopicModelInfo.hpp:83
std::string trainedWithVersion
The version of the modeller the model has been trained with.
Definition: TopicModelInfo.hpp:148
float initialGamma
The initial concentration coefficient of the Dirichlet Process for table–topic.
Definition: TopicModelInfo.hpp:142
double logLikelihoodPerToken
The log-likelihood per token.
Definition: TopicModelInfo.hpp:114
Namespace for data structures.
Definition: AlgoThreadProperties.hpp:43
float eta
The Dirichlet prior on the per-topic token distribution (HDP only).
Definition: TopicModelInfo.hpp:161
std::string modelVersion
The version of the model (as string).
Definition: TopicModelInfo.hpp:80
float gamma
The concentration coefficient of the Dirichlet Process for table-topic.
Definition: TopicModelInfo.hpp:168
std::size_t numberOfInitialTopics
The initial number of topics, which will be adjusted for the data during training.
Definition: TopicModelInfo.hpp:133
std::size_t optimizationInterval
The optimization interval.
Definition: TopicModelInfo.hpp:111