crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
TopicModel.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2022 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * TopicModel.hpp
24  *
25  * Topic modeller using the Hierarchical Dirichlet Process (HDP) and
26  * Latent Dirichlet Allocation (LDA) algorithms.
27  *
28  * The former will be used if no fixed number of topics is given,
29  * the latter will be used if a fixed number of topics is given.
30  *
31  * Using tomoto, the underlying C++ API of tomotopy, see:
32  * https://bab2min.github.io/tomotopy/
33  *
34  * If you use the HDP topic modelling algorithm, please cite:
35  *
36  * Teh, Y. W., Jordan, M. I., Beal, M. J., & Blei, D. M. (2005). Sharing
37  * clusters among related groups: Hierarchical Dirichlet processes.
38  * In Advances in neural information processing systems, 1385–1392.
39  *
40  * Newman, D., Asuncion, A., Smyth, P., & Welling, M. (2009). Distributed
41  * algorithms for topic models. Journal of Machine Learning Research,
42  * 10 (Aug), 1801–1828.
43  *
44  * If you use the LDA topic modelling algorithm, please cite:
45  *
46  * Blei, D. M., Ng, A. Y., & Jordan, M. I. (2003). Latent dirichlet
47  * allocation. Journal of machine Learning research, 3(Jan), 993–1022.
48  *
49  * Newman, D., Asuncion, A., Smyth, P., & Welling, M. (2009). Distributed
50  * algorithms for topic models. Journal of Machine Learning Research,
51  * 10 (Aug), 1801–1828.
52  *
53  * If you use automated topic labeling, please cite:
54  *
55  * Mei, Q., Shen, X., & Zhai, C. (2007). Automatic labeling of multinomial
56  * topic models. In Proceedings of the 13th ACM SIGKDD International
57  * Conference on Knowledge Discovery and Data Mining, 490–499.
58  *
59  * Created on: Feb 2, 2021
60  * Author: ans
61  */
62 
63 #ifndef DATA_TOPICMODEL_HPP_
64 #define DATA_TOPICMODEL_HPP_
65 
66 #include "PickleDict.hpp"
67 
68 #include "../Helper/FileSystem.hpp"
69 #include "../Helper/Memory.hpp"
70 #include "../Helper/SilentInclude/EigenRand.h"
71 #include "../Helper/SilentInclude/tomoto.h"
72 #include "../Helper/Versions.hpp"
73 #include "../Main/Exception.hpp"
74 #include "../Struct/TopicModelInfo.hpp"
75 
76 #include <algorithm> // std::transform
77 #include <array> // std::array
78 #include <cmath> // std::log
79 #include <cstdint> // std::uint8_t, std::uint64_t
80 #include <cstdlib> // std::size_t
81 #include <fstream> // std::ifstream, std::ofstream
82 #include <ios> // std::ios
83 #include <limits> // std::numeric_limits
84 #include <memory> // std::make_unique, std::unique_ptr
85 #include <numeric> // std::accumulate
86 #include <random> // std::random_device
87 #include <string> // std::string, std::to_string
88 #include <string_view> // std::string_view, std::string_view_literals
89 #include <unordered_map> // std::unordered_map
90 #include <unordered_set> // std::unordered_set
91 #include <utility> // std::move, std::pair
92 #include <vector> // std::vector
93 
// Macro for calling a member function of whichever (pre-compiled) model is in use.
//
// isHdp and isIdf select one of this->hdpModel, this->hdpModelIdf, this->ldaModel
//  and this->ldaModelIdf; all remaining arguments are passed on to `function`.
//  The macro dereferences `this`, i.e. it can only be used inside member functions.
//
// The expansion is wrapped into do { ... } while(false), so that it forms exactly
//  one statement that needs to be terminated by a semicolon. This keeps the macro
//  safe when used as the un-braced branch of an if/else.
//NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define DATA_TOPICMODEL_CALL(isHdp, isIdf, function, ...) \
    do { \
        if(isHdp) { \
            if(isIdf) { \
                this->hdpModelIdf->function(__VA_ARGS__); \
            } \
            else { \
                this->hdpModel->function(__VA_ARGS__); \
            } \
        } \
        else { \
            if(isIdf) { \
                this->ldaModelIdf->function(__VA_ARGS__); \
            } \
            else { \
                this->ldaModel->function(__VA_ARGS__); \
            } \
        } \
    } while(false)
113 
// Macros for retrieving a value from whichever (pre-compiled) model is in use.
//
// Assigns the result of calling `function` without arguments on the model
//  selected by isHdp and isIdf to x. Dereferences `this`, i.e. the macro
//  can only be used inside member functions.
//
// The expansion is wrapped into do { ... } while(false), so that it forms exactly
//  one statement that needs to be terminated by a semicolon, even when it is used
//  as the un-braced branch of an if/else.
//NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define DATA_TOPICMODEL_RETRIEVE_NOARGS(x, isHdp, isIdf, function) \
    do { \
        if(isHdp) { \
            if(isIdf) { \
                (x) = this->hdpModelIdf->function(); \
            } \
            else { \
                (x) = this->hdpModel->function(); \
            } \
        } \
        else { \
            if(isIdf) { \
                (x) = this->ldaModelIdf->function(); \
            } \
            else { \
                (x) = this->ldaModel->function(); \
            } \
        } \
    } while(false)
133 
// Same as DATA_TOPICMODEL_RETRIEVE_NOARGS, but passes all additional
//  arguments on to `function`: assigns the result of calling `function`
//  on the model selected by isHdp and isIdf to x.
//
// The expansion is wrapped into do { ... } while(false), so that it forms exactly
//  one statement that needs to be terminated by a semicolon, even when it is used
//  as the un-braced branch of an if/else.
//NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define DATA_TOPICMODEL_RETRIEVE(x, isHdp, isIdf, function, ...) \
    do { \
        if(isHdp) { \
            if(isIdf) { \
                (x) = this->hdpModelIdf->function(__VA_ARGS__); \
            } \
            else { \
                (x) = this->hdpModel->function(__VA_ARGS__); \
            } \
        } \
        else { \
            if(isIdf) { \
                (x) = this->ldaModelIdf->function(__VA_ARGS__); \
            } \
            else { \
                (x) = this->ldaModel->function(__VA_ARGS__); \
            } \
        } \
    } while(false)
152 
// Macro for returning a value from whichever (pre-compiled) model is in use.
//
// Returns the result of calling `function` without arguments on the model
//  selected by isHdp and isIdf from the enclosing member function.
//
// The expansion is wrapped into do { ... } while(false), so that all of its
//  return statements belong to one single statement. Without the wrapper, only
//  the first `if` would be guarded when the macro is used as an un-braced branch.
//  The invocation needs to be terminated by a semicolon.
//NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define DATA_TOPICMODEL_RETURN(isHdp, isIdf, function) \
    do { \
        if(isHdp) { \
            if(isIdf) { \
                return this->hdpModelIdf->function(); \
            } \
            \
            return this->hdpModel->function(); \
        } \
        \
        if(isIdf) { \
            return this->ldaModelIdf->function(); \
        } \
        \
        return this->ldaModel->function(); \
    } while(false)
169 
170 namespace crawlservpp::Data {
171 
using std::string_view_literals::operator""sv;

/*
 * CONSTANTS
 */

//! The model name returned by TopicModel::getModelName() if the HDP algorithm is used.
inline constexpr auto hdpModelName = "HDPModel"sv;

//! The model name returned by TopicModel::getModelName() if the LDA algorithm is used.
inline constexpr auto ldaModelName = "LDAModel"sv;

//! The default value of TopicModel::numberOfInitialTopics.
inline constexpr auto defaultNumberOfInitialTopics = 2;

//! The default value of TopicModel::initialAlpha.
inline constexpr auto defaultAlpha = 0.1F;

//! The default value of TopicModel::initialEta.
inline constexpr auto defaultEta = 0.01F;

//! The default value of TopicModel::initialGamma.
inline constexpr auto defaultGamma = 0.1F;

//! The default value of TopicModel::optimizationInterval.
inline constexpr auto defaultOptimizationInterval = 10;

//! Five bytes ("LDA" followed by two zero bytes), presumably expected at the beginning of a model file.
inline constexpr auto modelFileHead = "LDA\0\0"sv;

//! Number of bytes of a term weighting tag (the length of the two tags below).
inline constexpr auto modelFileTermWeightingLen = 5;

//! Term weighting tag "one", padded with two zero bytes.
inline constexpr auto modelFileTermWeightingOne = "one\0\0"sv;

//! Term weighting tag "idf", padded with two zero bytes.
inline constexpr auto modelFileTermWeightingIdf = "idf\0\0"sv;

//! Four bytes presumably identifying the type of a model file.
inline constexpr auto modelFileType = "TPTK"sv;
220 
222 
223  /*
224  * DECLARATION
225  */
226 
228 
257  class TopicModel {
258  // for convenience
260 
261  using HDPModel = tomoto::HDPModel<tomoto::TermWeight::one, tomoto::RandGen>;
262  using HDPModelIDF = tomoto::HDPModel<tomoto::TermWeight::idf, tomoto::RandGen>;
263  using LDAModel = tomoto::LDAModel<tomoto::TermWeight::one, tomoto::RandGen>;
264  using LDAModelIDF = tomoto::LDAModel<tomoto::TermWeight::idf, tomoto::RandGen>;
265 
266  using FoRelevance = tomoto::label::FoRelevance;
267  using ITopicModel = tomoto::ITopicModel;
268  using PMIExtractor = tomoto::label::PMIExtractor;
269 
270  public:
273 
274  [[nodiscard]] std::size_t getNumberOfDocuments() const;
275  [[nodiscard]] std::unordered_map<std::string, std::size_t> getDocuments() const;
276  [[nodiscard]] std::size_t getVocabularySize() const;
277  [[nodiscard]] std::size_t getOriginalVocabularySize() const;
278  [[nodiscard]] const std::vector<std::string>& getVocabulary() const;
279  [[nodiscard]] std::size_t getNumberOfTokens() const;
280  [[nodiscard]] std::size_t getBurnInIterations() const;
281  [[nodiscard]] std::size_t getIterations() const;
282  [[nodiscard]] std::size_t getParameterOptimizationInterval() const;
283  [[nodiscard]] std::size_t getRandomNumberGenerationSeed() const;
284  [[nodiscard]] std::string_view getModelName() const;
285  [[nodiscard]] std::string_view getTermWeighting() const;
286  [[nodiscard]] std::size_t getDocumentId(const std::string& name) const;
287  [[nodiscard]] std::vector<std::string> getRemovedTokens() const;
288  [[nodiscard]] std::size_t getNumberOfTopics() const;
289  [[nodiscard]] std::vector<std::size_t> getTopics() const;
290  [[nodiscard]] std::vector<std::pair<std::size_t, std::uint64_t>> getTopicsSorted() const;
291  [[nodiscard]] double getLogLikelihoodPerToken() const;
292  [[nodiscard]] double getTokenEntropy() const;
293  [[nodiscard]] std::vector<std::pair<std::string, float>> getTopicTopNTokens(
294  std::size_t topic,
295  std::size_t n
296  ) const;
297  [[nodiscard]] std::vector<std::pair<std::string, float>> getTopicTopNLabels(
298  std::size_t topic,
299  std::size_t n
300  ) const;
301  [[nodiscard]] std::vector<std::pair<std::string, std::vector<float>>> getDocumentsTopics(
302  std::unordered_set<std::string>& done
303  ) const;
304  [[nodiscard]] std::vector<std::vector<float>> getDocumentsTopics(
305  const std::vector<std::vector<std::string>>& documents,
306  std::size_t maxIterations,
307  std::size_t numberOfWorkers
308  ) const;
309  [[nodiscard]] TopicModelInfo getModelInfo() const;
310 
314 
315  void setFixedNumberOfTopics(std::size_t k);
316  void setUseIdf(bool idf);
317  void setBurnInIteration(std::size_t skipIterations);
318  void setTokenRemoval(
319  std::size_t collectionFrequency,
320  std::size_t documentFrequency,
321  std::size_t fixedNumberOfTopTokens
322  );
324  std::size_t initialTopics,
325  float alpha,
326  float eta,
327  float gamma
328  );
329  void setParameterOptimizationInterval(std::size_t interval);
330  void setRandomNumberGenerationSeed(std::size_t newSeed);
331  void setLabelingOptions(
332  bool activate,
333  std::size_t minCf,
334  std::size_t minDf,
335  std::size_t minLength,
336  std::size_t maxLength,
337  std::size_t maxCandidates,
338  float smoothing,
339  float mu,
340  std::size_t windowSize
341  );
342 
346 
347  void addDocument(
348  const std::string& name,
349  const std::vector<std::string>& tokens,
350  std::size_t firstToken,
351  std::size_t numTokens
352  );
353  void startTraining();
354  void train(
355  std::size_t iterations,
356  std::size_t threads
357  );
358  void label(std::size_t threads);
359 
363 
364  std::size_t load(const std::string& fileName);
365  std::size_t save(const std::string& fileName, bool full) const; //NOLINT(modernize-use-nodiscard)
366 
370 
371  void clear(bool labelingOptions);
372 
374 
377 
378  private:
379  // models
380  std::unique_ptr<HDPModel> hdpModel;
381  std::unique_ptr<HDPModelIDF> hdpModelIdf;
382  std::unique_ptr<LDAModel> ldaModel;
383  std::unique_ptr<LDAModelIDF> ldaModelIdf;
384 
385  // document names
386  std::vector<std::string> docNames;
387 
388  // state
389  bool hasDocs{false};
390  bool isPrepared{false};
391  std::size_t workersUsed{};
392 
393  // settings
394  std::size_t fixedNumberOfTopics{};
395  bool isUseIdf{false};
396  std::size_t numberOfInitialTopics{defaultNumberOfInitialTopics};
397  float initialAlpha{defaultAlpha};
398  float initialEta{defaultEta};
399  float initialGamma{defaultGamma};
400  std::size_t seed{std::random_device{}()};
401  std::size_t minTokenCf{};
402  std::size_t minTokenDf{};
403  std::size_t removeTopNTokens{};
404  std::size_t optimizationInterval{defaultOptimizationInterval};
405  std::string trainedWithVersion{};
406 
407  // labeling
408  std::unique_ptr<FoRelevance> labeler;
409  bool isLabeling{false};
410  std::size_t labelingMinCf{};
411  std::size_t labelingMinDf{};
412  std::size_t labelingMinLength{};
413  std::size_t labelingMaxLength{};
414  std::size_t labelingMaxCandidates{};
415  float labelingSmoothing{};
416  float labelingMu{};
417  std::size_t labelingWindowSize{};
418 
419  // internal helper functions
420  void initModel(bool& isHdpTo, bool& isIdfTo);
421  [[nodiscard]] std::string dictLookUp(tomoto::Vid tokenId) const;
422 
423  void checkModel(
424  const std::string& function,
425  bool& isHdpTo,
426  bool& isIdfTo
427  ) const;
428  void checkNoModel(
429  const std::string& function,
430  const std::string& errorMsg
431  ) const;
432  void checkTrained(const std::string& function) const;
433  void checkNotTrained(
434  const std::string& function,
435  const std::string& errorMsg
436  ) const;
437 
438  [[nodiscard]] const tomoto::Dictionary& getDict(
439  bool isHdp,
440  bool isIdf
441  ) const;
442  [[nodiscard]] std::size_t getLiveK(bool isIdf) const;
443  [[nodiscard]] std::size_t getK(bool isHdp, bool isIdf) const;
444  [[nodiscard]] bool isLiveTopic(bool isIdf, std::size_t topic) const;
445  [[nodiscard]] float getGamma(bool isIdf) const;
446  [[nodiscard]] std::size_t getNumberOfTables(bool isIdf) const;
447 
448  void prepareModel(bool isHdp, bool isIdf);
449  void trainModel(
450  bool isHdp,
451  bool isIdf,
452  std::size_t iterations,
453  std::size_t
454  threads
455  );
456  void loadModelInformation(
457  bool isHdp,
458  bool isIdf,
459  const std::vector<std::uint8_t>& data
460  );
461  void writeModelInformation(
462  bool isHdp,
463  bool isIdf,
464  std::vector<std::uint8_t>& dataTo
465  ) const;
466 
467  [[nodiscard]] std::vector<float> getInferredTopics(
468  bool isHdp,
469  bool isIdf,
470  const tomoto::DocumentBase * doc
471  ) const;
472 
473  [[nodiscard]] const void * get(bool isHdp, bool isIdf) const;
474 
475  // internal static helper functions (definitions only)
476  [[nodiscard]] static tomoto::RawDoc createDocument(
477  const std::string& name,
478  const std::vector<std::string>& tokens,
479  std::size_t firstToken,
480  std::size_t numTokens
481  );
482  static void readModelFileHead(std::istream& in, const std::string& fileName);
483  static void readModelFileTermWeighting(
484  std::istream& in,
485  const std::string& fileName,
486  bool& isIdfTo
487  );
488  static void readModelFileType(std::istream& in, const std::string& fileName);
489  static void resetStream(std::istream& in);
490  static void numberFromDict(
491  const PickleDict& dict,
492  const std::string& key,
493  std::size_t& valueTo
494  );
495  static void floatFromDict(
496  const PickleDict& dict,
497  const std::string& key,
498  float& valueTo
499  );
500  static void stringFromDict(
501  const PickleDict& dict,
502  const std::string& key,
503  std::string& valueTo
504  );
505 
506  static void validateLastResults(
507  std::vector<std::pair<std::string, std::vector<float>>>& results,
508  std::unordered_set<std::string>& done,
509  const std::unordered_set<std::string>::const_iterator& inserted
510  );
511 
512  // internal static helper functions (constexpr and templates)
513  [[nodiscard]] static constexpr std::string_view termWeightToString(bool isIdf) {
514  if(isIdf) {
515  return "TermWeight.IDF";
516  }
517 
518  return "TermWeight.ONE";
519  }
520 
521  template<typename T> [[nodiscard]] static bool bytesEqual(
522  const T& bytes,
523  std::string_view s
524  ) {
525  if(bytes.size() != s.size()) {
526  return false;
527  }
528 
529  for(std::size_t index{}; index < bytes.size(); ++index) {
530  if(bytes[index] != s[index]) { //NOLINT(cppcoreguidelines-pro-bounds-constant-array-index)
531  return false;
532  }
533  }
534 
535  return true;
536  }
537 
538  template<typename T> [[nodiscard]] static std::string bytesToString(
539  const T& bytes
540  ) {
541  std::string result;
542 
543  for(const auto c : bytes) {
544  if(c != '\0') {
545  result.push_back(c);
546  }
547  }
548 
549  return result;
550  }
551 
552  template<tomoto::TermWeight _tw, typename _RandGen>
553  [[nodiscard]] static std::vector<float> removeDeadTopics(
554  const std::vector<float>& results,
555  const std::unique_ptr<tomoto::HDPModel<_tw, _RandGen>>& model
556  ) {
557  std::vector<float> filtered;
558 
559  filtered.reserve(results.size());
560 
561  for(std::size_t topic{}; topic < results.size(); ++topic) {
562  if(model->isLiveTopic(topic)) {
563  filtered.push_back(results[topic]);
564  }
565  }
566 
567  return filtered;
568  }
569  };
570 
571  /*
572  * IMPLEMENTATION
573  */
574 
575  /*
576  * GETTERS
577  */
578 
580 
587  inline std::size_t TopicModel::getNumberOfDocuments() const {
588  bool isHdp{false};
589  bool isIdf{false};
590 
591  this->checkModel("getNumberOfDocuments", isHdp, isIdf);
592  this->checkTrained("getNumberOfDocuments");
593 
594  //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
595  DATA_TOPICMODEL_RETURN(isHdp, isIdf, getNumDocs);
596  }
597 
599 
607  inline std::unordered_map<std::string, std::size_t> TopicModel::getDocuments() const {
608  bool isHdp{false};
609  bool isIdf{false};
610 
611  this->checkModel("getDocuments", isHdp, isIdf);
612  this->checkTrained("getDocuments");
613 
614  std::unordered_map<std::string, std::size_t> result;
615 
616  for(std::size_t index{}; index < this->getNumberOfDocuments(); ++index) {
617  const tomoto::DocumentBase * docPtr{nullptr};
618 
619  //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
620  DATA_TOPICMODEL_RETRIEVE(docPtr, isHdp, isIdf, getDoc, index);
621 
622  if(!(docPtr->docUid.empty())) {
623  result[docPtr->docUid] = index;
624  }
625  }
626 
627  return result;
628  }
629 
631 
639  inline std::size_t TopicModel::getVocabularySize() const {
640  bool isHdp{false};
641  bool isIdf{false};
642 
643  this->checkModel("getVocabularySize", isHdp, isIdf);
644  this->checkTrained("getVocabularySize");
645 
646  //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
647  DATA_TOPICMODEL_RETURN(isHdp, isIdf, getV);
648  }
649 
651 
659  inline std::size_t TopicModel::getOriginalVocabularySize() const {
660  bool isHdp{false};
661  bool isIdf{false};
662 
663  this->checkModel("getOriginalVocabularySize", isHdp, isIdf);
664  this->checkTrained("getOriginalVocabularySize");
665 
666  return this->getDict(isHdp, isIdf).size();
667  }
668 
670 
680  inline const std::vector<std::string>& TopicModel::getVocabulary() const {
681  bool isHdp{false};
682  bool isIdf{false};
683 
684  this->checkModel("getVocabulary", isHdp, isIdf);
685  this->checkTrained("getVocabulary");
686 
687  return this->getDict(isHdp, isIdf).getRaw();
688  }
689 
690 
692 
700  inline std::size_t TopicModel::getNumberOfTokens() const {
701  bool isHdp{false};
702  bool isIdf{false};
703 
704  this->checkModel("getNumberOfTokens", isHdp, isIdf);
705  this->checkTrained("getNumberOfTokens");
706 
707  //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
708  DATA_TOPICMODEL_RETURN(isHdp, isIdf, getN);
709  }
710 
712 
720  inline std::size_t TopicModel::getBurnInIterations() const {
721  bool isHdp{false};
722  bool isIdf{false};
723 
724  this->checkModel("getBurnInIterations", isHdp, isIdf);
725  this->checkTrained("getBurnInIterations");
726 
727  //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
728  DATA_TOPICMODEL_RETURN(isHdp, isIdf, getBurnInIteration);
729  }
730 
732 
740  inline std::size_t TopicModel::getIterations() const {
741  bool isHdp{false};
742  bool isIdf{false};
743 
744  this->checkModel("getIterations", isHdp, isIdf);
745  this->checkTrained("getIterations");
746 
747  //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
748  DATA_TOPICMODEL_RETURN(isHdp, isIdf, getGlobalStep);
749  }
750 
752 
760  inline std::size_t TopicModel::getParameterOptimizationInterval() const {
761  bool isHdp{false};
762  bool isIdf{false};
763 
764  this->checkModel("getParameterOptimizationInterval", isHdp, isIdf);
765  this->checkTrained("getParameterOptimizationInterval");
766 
767  //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
768  DATA_TOPICMODEL_RETURN(isHdp, isIdf, getOptimInterval);
769  }
770 
772 
780  inline std::size_t TopicModel::getRandomNumberGenerationSeed() const {
781  bool isHdp{false};
782  bool isIdf{false};
783 
784  this->checkModel("getRandomNumberGenerationSeed", isHdp, isIdf);
785  this->checkTrained("getRandomNumberGenerationSeed");
786 
787  return this->seed;
788  }
789 
791 
799  inline std::string_view TopicModel::getModelName() const {
800  bool isHdp{false};
801  bool isIdf{false};
802 
803  this->checkModel("getModelName", isHdp, isIdf);
804 
805  if(isHdp) {
806  return hdpModelName;
807  }
808 
809  return ldaModelName;
810  }
811 
813 
821  inline std::string_view TopicModel::getTermWeighting() const {
822  bool isHdp{false};
823  bool isIdf{false};
824 
825  this->checkModel("getTermWeighting", isHdp, isIdf);
826 
827  return TopicModel::termWeightToString(isIdf);
828  }
829 
831 
842  inline std::size_t TopicModel::getDocumentId(const std::string& name) const {
843  bool isHdp{false};
844  bool isIdf{false};
845 
846  this->checkModel("getDocumentId", isHdp, isIdf);
847 
848  std::size_t id{};
849 
850  //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
851  DATA_TOPICMODEL_RETRIEVE(id, isHdp, isIdf, getDocIdByUid, name);
852 
853  if(id == std::numeric_limits<std::size_t>::max()) {
854  throw Exception(
855  "getDocumentId():"
856  " No document named '"
857  + name
858  + "' has been added to the model"
859  );
860  }
861 
862  return id;
863  }
864 
866 
875  inline std::vector<std::string> TopicModel::getRemovedTokens() const {
876  bool isHdp{false};
877  bool isIdf{false};
878 
879  this->checkModel("getRemovedTokens", isHdp, isIdf);
880  this->checkTrained("getRemovedTokens");
881 
882  const auto& dict{
883  this->getDict(isHdp, isIdf)
884  };
885  const auto& size{dict.size()};
886  std::vector<std::string> removed;
887 
888  for(auto tokendIndex{size - this->removeTopNTokens}; tokendIndex < size; ++tokendIndex) {
889  removed.emplace_back(dict.toWord(tokendIndex));
890  }
891 
892  return removed;
893  }
894 
896 
906  inline std::size_t TopicModel::getNumberOfTopics() const {
907  bool isHdp{false};
908  bool isIdf{false};
909 
910  this->checkModel("getNumberOfTopics", isHdp, isIdf);
911  this->checkTrained("getNumberOfTopics");
912 
913  if(isHdp) {
914  return this->getLiveK(isIdf);
915  }
916 
917  return this->fixedNumberOfTopics;
918  }
919 
921 
931  inline std::vector<std::size_t> TopicModel::getTopics() const {
932  bool isHdp{false};
933  bool isIdf{false};
934 
935  this->checkModel("getTopics", isHdp, isIdf);
936  this->checkTrained("getTopics");
937 
938  std::vector<std::size_t> topicIds;
939  std::size_t maxK{};
940 
941  if(isHdp) {
942  topicIds.reserve(this->getLiveK(isIdf));
943 
944  maxK = this->getK(true, isIdf);
945 
946  for(std::size_t k{}; k < maxK; ++k) {
947  if(this->isLiveTopic(isIdf, k)) {
948  topicIds.emplace_back(k);
949  }
950  }
951  }
952  else {
953  topicIds.reserve(this->fixedNumberOfTopics);
954 
955  for(std::size_t k{}; k < this->fixedNumberOfTopics; ++k) {
956  topicIds.emplace_back(k);
957  }
958  }
959 
960  return topicIds;
961  }
962 
964 
972  inline std::vector<std::pair<std::size_t, std::uint64_t>> TopicModel::getTopicsSorted() const {
973  bool isHdp{false};
974  bool isIdf{false};
975 
976  this->checkModel("getTopicsSorted", isHdp, isIdf);
977  this->checkTrained("getTopicsSorted");
978 
979  std::vector<std::pair<std::size_t, std::uint64_t>> topics;
980  std::vector<std::uint64_t> counts;
981 
982  topics.reserve(this->getK(isHdp, isIdf));
983 
984  //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
985  DATA_TOPICMODEL_RETRIEVE_NOARGS(counts, isHdp, isIdf, getCountByTopic);
986 
987  std::size_t topicIndex{};
988 
989  for(const auto count : counts) {
990  if(!isHdp || this->isLiveTopic(isIdf, topicIndex)) {
991  topics.emplace_back(topicIndex, count);
992  }
993 
994  ++topicIndex;
995  }
996 
997  std::sort(topics.begin(), topics.end(), [](const auto& a, const auto& b) {
998  return a.second > b.second;
999  });
1000 
1001  return topics;
1002  }
1003 
1005 
1013  bool isHdp{false};
1014  bool isIdf{false};
1015 
1016  this->checkModel("getLogLikelihoodPerToken", isHdp, isIdf);
1017  this->checkTrained("getLogLikelihoodPerToken");
1018 
1019  //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
1020  DATA_TOPICMODEL_RETURN(isHdp, isIdf, getLLPerWord);
1021  }
1022 
1024 
1031  inline double TopicModel::getTokenEntropy() const {
1032  bool isHdp{false};
1033  bool isIdf{false};
1034 
1035  this->checkModel("getTokenEntropy", isHdp, isIdf);
1036  this->checkTrained("getTokenEntropy");
1037 
1038  std::vector<std::uint64_t> vocabularyFrequencies;
1039  std::uint64_t vocabularyUsed{};
1040 
1041  // retrieve vocabulary frequencies
1042  //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
1043  DATA_TOPICMODEL_RETRIEVE_NOARGS(vocabularyFrequencies, isHdp, isIdf, getVocabCf);
1044  //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
1045  DATA_TOPICMODEL_RETRIEVE_NOARGS(vocabularyUsed, isHdp, isIdf, getV);
1046 
1047  // sum up for normalization
1048  const auto frequencySum{
1049  std::accumulate(
1050  vocabularyFrequencies.begin(),
1051  vocabularyFrequencies.begin() + vocabularyUsed,
1052  std::uint64_t{}
1053  )
1054  };
1055 
1056  std::vector<double> normalizedFrequencies;
1057 
1058  normalizedFrequencies.reserve(vocabularyUsed);
1059 
1060  for(
1061  auto it{vocabularyFrequencies.begin()};
1062  it < vocabularyFrequencies.begin() + vocabularyUsed;
1063  ++it
1064  ) {
1065  normalizedFrequencies.push_back(static_cast<double>(*it) / frequencySum);
1066  }
1067 
1068  return std::accumulate(
1069  normalizedFrequencies.begin(),
1070  normalizedFrequencies.end(),
1071  0.,
1072  [](double a, double b) {
1073  return a + b * std::log(b);
1074  }
1075  );
1076  }
1077 
1079 
1092  inline std::vector<std::pair<std::string, float>> TopicModel::getTopicTopNTokens(
1093  std::size_t topic,
1094  std::size_t n
1095  ) const {
1096  bool isHdp{false};
1097  bool isIdf{false};
1098 
1099  this->checkModel("getTopicTopNTokens", isHdp, isIdf);
1100  this->checkTrained("getTopicTopNTokens");
1101 
1102  std::vector<std::pair<tomoto::Vid, float>> tokenIds;
1103 
1104  //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
1105  DATA_TOPICMODEL_RETRIEVE(tokenIds, isHdp, isIdf, getWidsByTopicSorted, topic, n);
1106 
1107  std::vector<std::pair<std::string, float>> tokens;
1108 
1109  tokens.reserve(n);
1110 
1111  for(const auto& tokenId : tokenIds) {
1112  tokens.emplace_back(this->dictLookUp(tokenId.first), tokenId.second);
1113  }
1114 
1115  return tokens;
1116  }
1117 
1119 
1134  inline std::vector<std::pair<std::string, float>> TopicModel::getTopicTopNLabels(
1135  std::size_t topic,
1136  std::size_t n
1137  ) const {
1138  bool isHdp{false};
1139  bool isIdf{false};
1140 
1141  this->checkModel("getTopicTopNLabels", isHdp, isIdf);
1142  this->checkTrained("getTopicTopNLabels");
1143 
1144  if(n == 0) {
1145  return std::vector<std::pair<std::string, float>>{};
1146  }
1147 
1148  if(!(this->labeler)) {
1149  throw Exception(
1150  "getTopicTopNLabels():"
1151  " Topics have not been labeled"
1152  );
1153  }
1154 
1155  return this->labeler->getLabels(topic, n);
1156  }
1157 
1159 
1180  inline std::vector<std::pair<std::string, std::vector<float>>> TopicModel::getDocumentsTopics(
1181  std::unordered_set<std::string>& done
1182  ) const {
1183  bool isHdp{false};
1184  bool isIdf{false};
1185 
1186  this->checkModel("getDocumentsTopics", isHdp, isIdf);
1187  this->checkTrained("getDocumentsTopics");
1188 
1189  std::vector<std::pair<std::string, std::vector<float>>> results;
1190  const auto total{this->getNumberOfDocuments()};
1191 
1192  for(std::size_t docId{}; docId < total; ++docId) {
1193  const tomoto::DocumentBase * doc{nullptr};
1194 
1195  DATA_TOPICMODEL_RETRIEVE(doc, isHdp, isIdf, getDoc, docId);
1196 
1197  if(doc->docUid.empty()) {
1198  continue;
1199  }
1200 
1201  const auto inserted{done.insert(doc->docUid)};
1202 
1203  if(inserted.second) {
1204  results.emplace_back(
1205  doc->docUid,
1206  this->getInferredTopics(isHdp, isIdf, doc)
1207  );
1208 
1209  // remove last results if all values are NaN
1210  TopicModel::validateLastResults(results, done, inserted.first);
1211  }
1212  }
1213 
1214  return results;
1215  }
1216 
1218 
1239  inline std::vector<std::vector<float>> TopicModel::getDocumentsTopics(
1240  const std::vector<std::vector<std::string>>& documents,
1241  std::size_t maxIterations,
1242  std::size_t numberOfWorkers
1243  ) const {
1244  bool isHdp{false};
1245  bool isIdf{false};
1246 
1247  this->checkModel("getDocumentsTopics", isHdp, isIdf);
1248  this->checkTrained("getDocumentsTopics");
1249 
1250  // create documents
1251  std::vector<std::unique_ptr<tomoto::DocumentBase>> docUPtrs(documents.size());
1252  std::size_t docIndex{};
1253 
1254  for(const auto& tokens : documents) {
1255  //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
1257  docUPtrs[docIndex],
1258  isHdp,
1259  isIdf,
1260  makeDoc,
1261  TopicModel::createDocument(
1262  "doc" + std::to_string(docIndex),
1263  tokens,
1264  0,
1265  tokens.size()
1266  )
1267  );
1268 
1269  if(!(docUPtrs[docIndex])) {
1270  throw Exception(
1271  "getDocumentsTopics():"
1272  " Could not create document 'doc"
1273  + std::to_string(docIndex)
1274  + "'"
1275  );
1276  }
1277 
1278  ++docIndex;
1279  }
1280 
1281  // get C-style pointers for underlying API
1282  std::vector<tomoto::DocumentBase *> docPtrs(documents.size(), nullptr);
1283 
1284  std::transform(docUPtrs.begin(), docUPtrs.end(), docPtrs.begin(), [](const auto& uPtr) {
1285  return uPtr.get();
1286  });
1287 
1288  // infer topic distributions for documents
1289  //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
1291  isHdp,
1292  isIdf,
1293  infer,
1294  docPtrs,
1295  maxIterations,
1296  -1.F, /* currently not used */
1297  numberOfWorkers,
1298  tomoto::ParallelScheme::default_,
1299  false
1300  );
1301 
1302  std::vector<std::vector<float>> results;
1303 
1304  results.reserve(documents.size());
1305 
1306  for(const auto * doc : docPtrs) {
1307  results.emplace_back(this->getInferredTopics(isHdp, isIdf, doc));
1308  }
1309 
1310  return results;
1311  }
1312 
1314 
1323  bool isHdp{false};
1324  bool isIdf{false};
1325 
1326  this->checkModel("getModelInfo", isHdp, isIdf);
1327  this->checkTrained("getModelInfo");
1328 
1329  TopicModelInfo information;
1330 
1331  information.modelName = this->getModelName();
1333  information.numberOfDocuments = this->getNumberOfDocuments();
1334  information.numberOfTokens = this->getNumberOfTokens();
1335  information.sizeOfVocabulary = this->getOriginalVocabularySize();
1336  information.sizeOfVocabularyUsed = this->getVocabularySize();
1337  information.tokenEntropy = this->getTokenEntropy();
1338  information.removedTokens = this->getRemovedTokens();
1339  information.numberOfIterations = this->getIterations();
1340  information.numberOfBurnInSteps = this->getBurnInIterations();
1342  information.logLikelihoodPerToken = this->getLogLikelihoodPerToken();
1343  information.weighting = this->getTermWeighting();
1344  information.minCollectionFrequency = this->minTokenCf;
1345  information.minDocumentFrequency = this->minTokenDf;
1346  information.numberOfTopTokensToBeRemoved = this->removeTopNTokens;
1347  information.initialAlpha = this->initialAlpha;
1348  information.initialEta = this->initialEta;
1349  information.seed = this->seed;
1350  information.trainedWithVersion = this->trainedWithVersion;
1351  information.numberOfTopics = this->getNumberOfTopics();
1352 
1353  //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
1354  DATA_TOPICMODEL_RETRIEVE_NOARGS(information.alpha, isHdp, isIdf, getAlpha);
1355  //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
1356  DATA_TOPICMODEL_RETRIEVE_NOARGS(information.eta, isHdp, isIdf, getEta);
1357 
1358  if(isHdp) {
1359  information.numberOfInitialTopics = this->numberOfInitialTopics;
1360  information.gamma = this->getGamma(isIdf);
1361  information.initialGamma = this->initialGamma;
1362  information.numberOfTables = this->getNumberOfTables(isIdf);
1363  }
1364  else {
1365  // get alpha for each topic (LDA only)
1366  information.alphas.reserve(information.numberOfTopics);
1367 
1368  for(std::size_t topic{}; topic < information.numberOfTopics; ++topic) {
1369  if(isIdf) {
1370  information.alphas.push_back(this->ldaModelIdf->getAlpha(topic));
1371  }
1372  else {
1373  information.alphas.push_back(this->ldaModel->getAlpha(topic));
1374  }
1375  }
1376  }
1377 
1378  return information;
1379  }
1380 
1381  /*
1382  * SETTERS
1383  */
1384 
1386 
1394  inline void TopicModel::setFixedNumberOfTopics(std::size_t k) {
1395  this->checkNoModel(
1396  "setFixedNumberOfTopics",
1397  "Fixed number of topics cannot be set"
1398  );
1399 
1400  this->fixedNumberOfTopics = k;
1401  }
1402 
1404 
1411  inline void TopicModel::setUseIdf(bool idf) {
1412  this->checkNoModel(
1413  "setUseIdf",
1414  "Term weighting cannot be set to IDF"
1415  );
1416 
1417  this->isUseIdf = idf;
1418  }
1419 
1421 
1428  inline void TopicModel::setBurnInIteration(std::size_t skipIterations) {
1429  bool isHdp{false};
1430  bool isIdf{false};
1431 
1432  this->initModel(isHdp, isIdf);
1433  this->checkNotTrained(
1434  "setBurnInIteration",
1435  "Iterations cannot be burned"
1436  );
1437 
1438  //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
1439  DATA_TOPICMODEL_CALL(isHdp, isIdf, setBurnInIteration, skipIterations);
1440  }
1441 
1443 
1461  std::size_t collectionFrequency,
1462  std::size_t documentFrequency,
1463  std::size_t fixedNumberOfTopTokens
1464  ) {
1465  this->checkNotTrained(
1466  "setTokenRemoval",
1467  "Stopword settings cannot be changed"
1468  );
1469 
1470  this->minTokenCf = collectionFrequency;
1471  this->minTokenDf = documentFrequency;
1472  this->removeTopNTokens = fixedNumberOfTopTokens;
1473  }
1474 
1476 
1496  std::size_t initialTopics,
1497  float alpha,
1498  float eta,
1499  float gamma
1500  ) {
1501  this->checkNoModel(
1502  "setInitialParameters",
1503  "Cannot set initial parameters"
1504  );
1505 
1506  this->numberOfInitialTopics = initialTopics;
1507  this->initialAlpha = alpha;
1508  this->initialEta = eta;
1509  this->initialGamma = gamma;
1510  }
1511 
1513 
1521  inline void TopicModel::setParameterOptimizationInterval(std::size_t interval) {
1522  this->checkNoModel(
1523  "setParameterOptimizationInterval",
1524  "Cannot set parameter optimization interval"
1525  );
1526 
1527  this->optimizationInterval = interval;
1528  }
1529 
1531 
1538  inline void TopicModel::setRandomNumberGenerationSeed(std::size_t newSeed) {
1539  this->checkNoModel(
1540  "setRandomNumberGenerationSeed",
1541  "Cannot set seed for random number generation"
1542  );
1543 
1544  this->seed = newSeed;
1545  }
1546 
1548 
1583  bool activate,
1584  std::size_t minCf,
1585  std::size_t minDf,
1586  std::size_t minLength,
1587  std::size_t maxLength,
1588  std::size_t maxCandidates,
1589  float smoothing,
1590  float mu,
1591  std::uint64_t windowSize
1592  ) {
1593  this->isLabeling = activate;
1594  this->labelingMinCf = minCf;
1595  this->labelingMinDf = minDf;
1596  this->labelingMinLength = minLength;
1597  this->labelingMaxLength = maxLength;
1598  this->labelingMaxCandidates = maxCandidates;
1599  this->labelingSmoothing = smoothing;
1600  this->labelingMu = mu;
1601  this->labelingWindowSize = windowSize;
1602 
1603  // re-label if necessary
1604  if(this->labeler) {
1605  this->label(this->workersUsed);
1606  }
1607  }
1608 
1609  /*
1610  * TOPIC MODELLING
1611  */
1612 
1614 
1635  const std::string& name,
1636  const std::vector<std::string>& tokens,
1637  std::size_t firstToken,
1638  std::size_t numTokens
1639  ) {
1640  bool isHdp{false};
1641  bool isIdf{false};
1642 
1643  this->initModel(isHdp, isIdf);
1644  this->checkNotTrained(
1645  "addDocument",
1646  "Documents cannot be added"
1647  );
1648 
1649  // add name
1650  this->docNames.emplace_back(name);
1651 
1652  //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
1654  isHdp,
1655  isIdf,
1656  addDoc,
1657  TopicModel::createDocument(
1658  this->docNames.back(),
1659  tokens,
1660  firstToken,
1661  numTokens
1662  )
1663  );
1664 
1665  if(!(this->hasDocs)) {
1666  this->hasDocs = numTokens > 0;
1667  }
1668  }
1669 
1671 
1681  bool isHdp{false};
1682  bool isIdf{false};
1683 
1684  this->checkModel("startTraining", isHdp, isIdf);
1685  this->prepareModel(isHdp, isIdf);
1686  this->trainModel(isHdp, isIdf, 0, 1);
1687 
1688  this->trainedWithVersion = Helper::Versions::getTomotoVersion();
1689  }
1690 
1692 
1707  inline void TopicModel::train(
1708  std::size_t iterations,
1709  std::size_t threads
1710  ) {
1711  bool isHdp{false};
1712  bool isIdf{false};
1713 
1714  this->checkModel("train", isHdp, isIdf);
1715  this->prepareModel(isHdp, isIdf);
1716  this->trainModel(isHdp, isIdf, iterations, threads);
1717  }
1718 
1720 
1736  inline void TopicModel::label(std::size_t threads) {
1737  if(!(this->isLabeling)) {
1738  this->labeler.reset();
1739 
1740  return;
1741  }
1742 
1743  bool isHdp{false};
1744  bool isIdf{false};
1745 
1746  this->checkModel("label", isHdp, isIdf);
1747  this->checkTrained("label");
1748 
1749  this->workersUsed = threads;
1750 
1751  // extract topic label candidates
1752  PMIExtractor extractor(
1753  this->labelingMinCf,
1754  this->labelingMinDf,
1755  this->labelingMinLength,
1756  this->labelingMaxLength,
1757  this->labelingMaxCandidates
1758  );
1759 
1760  const auto * interfacePtr{
1761  static_cast<const ITopicModel *>(this->get(isHdp, isIdf))
1762  };
1763 
1764  auto labelCandidates{extractor.extract(interfacePtr)};
1765 
1766  // create labeler
1767  constexpr auto LAMBDA{0.2F};
1768 
1769  this->labeler = std::make_unique<FoRelevance>(
1770  interfacePtr,
1771  labelCandidates.begin(),
1772  labelCandidates.end(),
1773  this->labelingMinDf,
1774  this->labelingSmoothing,
1775  LAMBDA, /* not used yet */
1776  this->labelingMu,
1777  this->labelingWindowSize == 0 ?
1778  std::numeric_limits<std::size_t>::max()
1779  : this->labelingWindowSize,
1780  threads
1781  );
1782  }
1783 
1784  /*
1785  * LOAD AND SAVE
1786  */
1787 
1789 
1804  inline size_t TopicModel::load(const std::string& fileName) {
1805  this->clear(false);
1806 
1807  bool isHdp{false};
1808  bool isIdf{false};
1809 
1810  // open the file
1811  std::ifstream in(fileName.c_str(), std::ios::binary);
1812 
1813  if(!in.is_open()) {
1814  throw Exception(
1815  "TopicModel::load():"
1816  " Could not read from '"
1817  + fileName
1818  + "'"
1819  );
1820  }
1821 
1822  // read the file head (= model type)
1823  TopicModel::readModelFileHead(in, fileName);
1824 
1825  // read and set the term weighting scheme
1826  TopicModel::readModelFileTermWeighting(in, fileName, isIdf);
1827 
1828  this->setUseIdf(isIdf);
1829 
1830  // read the file type
1831  TopicModel::readModelFileType(in, fileName);
1832 
1833  // return to the beginning of the file
1834  TopicModel::resetStream(in);
1835 
1836  // initialize and load the model
1837  std::vector<uint8_t> data;
1838 
1839  this->initModel(isHdp, isIdf);
1840 
1841  try {
1842  //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
1844  isHdp,
1845  isIdf,
1846  loadModel,
1847  in,
1848  &data
1849  );
1850  }
1851  catch(...) {
1852  // if loading of the model failed, clear it and try another algorithm
1853  this->clear(false);
1854 
1855  if(isHdp) { /* if the algorithm was set to HDP, set it to LDA */
1856  this->fixedNumberOfTopics = defaultNumberOfInitialTopics;
1857  }
1858 
1859  // return to the beginning of the file
1860  TopicModel::resetStream(in);
1861 
1862  // initialize and load the model
1863  this->initModel(isHdp, isIdf);
1864 
1865  //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
1867  isHdp,
1868  isIdf,
1869  loadModel,
1870  in,
1871  &data
1872  );
1873  }
1874 
1875  // get number of bytes (best guess)
1876  const auto bytesRead{in.tellg()};
1877 
1878  // close the file
1879  in.close();
1880 
1881  // retrieve additional information about the loaded model
1882  this->loadModelInformation(isHdp, isIdf, data);
1883 
1884  return bytesRead;
1885  }
1886 
1888 
1904  inline std::size_t TopicModel::save(const std::string& fileName, bool full) const {
1905  bool isHdp{false};
1906  bool isIdf{false};
1907 
1908  this->checkModel("save", isHdp, isIdf);
1909  this->checkTrained("save");
1910 
1911  // open file to write model to
1912  std::ofstream out(fileName.c_str(), std::ios::binary);
1913 
1914  if(!out.is_open()) {
1915  throw Exception(
1916  "TopicModel::save():"
1917  " Could not write to '"
1918  + fileName
1919  + "'"
1920  );
1921  }
1922 
1923  // add additional information to the saved model
1924  std::vector<uint8_t> data;
1925 
1926  this->writeModelInformation(isHdp, isIdf, data);
1927 
1928  // write model to file
1929  //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
1930  DATA_TOPICMODEL_CALL(isHdp, isIdf, saveModel, out, full, &data);
1931 
1932  // get number of written bytes (best guess)
1933  const auto bytesWritten{out.tellp()};
1934 
1935  // close file
1936  out.close();
1937 
1938  return bytesWritten;
1939  }
1940 
1941  /*
1942  * CLEANUP
1943  */
1944 
1946 
1949  inline void TopicModel::clear(bool labelingOptions) {
1950  this->hdpModel.reset();
1951  this->hdpModelIdf.reset();
1952  this->ldaModel.reset();
1953  this->ldaModelIdf.reset();
1954 
1955  Helper::Memory::free(this->docNames);
1956 
1957  this->hasDocs = false;
1958  this->isPrepared = false;
1959 
1960  this->fixedNumberOfTopics = 0;
1961  this->numberOfInitialTopics = defaultNumberOfInitialTopics;
1962  this->initialAlpha = defaultAlpha;
1963  this->initialEta = defaultEta;
1964  this->initialGamma = defaultGamma;
1965  this->seed = std::random_device{}();
1966  this->minTokenCf = 0;
1967  this->minTokenDf = 0;
1968  this->removeTopNTokens = 0;
1969  this->optimizationInterval = defaultOptimizationInterval;
1970 
1971  this->trainedWithVersion.clear();
1972 
1973  this->labeler.reset();
1974 
1975  if(labelingOptions) {
1976  this->isLabeling = false;
1977  this->labelingMinCf = 0;
1978  this->labelingMinDf = 0;
1979  this->labelingMinLength = 0;
1980  this->labelingMaxLength = 0;
1981  this->labelingMaxCandidates = 0;
1982  this->labelingSmoothing = 0.F;
1983  this->labelingMu = 0.F;
1984  this->labelingWindowSize = 0;
1985  }
1986  }
1987 
1988  /*
1989  * INTERNAL HELPER FUNCTIONS (private)
1990  */
1991 
1992  // initialize model
1993  inline void TopicModel::initModel(bool& isHdpTo, bool& isIdfTo) {
1994  if(
1995  !(this->hdpModel)
1996  && !(this->hdpModelIdf)
1997  && !(this->ldaModel)
1998  && !(this->ldaModelIdf)
1999  ) {
2000  if(this->fixedNumberOfTopics == 0) {
2001  if(this->isUseIdf) {
2002  this->hdpModelIdf = std::make_unique<HDPModelIDF>(
2003  this->numberOfInitialTopics,
2004  this->initialAlpha,
2005  this->initialEta,
2006  this->initialGamma,
2007  this->seed
2008  );
2009  }
2010  else {
2011  this->hdpModel = std::make_unique<HDPModel>(
2012  this->numberOfInitialTopics,
2013  this->initialAlpha,
2014  this->initialEta,
2015  this->initialGamma,
2016  this->seed
2017  );
2018  }
2019  }
2020  else if(this->isUseIdf) {
2021  this->ldaModelIdf = std::make_unique<LDAModelIDF>(
2022  this->fixedNumberOfTopics,
2023  this->initialAlpha,
2024  this->initialEta,
2025  this->seed
2026  );
2027  }
2028  else {
2029  this->ldaModel = std::make_unique<LDAModel>(
2030  this->fixedNumberOfTopics,
2031  this->initialAlpha,
2032  this->initialEta,
2033  this->seed
2034  );
2035  }
2036  }
2037 
2038  if(this->hdpModel) {
2039  isHdpTo = true;
2040  isIdfTo = false;
2041  }
2042  else if(this->hdpModelIdf) {
2043  isHdpTo = true;
2044  isIdfTo = true;
2045  }
2046  else if(this->ldaModel) {
2047  isHdpTo = false;
2048  isIdfTo = false;
2049  }
2050  else if(this->ldaModelIdf){
2051  isHdpTo = false;
2052  isIdfTo = true;
2053  }
2054  else {
2055  throw Exception(
2056  "TopicModel::initModel():"
2057  " No model has been loaded."
2058  );
2059  }
2060  }
2061 
2062  // look up token ID in dictionary
2063  inline std::string TopicModel::dictLookUp(tomoto::Vid tokenId) const {
2064  bool isHdp{false};
2065  bool isIdf{false};
2066 
2067  this->checkModel("dictLookUp", isHdp, isIdf);
2068  this->checkTrained("dictLookUp");
2069 
2070  std::string result;
2071 
2072  return this->getDict(isHdp, isIdf).toWord(tokenId);
2073  }
2074 
2075  // check model
2076  inline void TopicModel::checkModel(
2077  const std::string& function,
2078  bool& isHdpTo,
2079  bool& isIdfTo
2080  ) const {
2081  if(this->hasDocs) {
2082  if(this->hdpModel) {
2083  isHdpTo = true;
2084  isIdfTo = false;
2085 
2086  return;
2087  }
2088 
2089  if(this->hdpModelIdf) {
2090  isHdpTo = true;
2091  isIdfTo = true;
2092 
2093  return;
2094  }
2095 
2096  if(this->ldaModel) {
2097  isHdpTo = false;
2098  isIdfTo = false;
2099 
2100  return;
2101  }
2102 
2103  if(this->ldaModelIdf) {
2104  isHdpTo = false;
2105  isIdfTo = true;
2106 
2107  return;
2108  }
2109  }
2110 
2111  throw Exception(
2112  "TopicModel::"
2113  + function
2114  + "(): No documents have been added"
2115  " or the model has already been cleared"
2116  );
2117  }
2118 
2119  // check whether model has not been initialized
2120  inline void TopicModel::checkNoModel(
2121  const std::string& function,
2122  const std::string& errorMsg
2123  ) const {
2124  if(
2125  this->hdpModel
2126  || this->hdpModelIdf
2127  || this->ldaModel
2128  || this->ldaModelIdf
2129  ) {
2130  throw Exception(
2131  "TopicModel::"
2132  + function
2133  + "(): "
2134  + errorMsg
2135  + " after the model has been initialized"
2136  );
2137  }
2138  }
2139 
2140  // check whether training has been started
2141  inline void TopicModel::checkTrained(const std::string& function) const {
2142  if(!(this->isPrepared)) {
2143  throw Exception(
2144  "TopicModel::"
2145  + function
2146  + "(): The model has not yet been trained"
2147  );
2148  }
2149  }
2150 
2151  // check whether training has not yet been started
2152  inline void TopicModel::checkNotTrained(
2153  const std::string& function,
2154  const std::string& errorMsg
2155  ) const {
2156  if(this->isPrepared) {
2157  throw Exception(
2158  "TopicModel::"
2159  + function
2160  + "(): "
2161  + errorMsg
2162  + " after the model has already been trained"
2163  );
2164  }
2165  }
2166 
2167  // get dictionary (without further checking)
2168  inline const tomoto::Dictionary& TopicModel::getDict(bool isHdp, bool isIdf) const {
2169  //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
2170  DATA_TOPICMODEL_RETURN(isHdp, isIdf, getVocabDict);
2171  }
2172 
2173  // get number of topics (without further checking)
2174  inline std::size_t TopicModel::getLiveK(bool isIdf) const {
2175  if(isIdf) {
2176  return this->hdpModelIdf->getLiveK();
2177  }
2178 
2179  return this->hdpModel->getLiveK();
2180  }
2181 
2182  // get number of topics (without further checking)
2183  inline std::size_t TopicModel::getK(bool isHdp, bool isIdf) const {
2184  if(!isHdp) {
2185  return this->fixedNumberOfTopics;
2186  }
2187 
2188  //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
2189  DATA_TOPICMODEL_RETURN(isHdp, isIdf, getK);
2190  }
2191 
2192  // check whether topic is alive (without additional checking)
2193  inline bool TopicModel::isLiveTopic(bool isIdf, std::size_t topic) const {
2194  if(isIdf) {
2195  return this->hdpModelIdf->isLiveTopic(topic);
2196  }
2197 
2198  return this->hdpModel->isLiveTopic(topic);
2199  }
2200 
2201  // get concentration coefficient of the Dirichlet Process for table-topic (without further checking)
2202  inline float TopicModel::getGamma(bool isIdf) const {
2203  if(isIdf) {
2204  return this->hdpModelIdf->getGamma();
2205  }
2206 
2207  return this->hdpModel->getGamma();
2208  }
2209 
2210  // get number of tables in the LDP model
2211  inline std::size_t TopicModel::getNumberOfTables(bool isIdf) const {
2212  if(isIdf) {
2213  return this->hdpModelIdf->getTotalTables();
2214  }
2215 
2216  return this->hdpModel->getTotalTables();
2217  }
2218 
2219  // prepare model (without further checking)
2220  inline void TopicModel::prepareModel(bool isHdp, bool isIdf) {
2221  if(!(this->isPrepared)) {
2222  //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
2224  isHdp,
2225  isIdf,
2226  prepare,
2227  true,
2228  this->minTokenCf,
2229  this->minTokenDf,
2230  this->removeTopNTokens
2231  );
2232 
2233  this->isPrepared = true;
2234  }
2235  }
2236 
2237  // train model (without further checking)
2238  inline void TopicModel::trainModel(bool isHdp, bool isIdf, std::size_t iterations, std::size_t threads) {
2239  //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
2241  isHdp,
2242  isIdf,
2243  train,
2244  iterations,
2245  threads,
2246  tomoto::ParallelScheme::default_
2247  );
2248  }
2249 
2250  // load model information after reading model from file
2251  inline void TopicModel::loadModelInformation(
2252  bool isHdp,
2253  bool isIdf,
2254  const std::vector<std::uint8_t>& data
2255  ) {
2256  // get model information from a dictionary generated by reading Python pickle data
2257  PickleDict dict(data);
2258 
2259  TopicModel::numberFromDict(dict, "min_cf", this->minTokenCf);
2260  TopicModel::numberFromDict(dict, "min_df", this->minTokenDf);
2261  TopicModel::numberFromDict(dict, "rm_top", this->removeTopNTokens);
2262  TopicModel::numberFromDict(dict, "initial_k", this->numberOfInitialTopics); /* HDP only*/
2263  TopicModel::numberFromDict(dict, "k", this->fixedNumberOfTopics); /* LDA only */
2264  TopicModel::numberFromDict(dict, "seed", this->seed);
2265 
2266  TopicModel::floatFromDict(dict, "alpha", this->initialAlpha);
2267  TopicModel::floatFromDict(dict, "eta", this->initialEta);
2268  TopicModel::floatFromDict(dict, "gamma", this->initialGamma); /* HDP only */
2269 
2270  TopicModel::stringFromDict(dict, "version", this->trainedWithVersion);
2271 
2272  // check whether model has been trained
2273  std::size_t iterations{};
2274 
2275  //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
2276  DATA_TOPICMODEL_RETRIEVE_NOARGS(iterations, isHdp, isIdf, getGlobalStep);
2277 
2278  this->hasDocs = true;
2279 
2280  if(iterations > 0) {
2281  this->isPrepared = true;
2282 
2283  this->startTraining();
2284  }
2285  }
2286 
2287  // write model information for writing the module to file
2288  inline void TopicModel::writeModelInformation(
2289  bool isHdp,
2290  bool isIdf,
2291  std::vector<std::uint8_t>& dataTo
2292  ) const {
2293  // fill dictionary with model information
2294  PickleDict dict;
2295 
2296  dict.setNumber(
2297  "tw",
2298  static_cast<std::int64_t>(
2299  isIdf ? tomoto::TermWeight::idf : tomoto::TermWeight::one
2300  )
2301  );
2302 
2303  dict.setNumber("min_cf", this->minTokenCf);
2304  dict.setNumber("min_df", this->minTokenDf);
2305  dict.setNumber("rm_top", this->removeTopNTokens);
2306 
2307  if(isHdp) {
2308  dict.setNumber("initial_k", this->numberOfInitialTopics);
2309  }
2310  else {
2311  dict.setNumber("k", this->fixedNumberOfTopics);
2312  }
2313 
2314  dict.setNumber("seed", this->seed);
2315 
2316  dict.setFloat("alpha", this->initialAlpha);
2317  dict.setFloat("eta", this->initialEta);
2318 
2319  if(isHdp) {
2320  dict.setFloat("gamma", this->initialGamma);
2321  }
2322 
2323  dict.setString("version", this->trainedWithVersion);
2324 
2325  // write dictionary as Python pickle data
2326  dict.writeTo(dataTo);
2327  }
2328 
2329  // get inferred topics from document pointer
2330  inline std::vector<float> TopicModel::getInferredTopics(
2331  bool isHdp,
2332  bool isIdf,
2333  const tomoto::DocumentBase * doc
2334  ) const {
2335  if(isHdp) {
2336  std::vector<float> topics;
2337 
2338  if(isIdf) {
2339  return TopicModel::removeDeadTopics(
2340  this->hdpModelIdf->getTopicsByDoc(
2341  *dynamic_cast<const tomoto::DocumentHDP<tomoto::TermWeight::idf> *>(doc)
2342  ),
2343  this->hdpModelIdf
2344  );
2345  }
2346 
2347  return TopicModel::removeDeadTopics(
2348  this->hdpModel->getTopicsByDoc(
2349  *dynamic_cast<const tomoto::DocumentHDP<tomoto::TermWeight::one> *>(doc)
2350  ),
2351  this->hdpModel
2352  );
2353  }
2354 
2355  if(isIdf) {
2356  return this->ldaModelIdf->getTopicsByDoc(
2357  *dynamic_cast<const tomoto::DocumentLDA<tomoto::TermWeight::idf> *>(doc)
2358  );
2359  }
2360 
2361  return this->ldaModel->getTopicsByDoc(
2362  *dynamic_cast<const tomoto::DocumentLDA<tomoto::TermWeight::one> *>(doc)
2363  );
2364  }
2365 
2366  // get const pointer to the model used
2367  inline const void * TopicModel::get(bool isHdp, bool isIdf) const {
2368  if(isHdp) {
2369  if(isIdf) {
2370  return this->hdpModelIdf.get();
2371  }
2372 
2373  return this->hdpModel.get();
2374  }
2375 
2376  if(isIdf) {
2377  return this->ldaModelIdf.get();
2378  }
2379 
2380  return this->ldaModel.get();
2381  }
2382 
2383  // create document for underlying API
2384  inline tomoto::RawDoc TopicModel::createDocument(
2385  const std::string& name,
2386  const std::vector<std::string>& tokens,
2387  std::size_t firstToken,
2388  std::size_t numTokens
2389  ) {
2390  tomoto::RawDoc doc;
2391  const auto documentEnd{firstToken + numTokens};
2392 
2393  doc.rawWords.reserve(numTokens);
2394 
2395  for(std::size_t tokenIndex{firstToken}; tokenIndex < documentEnd; ++tokenIndex) {
2396  doc.rawWords.emplace_back(tokens.at(tokenIndex));
2397  }
2398 
2399  // share document name
2400  doc.docUid = tomoto::SharedString(name);
2401 
2402  return doc;
2403  }
2404 
2405  // check first bytes of the topic model file (indicating the type of the model)
2406  inline void TopicModel::readModelFileHead(std::istream& in, const std::string& fileName) {
2407  std::array<char, modelFileHead.size()> headBytes{};
2408 
2409  in.read(headBytes.data(), modelFileHead.size());
2410 
2411  if(!TopicModel::bytesEqual(headBytes, modelFileHead)) {
2412  throw Exception(
2413  "TopicModel::load():"
2414  " Invalid model file or unsupported model type in '"
2415  + fileName
2416  + "' (first bytes do not match tomoto's LDA model format: '"
2417  + TopicModel::bytesToString(headBytes)
2418  + "')"
2419  );
2420  }
2421  }
2422 
2423  // check and read term weighting scheme from topic model file
2424  inline void TopicModel::readModelFileTermWeighting(std::istream& in, const std::string& fileName, bool& isIdfTo) {
2425  std::array<char, modelFileTermWeightingLen> twBytes{};
2426 
2427  in.read(twBytes.data(), modelFileTermWeightingLen);
2428 
2429  if(TopicModel::bytesEqual(twBytes, modelFileTermWeightingOne)) {
2430  isIdfTo = false;
2431  }
2432  else if(TopicModel::bytesEqual(twBytes, modelFileTermWeightingIdf)) {
2433  isIdfTo = true;
2434  }
2435  else {
2436  throw Exception(
2437  "TopicModel::load():"
2438  " Invalid model file or unsupported term weighting scheme in '"
2439  + fileName
2440  + "' (term weighting scheme does not match 'one' or 'idf':"
2441  + TopicModel::bytesToString(twBytes)
2442  + "')"
2443  );
2444  }
2445  }
2446 
2447  // check file type of topic model file
2448  inline void TopicModel::readModelFileType(std::istream& in, const std::string& fileName) {
2449  std::array<char, modelFileType.size()> typeBytes{};
2450 
2451  in.read(typeBytes.data(), modelFileType.size());
2452 
2453  if(!TopicModel::bytesEqual(typeBytes, modelFileType)) {
2454  throw Exception(
2455  "TopicModel::load():"
2456  " Invalid model file '"
2457  + fileName
2458  + "' (type does not match tomoto's model format: '"
2459  + TopicModel::bytesToString(typeBytes)
2460  + "')"
2461  );
2462  }
2463  }
2464 
2465  // reset an input stream and go back to its start
2466  inline void TopicModel::resetStream(std::istream& in) {
2467  in.clear();
2468  in.seekg(0, std::ios_base::beg);
2469  }
2470 
2471  // get number from Pickle dictionary, if available
2472  inline void TopicModel::numberFromDict(const PickleDict& dict, const std::string& key, std::size_t& valueTo) {
2473  const auto entry{
2474  dict.getNumber(key)
2475  };
2476 
2477  if(entry) {
2478  valueTo = static_cast<std::size_t>(*entry);
2479  }
2480  else {
2481  valueTo = 0;
2482  }
2483  }
2484 
2485  // get floating-point number from Pickle dictionary, if available
2486  inline void TopicModel::floatFromDict(const PickleDict& dict, const std::string& key, float& valueTo) {
2487  const auto entry{
2488  dict.getFloat(key)
2489  };
2490 
2491  if(entry) {
2492  valueTo = static_cast<float>(*entry);
2493  }
2494  else {
2495  valueTo = 0.F;
2496  }
2497  }
2498 
2499  // get string from Pickle dictionary, if available
2500  inline void TopicModel::stringFromDict(const PickleDict& dict, const std::string& key, std::string& valueTo) {
2501  auto entry{
2502  dict.getString(key)
2503  };
2504 
2505  if(entry) {
2506  valueTo = std::move(*entry);
2507  }
2508  else {
2509  Helper::Memory::free(valueTo);
2510  }
2511  }
2512 
2513  // validate the results added last, remove them if all values are NaN
2514  inline void TopicModel::validateLastResults(
2515  std::vector<std::pair<std::string, std::vector<float>>>& results,
2516  std::unordered_set<std::string>& done,
2517  const std::unordered_set<std::string>::const_iterator& inserted
2518  ) {
2519  if(
2520  std::all_of(
2521  results.back().second.begin(),
2522  results.back().second.end(),
2523  [](const auto value) {
2524  return std::isnan(value);
2525  }
2526  )
2527  ) {
2528  results.pop_back();
2529  done.erase(inserted);
2530  }
2531  }
2532 
2533 } /* namespace crawlservpp::Data */
2534 
2535 #endif /* DATA_TOPICMODEL_HPP_ */
std::unordered_map< std::string, std::size_t > getDocuments() const
Gets a map with the documents and their indices from the model.
Definition: TopicModel.hpp:607
constexpr auto modelFileTermWeightingIdf
The term weighting scheme IDF (tf-idf) as saved in a model file.
Definition: TopicModel.hpp:216
std::size_t minCollectionFrequency
Minimum collection frequency of tokens.
Definition: TopicModelInfo.hpp:124
constexpr auto modelFileTermWeightingLen
The number of bytes determining the term weighting scheme in a model file.
Definition: TopicModel.hpp:210
std::size_t save(const std::string &fileName, bool full) const
Writes the model to a file.
Definition: TopicModel.hpp:1904
std::size_t getNumberOfTokens() const
Gets the number of tokens after training has begun.
Definition: TopicModel.hpp:700
void train(std::size_t iterations, std::size_t threads)
Trains the underlying HDP or LDA model.
Definition: TopicModel.hpp:1707
std::size_t numberOfBurnInSteps
The number of initially skipped, i.e. burn-in, steps.
Definition: TopicModelInfo.hpp:108
std::size_t numberOfTopics
The number of topics.
Definition: TopicModelInfo.hpp:171
double tokenEntropy
The entropy of tokens in the model.
Definition: TopicModelInfo.hpp:95
std::size_t sizeOfVocabulary
Definition: TopicModelInfo.hpp:89
std::vector< std::pair< std::string, float > > getTopicTopNLabels(std::size_t topic, std::size_t n) const
Gets the top N labels for the specified topic.
Definition: TopicModel.hpp:1134
std::optional< double > getFloat(const std::string &key) const
Gets a floating-point number from the dictionary, if available.
Definition: PickleDict.hpp:426
void setLabelingOptions(bool activate, std::size_t minCf, std::size_t minDf, std::size_t minLength, std::size_t maxLength, std::size_t maxCandidates, float smoothing, float mu, std::size_t windowSize)
Sets the options for automated topic labeling.
Definition: TopicModel.hpp:1582
void setFixedNumberOfTopics(std::size_t k)
Sets the fixed number of topics.
Definition: TopicModel.hpp:1394
Structure containing information about the currently trained Hierarchical Dirichlet Process (HDP) mod...
Definition: TopicModelInfo.hpp:72
TopicModelInfo getModelInfo() const
Gets information about the model after training.
Definition: TopicModel.hpp:1322
Simple Python pickle dictionary.
Definition: PickleDict.hpp:136
void setParameterOptimizationInterval(std::size_t interval)
Sets the interval for parameter optimization, in iterations.
Definition: TopicModel.hpp:1521
std::size_t numberOfTopTokensToBeRemoved
The number of top tokens to be removed.
Definition: TopicModelInfo.hpp:130
#define MAIN_EXCEPTION_CLASS()
Macro used to easily define classes for general exceptions.
Definition: Exception.hpp:50
std::size_t getIterations() const
Get the number of training iterations performed so far.
Definition: TopicModel.hpp:740
std::vector< std::size_t > getTopics() const
Gets the IDs of the topics.
Definition: TopicModel.hpp:931
std::size_t sizeOfVocabularyUsed
Definition: TopicModelInfo.hpp:92
constexpr auto modelFileHead
The beginning of a valid model file containing a LDA (or HDP) model.
Definition: TopicModel.hpp:207
void label(std::size_t threads)
Labels the resulting topics.
Definition: TopicModel.hpp:1736
void setInitialParameters(std::size_t initialTopics, float alpha, float eta, float gamma)
Sets the initial parameters for the model.
Definition: TopicModel.hpp:1495
void startTraining()
Starts training without performing any iteration.
Definition: TopicModel.hpp:1680
float initialAlpha
The initial concentration coefficient of the Dirichlet Process for document–table.
Definition: TopicModelInfo.hpp:136
void setNumber(const std::string &key, std::int64_t value)
Adds or overwrites a number in the dictionary.
Definition: PickleDict.hpp:467
std::size_t getDocumentId(const std::string &name) const
Gets the ID of the document with the specified name.
Definition: TopicModel.hpp:842
std::vector< std::string > removedTokens
The top tokens removed before training.
Definition: TopicModelInfo.hpp:98
void setFloat(const std::string &key, double value)
Adds or overwrites a floating-point number in the dictionary.
Definition: PickleDict.hpp:480
float alpha
The concentration coefficient of the Dirichlet Process for document-table (HDP only).
Definition: TopicModelInfo.hpp:155
std::size_t getNumberOfDocuments() const
Gets the number of added documents after training has begun.
Definition: TopicModel.hpp:587
std::size_t getBurnInIterations() const
Get the number of skipped iterations.
Definition: TopicModel.hpp:720
Class for topic modelling-specific exceptions.
Definition: TopicModel.hpp:376
std::string getTomotoVersion()
Gets the version of the tomoto library if available.
Definition: Versions.hpp:341
std::size_t numberOfTokens
The number of tokens in the model.
Definition: TopicModelInfo.hpp:86
std::size_t numberOfIterations
The number of iterations performed.
Definition: TopicModelInfo.hpp:105
static T::size_type bytes(const T &container)
Returns the number of bytes in an iterable container.
Definition: Container.hpp:144
std::size_t getParameterOptimizationInterval() const
Gets the interval for parameter optimization, in iterations.
Definition: TopicModel.hpp:760
void writeTo(Bytes &dataTo) const
Writes dictionary to Python pickle data.
Definition: PickleDict.hpp:565
std::optional< std::int64_t > getNumber(const std::string &key) const
Gets a number from the dictionary, if available.
Definition: PickleDict.hpp:406
constexpr auto defaultEta
The default hyperparameter for the Dirichlet distribution for topic-token.
Definition: TopicModel.hpp:194
std::size_t minDocumentFrequency
Minimum document frequency of tokens.
Definition: TopicModelInfo.hpp:127
double getLogLikelihoodPerToken() const
Gets the log-likelihood per token.
Definition: TopicModel.hpp:1012
void setTokenRemoval(std::size_t collectionFrequency, std::size_t documentFrequency, std::size_t fixedNumberOfTopTokens)
Sets which (un)common tokens to remove before training.
Definition: TopicModel.hpp:1460
constexpr auto defaultOptimizationInterval
The default interval for optimizing the parameters, in iterations.
Definition: TopicModel.hpp:204
constexpr auto ldaModelName
The name of the LDA model.
Definition: TopicModel.hpp:185
std::string modelName
The name of the model.
Definition: TopicModelInfo.hpp:77
void setRandomNumberGenerationSeed(std::size_t newSeed)
Sets the seed for random number generation.
Definition: TopicModel.hpp:1538
std::size_t numberOfTables
The number of tables.
Definition: TopicModelInfo.hpp:178
float initialEta
The initial hyperparameter for the Dirichlet distribution for topic–token.
Definition: TopicModelInfo.hpp:139
double getTokenEntropy() const
Gets the token entropy after training.
Definition: TopicModel.hpp:1031
std::string weighting
Term weighting mode as string.
Definition: TopicModelInfo.hpp:121
const std::vector< std::string > & getVocabulary() const
Gets the complete dictionary used by the model.
Definition: TopicModel.hpp:680
#define DATA_TOPICMODEL_RETURN(isHdp, isIdf, function)
Definition: TopicModel.hpp:155
std::optional< std::string > getString(const std::string &key) const
Gets a string from the dictionary, if available.
Definition: PickleDict.hpp:446
std::size_t getVocabularySize() const
Gets the number of distinct tokens after training has begun.
Definition: TopicModel.hpp:639
std::string_view getModelName() const
Gets the name of the current model.
Definition: TopicModel.hpp:799
constexpr auto defaultNumberOfInitialTopics
The initial number of topics by default.
Definition: TopicModel.hpp:188
constexpr auto defaultGamma
The default concentration coefficient of the Dirichlet Process for table-topic.
Definition: TopicModel.hpp:201
#define DATA_TOPICMODEL_RETRIEVE(x, isHdp, isIdf, function,...)
Definition: TopicModel.hpp:135
constexpr auto modelFileTermWeightingOne
The term weighting scheme ONE as saved in a model file.
Definition: TopicModel.hpp:213
std::vector< float > alphas
The Dirichlet priors on the per-document topic distributions (LDA only).
Definition: TopicModelInfo.hpp:158
std::vector< std::pair< std::string, std::vector< float > > > getDocumentsTopics(std::unordered_set< std::string > &done) const
Gets the topic distributions of all documents the model has been trained on, if available.
Definition: TopicModel.hpp:1180
std::size_t seed
The initial seed for random number generation.
Definition: TopicModelInfo.hpp:145
constexpr auto hdpModelName
The name of the HDP model.
Definition: TopicModel.hpp:182
std::size_t numberOfDocuments
The number of documents in the model.
Definition: TopicModelInfo.hpp:83
void setBurnInIteration(std::size_t skipIterations)
Sets the number of iterations that will be skipped at the beginning of training.
Definition: TopicModel.hpp:1428
std::size_t load(const std::string &fileName)
Loads a model from a file.
Definition: TopicModel.hpp:1804
std::string trainedWithVersion
The version of the modeller the model has been trained with.
Definition: TopicModelInfo.hpp:148
Topic modeller.
Definition: TopicModel.hpp:257
void clear(bool labelingOptions)
Clears the model, resets its settings and frees memory.
Definition: TopicModel.hpp:1949
float initialGamma
The initial concentration coefficient of the Dirichlet Process for table–topic.
Definition: TopicModelInfo.hpp:142
double logLikelihoodPerToken
The log-likelihood per token.
Definition: TopicModelInfo.hpp:114
constexpr auto modelFileType
The tomoto file format as saved in a model file (after model head and term weighting scheme)...
Definition: TopicModel.hpp:219
std::vector< std::pair< std::size_t, std::uint64_t > > getTopicsSorted() const
Gets the IDs and counts of the topics, sorted by count.
Definition: TopicModel.hpp:972
float eta
The Dirichlet prior on the per-topic token distribution (HDP only).
Definition: TopicModelInfo.hpp:161
constexpr auto defaultAlpha
The default concentration coefficient of the Dirichlet Process for document-table. ...
Definition: TopicModel.hpp:191
std::string modelVersion
The version of the model (as string).
Definition: TopicModelInfo.hpp:80
std::string_view getTermWeighting() const
Gets the term weighting mode of the current model.
Definition: TopicModel.hpp:821
void setUseIdf(bool idf)
Sets whether to use IDF term weighting.
Definition: TopicModel.hpp:1411
Namespace for different types of data.
#define DATA_TOPICMODEL_CALL(isHdp, isIdf, function,...)
Definition: TopicModel.hpp:96
std::vector< std::string > getRemovedTokens() const
Gets the most common tokens (i.e. stopwords) that have been removed.
Definition: TopicModel.hpp:875
float gamma
The concentration coefficient of the Dirichlet Process for table-topic.
Definition: TopicModelInfo.hpp:168
std::size_t getOriginalVocabularySize() const
Gets the number of distinct tokens before training.
Definition: TopicModel.hpp:659
static void free(T &target)
Frees memory by swapping.
Definition: Memory.hpp:42
std::size_t getNumberOfTopics() const
Gets the number of topics.
Definition: TopicModel.hpp:906
std::size_t getRandomNumberGenerationSeed() const
Gets the seed used for random number generation.
Definition: TopicModel.hpp:780
std::size_t numberOfInitialTopics
The initial number of topics, which will be adjusted for the data during training.
Definition: TopicModelInfo.hpp:133
#define DATA_TOPICMODEL_RETRIEVE_NOARGS(x, isHdp, isIdf, function)
Definition: TopicModel.hpp:116
std::size_t optimizationInterval
The optimization interval.
Definition: TopicModelInfo.hpp:111
std::vector< std::pair< std::string, float > > getTopicTopNTokens(std::size_t topic, std::size_t n) const
Gets the top N tokens for the specified topic.
Definition: TopicModel.hpp:1092
void addDocument(const std::string &name, const std::vector< std::string > &tokens, std::size_t firstToken, std::size_t numTokens)
Adds a document from a tokenized corpus.
Definition: TopicModel.hpp:1634
void setString(const std::string &key, const std::string &value)
Adds or overwrites a string in the dictionary.
Definition: PickleDict.hpp:493