crawlserv/crawlservpp/TopicModel_8hpp_source.html

 /*
  *
  * ---
  *
  *  Copyright (C) 2022 Anselm Schmidt (ans[ät]ohai.su)
  *
  *  This program is free software: you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
  *  the Free Software Foundation, either version 3 of the License, or
  *  (at your option) any later version in addition to the terms of any
  *  licences already herein identified.
  *
  *  This program is distributed in the hope that it will be useful,
  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  *  GNU General Public License for more details.
  *
  *  You should have received a copy of the GNU General Public License
  *  along with this program. If not, see <https://www.gnu.org/licenses/>.
  *
  * ---
  *
  * TopicModel.hpp
  *
  * Topic modeller using the Hierarchical Dirichlet Process (HDP) and
  *  Latent Dirichlet Allocation (LDA) algorithms.
  *
  * The former will be used if no fixed number of topics is given,
  *  the latter will be used if a fixed number of topics is given.
  *
  * Using tomoto, the underlying C++ API of tomotopy, see:
  *  https://bab2min.github.io/tomotopy/
  *
  * If you use the HDP topic modelling algorithm, please cite:
  *
  *  Teh, Y. W., Jordan, M. I., Beal, M. J., & Blei, D. M. (2005). Sharing
  *   clusters among related groups: Hierarchical Dirichlet processes.
  *   In Advances in neural information processing systems, 1385–1392.
  *
  *  Newman, D., Asuncion, A., Smyth, P., & Welling, M. (2009). Distributed
  *   algorithms for topic models. Journal of Machine Learning Research,
  *   10 (Aug), 1801–1828.
  *
  * If you use the LDA topic modelling algorithm, please cite:
  *
  *  Blei, D. M., Ng, A. Y., & Jordan, M. I. (2003). Latent Dirichlet
  *   allocation. Journal of Machine Learning Research, 3(Jan), 993–1022.
  *
  *  Newman, D., Asuncion, A., Smyth, P., & Welling, M. (2009). Distributed
  *   algorithms for topic models. Journal of Machine Learning Research,
  *   10 (Aug), 1801–1828.
  *
  * If you use automated topic labeling, please cite:
  *
  *  Mei, Q., Shen, X., & Zhai, C. (2007). Automatic labeling of multinomial
  *   topic models. In Proceedings of the 13th ACM SIGKDD International
  *   Conference on Knowledge Discovery and Data Mining, 490–499.
  *
  *  Created on: Feb 2, 2021
  *      Author: ans
  */

 #ifndef DATA_TOPICMODEL_HPP_
 #define DATA_TOPICMODEL_HPP_

 #include "PickleDict.hpp"

 #include "../Helper/FileSystem.hpp"
 #include "../Helper/Memory.hpp"
 #include "../Helper/SilentInclude/EigenRand.h"
 #include "../Helper/SilentInclude/tomoto.h"
 #include "../Helper/Versions.hpp"
 #include "../Main/Exception.hpp"
 #include "../Struct/TopicModelInfo.hpp"

 #include <algorithm>        // std::transform
 #include <array>            // std::array
 #include <cmath>            // std::log
 #include <cstdint>          // std::uint8_t, std::uint64_t
 #include <cstdlib>          // std::size_t
 #include <fstream>          // std::ifstream, std::ofstream
 #include <ios>              // std::ios
 #include <limits>           // std::numeric_limits
 #include <memory>           // std::make_unique, std::unique_ptr
 #include <numeric>          // std::accumulate
 #include <random>           // std::random_device
 #include <string>           // std::string, std::to_string
 #include <string_view>      // std::string_view, std::string_view_literals
 #include <unordered_map>    // std::unordered_map
 #include <unordered_set>    // std::unordered_set
 #include <utility>          // std::move, std::pair
 #include <vector>           // std::vector

 // macro for calling member functions of different kinds of (pre-compiled) models
 //
 // Invokes 'function' with the given arguments on exactly one of the members
 //  hdpModelIdf, hdpModel, ldaModelIdf or ldaModel, selected by 'isHdp' and 'isIdf'.
 //  Any value returned by 'function' is discarded.
 //
 // NOTE: The macro expands to a bare if/else statement (it is not wrapped into
 //  do { ... } while(0)), so it needs to be used as a statement of its own inside
 //  a braced scope. The selected pointer is dereferenced without a null check.
 //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
 #define DATA_TOPICMODEL_CALL(isHdp, isIdf, function, ...) \
     if(isHdp) { \
         if(isIdf) { \
             this->hdpModelIdf->function(__VA_ARGS__); \
         } \
         else { \
             this->hdpModel->function(__VA_ARGS__); \
         } \
     } \
     else { \
         if(isIdf) { \
             this->ldaModelIdf->function(__VA_ARGS__); \
         } \
         else { \
             this->ldaModel->function(__VA_ARGS__); \
         } \
     }

 // macros for retrieving a value from different kinds of (pre-compiled) models
 //
 // DATA_TOPICMODEL_RETRIEVE_NOARGS assigns the result of calling 'function' without
 //  arguments to 'x'. The call is made on exactly one of the members hdpModelIdf,
 //  hdpModel, ldaModelIdf or ldaModel, selected by 'isHdp' and 'isIdf'.
 //
 // NOTE: Expands to a bare if/else statement (no do { ... } while(0) wrapper), so it
 //  needs to be used as a statement of its own inside a braced scope. The selected
 //  pointer is dereferenced without a null check.
 //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
 #define DATA_TOPICMODEL_RETRIEVE_NOARGS(x, isHdp, isIdf, function) \
     if(isHdp) { \
         if(isIdf) { \
             (x) = this->hdpModelIdf->function(); \
         } \
         else { \
             (x) = this->hdpModel->function(); \
         } \
     } \
     else { \
         if(isIdf) { \
             (x) = this->ldaModelIdf->function(); \
         } \
         else { \
             (x) = this->ldaModel->function(); \
         } \
     }

 // DATA_TOPICMODEL_RETRIEVE assigns the result of calling 'function' with the given
 //  arguments to 'x'. The call is made on exactly one of the members hdpModelIdf,
 //  hdpModel, ldaModelIdf or ldaModel, selected by 'isHdp' and 'isIdf'.
 //
 // NOTE: The arguments appear in all four branches of the expansion, but only the
 //  selected branch is executed. Like the other macros, it expands to a bare if/else
 //  statement and needs to be used as a statement of its own inside a braced scope.
 //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
 #define DATA_TOPICMODEL_RETRIEVE(x, isHdp, isIdf, function, ...) \
     if(isHdp) { \
         if(isIdf) { \
             (x) = this->hdpModelIdf->function(__VA_ARGS__); \
         } \
         else { \
             (x) = this->hdpModel->function(__VA_ARGS__); \
         } \
     } \
     else { \
         if(isIdf) { \
             (x) = this->ldaModelIdf->function(__VA_ARGS__); \
         } \
         else { \
             (x) = this->ldaModel->function(__VA_ARGS__); \
         } \
     }

 // macro for returning a value from different kinds of (pre-compiled) models
 //
 // Returns the result of calling 'function' without arguments on exactly one of the
 //  members hdpModelIdf, hdpModel, ldaModelIdf or ldaModel, selected by 'isHdp' and
 //  'isIdf'.
 //
 // NOTE: Every path through the expansion ends in a 'return', i.e. no statement that
 //  follows the macro inside the same function will be reached. The expansion is a
 //  sequence of several statements, so it needs to be used inside a braced scope.
 //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
 #define DATA_TOPICMODEL_RETURN(isHdp, isIdf, function) \
     if(isHdp) { \
         if(isIdf) { \
             return this->hdpModelIdf->function(); \
         } \
         \
         return this->hdpModel->function(); \
     } \
     \
     if(isIdf) { \
         return this->ldaModelIdf->function(); \
     } \
     \
     return this->ldaModel->function();

 namespace crawlservpp::Data {

     using std::string_view_literals::operator""sv;

     /*
      * CONSTANTS
      */


     // The name of the HDP model, as returned by TopicModel::getModelName().
     inline constexpr auto hdpModelName{"HDPModel"sv};

     // The name of the LDA model, as returned by TopicModel::getModelName().
     inline constexpr auto ldaModelName{"LDAModel"sv};

     // The default number of initial topics (initial value of TopicModel::numberOfInitialTopics).
     inline constexpr auto defaultNumberOfInitialTopics{2};

     // The default value for the initial alpha (initial value of TopicModel::initialAlpha).
     inline constexpr auto defaultAlpha{0.1F};

     // The default value for the initial eta (initial value of TopicModel::initialEta).
     inline constexpr auto defaultEta{0.01F};


     // The default value for the initial gamma (initial value of TopicModel::initialGamma).
     inline constexpr auto defaultGamma{0.1F};

     // The default parameter optimization interval (initial value of TopicModel::optimizationInterval).
     inline constexpr auto defaultOptimizationInterval{10};

     // The head of a model file: "LDA" followed by two null bytes (five bytes in total,
     //  as the 'sv' literal keeps the embedded null bytes).
     // NOTE(review): presumably checked by TopicModel::readModelFileHead() — not visible here.
     inline constexpr auto modelFileHead{"LDA\0\0"sv};

     // The length of a term weighting tag; equals the length of the two tags defined below.
     inline constexpr auto modelFileTermWeightingLen{5};

     // The term weighting tag "one", padded with two null bytes to five bytes.
     inline constexpr auto modelFileTermWeightingOne{"one\0\0"sv};

     // The term weighting tag "idf", padded with two null bytes to five bytes.
     inline constexpr auto modelFileTermWeightingIdf{"idf\0\0"sv};

     // The type tag of a model file.
     // NOTE(review): presumably checked by TopicModel::readModelFileType() — not visible here.
     inline constexpr auto modelFileType{"TPTK"sv};


     /*
      * DECLARATION
      */


     /*
      * Topic modeller wrapping four pre-compiled tomoto models: HDP and LDA, each
      *  either without term weighting (TermWeight::one) or with IDF term weighting.
      *
      * According to the header of this file, HDP is used if no fixed number of
      *  topics is given, and LDA is used if a fixed number of topics is given.
      *
      * The member functions are implemented inline below the class declaration.
      */
     class TopicModel {
         // for convenience
         using TopicModelInfo = Struct::TopicModelInfo;

         // the four kinds of (pre-compiled) models
         using HDPModel = tomoto::HDPModel<tomoto::TermWeight::one, tomoto::RandGen>;
         using HDPModelIDF = tomoto::HDPModel<tomoto::TermWeight::idf, tomoto::RandGen>;
         using LDAModel = tomoto::LDAModel<tomoto::TermWeight::one, tomoto::RandGen>;
         using LDAModelIDF = tomoto::LDAModel<tomoto::TermWeight::idf, tomoto::RandGen>;

         // labeling and the common model interface
         using FoRelevance = tomoto::label::FoRelevance;
         using ITopicModel = tomoto::ITopicModel;
         using PMIExtractor = tomoto::label::PMIExtractor;

     public:

         // getters
         [[nodiscard]] std::size_t getNumberOfDocuments() const;
         [[nodiscard]] std::unordered_map<std::string, std::size_t> getDocuments() const;
         [[nodiscard]] std::size_t getVocabularySize() const;
         [[nodiscard]] std::size_t getOriginalVocabularySize() const;
         [[nodiscard]] const std::vector<std::string>& getVocabulary() const;
         [[nodiscard]] std::size_t getNumberOfTokens() const;
         [[nodiscard]] std::size_t getBurnInIterations() const;
         [[nodiscard]] std::size_t getIterations() const;
         [[nodiscard]] std::size_t getParameterOptimizationInterval() const;
         [[nodiscard]] std::size_t getRandomNumberGenerationSeed() const;
         [[nodiscard]] std::string_view getModelName() const;
         [[nodiscard]] std::string_view getTermWeighting() const;
         [[nodiscard]] std::size_t getDocumentId(const std::string& name) const;
         [[nodiscard]] std::vector<std::string> getRemovedTokens() const;
         [[nodiscard]] std::size_t getNumberOfTopics() const;
         [[nodiscard]] std::vector<std::size_t> getTopics() const;
         [[nodiscard]] std::vector<std::pair<std::size_t, std::uint64_t>> getTopicsSorted() const;
         [[nodiscard]] double getLogLikelihoodPerToken() const;
         [[nodiscard]] double getTokenEntropy() const;
         [[nodiscard]] std::vector<std::pair<std::string, float>> getTopicTopNTokens(
                 std::size_t topic,
                 std::size_t n
         ) const;
         [[nodiscard]] std::vector<std::pair<std::string, float>> getTopicTopNLabels(
                 std::size_t topic,
                 std::size_t n
         ) const;
         // topic distributions of the documents inside the model that are not in 'done' yet
         [[nodiscard]] std::vector<std::pair<std::string, std::vector<float>>> getDocumentsTopics(
                 std::unordered_set<std::string>& done
         ) const;
         // topic distributions inferred for the given (tokenized) documents
         [[nodiscard]] std::vector<std::vector<float>> getDocumentsTopics(
                 const std::vector<std::vector<std::string>>& documents,
                 std::size_t maxIterations,
                 std::size_t numberOfWorkers
         ) const;
         [[nodiscard]] TopicModelInfo getModelInfo() const;


         // setters
         void setFixedNumberOfTopics(std::size_t k);
         void setUseIdf(bool idf);
         void setBurnInIteration(std::size_t skipIterations);
         void setTokenRemoval(
                 std::size_t collectionFrequency,
                 std::size_t documentFrequency,
                 std::size_t fixedNumberOfTopTokens
         );
         void setInitialParameters(
                 std::size_t initialTopics,
                 float alpha,
                 float eta,
                 float gamma
         );
         void setParameterOptimizationInterval(std::size_t interval);
         void setRandomNumberGenerationSeed(std::size_t newSeed);
         void setLabelingOptions(
                 bool activate,
                 std::size_t minCf,
                 std::size_t minDf,
                 std::size_t minLength,
                 std::size_t maxLength,
                 std::size_t maxCandidates,
                 float smoothing,
                 float mu,
                 std::size_t windowSize
         );


         // adding documents, training, and labeling
         void addDocument(
                 const std::string& name,
                 const std::vector<std::string>& tokens,
                 std::size_t firstToken,
                 std::size_t numTokens
         );
         void startTraining();
         void train(
                 std::size_t iterations,
                 std::size_t threads
         );
         void label(std::size_t threads);


         // loading from and saving to a file
         std::size_t load(const std::string& fileName);
         std::size_t save(const std::string& fileName, bool full) const; //NOLINT(modernize-use-nodiscard)


         // cleanup
         void clear(bool labelingOptions);


         // class for exceptions thrown by the topic model (defined by macro)
         MAIN_EXCEPTION_CLASS();

     private:
         // models
         std::unique_ptr<HDPModel> hdpModel;
         std::unique_ptr<HDPModelIDF> hdpModelIdf;
         std::unique_ptr<LDAModel> ldaModel;
         std::unique_ptr<LDAModelIDF> ldaModelIdf;

         // document names
         std::vector<std::string> docNames;

         // state
         bool hasDocs{false};
         bool isPrepared{false};
         std::size_t workersUsed{};

         // settings (the seed is taken from std::random_device by default)
         std::size_t fixedNumberOfTopics{};
         bool isUseIdf{false};
         std::size_t numberOfInitialTopics{defaultNumberOfInitialTopics};
         float initialAlpha{defaultAlpha};
         float initialEta{defaultEta};
         float initialGamma{defaultGamma};
         std::size_t seed{std::random_device{}()};
         std::size_t minTokenCf{};
         std::size_t minTokenDf{};
         std::size_t removeTopNTokens{};
         std::size_t optimizationInterval{defaultOptimizationInterval};
         std::string trainedWithVersion{};

         // labeling
         std::unique_ptr<FoRelevance> labeler;
         bool isLabeling{false};
         std::size_t labelingMinCf{};
         std::size_t labelingMinDf{};
         std::size_t labelingMinLength{};
         std::size_t labelingMaxLength{};
         std::size_t labelingMaxCandidates{};
         float labelingSmoothing{};
         float labelingMu{};
         std::size_t labelingWindowSize{};

         // internal helper functions
         void initModel(bool& isHdpTo, bool& isIdfTo);
         [[nodiscard]] std::string dictLookUp(tomoto::Vid tokenId) const;

         // internal checks (each receives the name of the calling function)
         void checkModel(
                 const std::string& function,
                 bool& isHdpTo,
                 bool& isIdfTo
         ) const;
         void checkNoModel(
                 const std::string& function,
                 const std::string& errorMsg
         ) const;
         void checkTrained(const std::string& function) const;
         void checkNotTrained(
                 const std::string& function,
                 const std::string& errorMsg
         ) const;

         // internal access to the model selected by isHdp and isIdf
         //  (the functions without isHdp are presumably specific to HDP — TODO confirm)
         [[nodiscard]] const tomoto::Dictionary& getDict(
                 bool isHdp,
                 bool isIdf
         ) const;
         [[nodiscard]] std::size_t getLiveK(bool isIdf) const;
         [[nodiscard]] std::size_t getK(bool isHdp, bool isIdf) const;
         [[nodiscard]] bool isLiveTopic(bool isIdf, std::size_t topic) const;
         [[nodiscard]] float getGamma(bool isIdf) const;
         [[nodiscard]] std::size_t getNumberOfTables(bool isIdf) const;

         void prepareModel(bool isHdp, bool isIdf);
         void trainModel(
                 bool isHdp,
                 bool isIdf,
                 std::size_t iterations,
                 std::size_t
                 threads
         );
         void loadModelInformation(
                 bool isHdp,
                 bool isIdf,
                 const std::vector<std::uint8_t>& data
         );
         void writeModelInformation(
                 bool isHdp,
                 bool isIdf,
                 std::vector<std::uint8_t>& dataTo
         ) const;

         [[nodiscard]] std::vector<float> getInferredTopics(
                 bool isHdp,
                 bool isIdf,
                 const tomoto::DocumentBase * doc
         ) const;

         [[nodiscard]] const void * get(bool isHdp, bool isIdf) const;

         // internal static helper functions (declarations only)
         [[nodiscard]] static tomoto::RawDoc createDocument(
                 const std::string& name,
                 const std::vector<std::string>& tokens,
                 std::size_t firstToken,
                 std::size_t numTokens
         );
         static void readModelFileHead(std::istream& in, const std::string& fileName);
         static void readModelFileTermWeighting(
                 std::istream& in,
                 const std::string& fileName,
                 bool& isIdfTo
         );
         static void readModelFileType(std::istream& in, const std::string& fileName);
         static void resetStream(std::istream& in);
         static void numberFromDict(
                 const PickleDict& dict,
                 const std::string& key,
                 std::size_t& valueTo
         );
         static void floatFromDict(
                 const PickleDict& dict,
                 const std::string& key,
                 float& valueTo
         );
         static void stringFromDict(
                 const PickleDict& dict,
                 const std::string& key,
                 std::string& valueTo
         );

         static void validateLastResults(
                 std::vector<std::pair<std::string, std::vector<float>>>& results,
                 std::unordered_set<std::string>& done,
                 const std::unordered_set<std::string>::const_iterator& inserted
         );

         // internal static helper functions (constexpr and templates)

         // returns "TermWeight.IDF" if isIdf is true, "TermWeight.ONE" otherwise
         [[nodiscard]] static constexpr std::string_view termWeightToString(bool isIdf) {
             if(isIdf) {
                 return "TermWeight.IDF";
             }

             return "TermWeight.ONE";
         }

         // checks whether a container of bytes has the same size and the same
         //  content, element by element, as the given string view
         template<typename T> [[nodiscard]] static bool bytesEqual(
                 const T& bytes,
                 std::string_view s
         ) {
             // different sizes can never be equal (also guards the indexing below)
             if(bytes.size() != s.size()) {
                 return false;
             }

             for(std::size_t index{}; index < bytes.size(); ++index) {
                 if(bytes[index] != s[index]) { //NOLINT(cppcoreguidelines-pro-bounds-constant-array-index)
                     return false;
                 }
             }

             return true;
         }

         // copies a container of bytes into a string, skipping every null byte
         //  (at any position, not only trailing ones)
         template<typename T> [[nodiscard]] static std::string bytesToString(
                 const T& bytes
         ) {
             std::string result;

             for(const auto c : bytes) {
                 if(c != '\0') {
                     result.push_back(c);
                 }
             }

             return result;
         }

         // returns only those entries of 'results' whose index is reported as a
         //  live topic by the given HDP model, keeping their original order
         template<tomoto::TermWeight _tw, typename _RandGen>
         [[nodiscard]] static std::vector<float> removeDeadTopics(
                 const std::vector<float>& results,
                 const std::unique_ptr<tomoto::HDPModel<_tw, _RandGen>>& model
         ) {
             std::vector<float> filtered;

             // upper bound: at most all of the entries will be kept
             filtered.reserve(results.size());

             for(std::size_t topic{}; topic < results.size(); ++topic) {
                 if(model->isLiveTopic(topic)) {
                     filtered.push_back(results[topic]);
                 }
             }

             return filtered;
         }
     };

     /*
      * IMPLEMENTATION
      */

     /*
      * GETTERS
      */


     // Returns the number of documents as reported by getNumDocs() of the active model.
     //  Both the model check and the training check run before the model is queried.
     inline std::size_t TopicModel::getNumberOfDocuments() const {
         constexpr auto functionName{"getNumberOfDocuments"};

         bool hdp{false};
         bool idf{false};

         // checkModel() reports the kind of the active model via its out-parameters
         this->checkModel(functionName, hdp, idf);
         this->checkTrained(functionName);

         //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
         DATA_TOPICMODEL_RETURN(hdp, idf, getNumDocs);
     }


     // Returns a map from the names (UIDs) of the documents inside the model to
     //  their indices. Documents with an empty name are not included.
     //  Both the model check and the training check run before the model is queried.
     inline std::unordered_map<std::string, std::size_t> TopicModel::getDocuments() const {
         bool isHdp{false};
         bool isIdf{false};

         this->checkModel("getDocuments", isHdp, isIdf);
         this->checkTrained("getDocuments");

         // retrieve the number of documents only once: getNumberOfDocuments()
         //  repeats both checks on every call, so it is kept out of the loop condition
         const auto total{this->getNumberOfDocuments()};

         std::unordered_map<std::string, std::size_t> result;

         result.reserve(total);

         for(std::size_t index{}; index < total; ++index) {
             const tomoto::DocumentBase * docPtr{nullptr};

             //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
             DATA_TOPICMODEL_RETRIEVE(docPtr, isHdp, isIdf, getDoc, index);

             // skip unnamed documents
             if(!(docPtr->docUid.empty())) {
                 result[docPtr->docUid] = index;
             }
         }

         return result;
     }


     // Returns the size of the vocabulary as reported by getV() of the active model.
     //  Both the model check and the training check run before the model is queried.
     inline std::size_t TopicModel::getVocabularySize() const {
         constexpr auto functionName{"getVocabularySize"};

         bool hdp{false};
         bool idf{false};

         // checkModel() reports the kind of the active model via its out-parameters
         this->checkModel(functionName, hdp, idf);
         this->checkTrained(functionName);

         //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
         DATA_TOPICMODEL_RETURN(hdp, idf, getV);
     }


     // Returns the number of entries in the dictionary of the active model.
     //  Both the model check and the training check run before the dictionary is accessed.
     inline std::size_t TopicModel::getOriginalVocabularySize() const {
         constexpr auto functionName{"getOriginalVocabularySize"};

         bool hdp{false};
         bool idf{false};

         this->checkModel(functionName, hdp, idf);
         this->checkTrained(functionName);

         const auto& dictionary{this->getDict(hdp, idf)};

         return dictionary.size();
     }


     // Returns a constant reference to the raw content (getRaw()) of the dictionary
     //  of the active model, i.e. the reference refers to data owned by the model.
     //  Both the model check and the training check run before the dictionary is accessed.
     inline const std::vector<std::string>& TopicModel::getVocabulary() const {
         constexpr auto functionName{"getVocabulary"};

         bool hdp{false};
         bool idf{false};

         this->checkModel(functionName, hdp, idf);
         this->checkTrained(functionName);

         const auto& dictionary{this->getDict(hdp, idf)};

         return dictionary.getRaw();
     }


     // Returns the number of tokens as reported by getN() of the active model.
     //  Both the model check and the training check run before the model is queried.
     inline std::size_t TopicModel::getNumberOfTokens() const {
         constexpr auto functionName{"getNumberOfTokens"};

         bool hdp{false};
         bool idf{false};

         // checkModel() reports the kind of the active model via its out-parameters
         this->checkModel(functionName, hdp, idf);
         this->checkTrained(functionName);

         //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
         DATA_TOPICMODEL_RETURN(hdp, idf, getN);
     }


     // Returns the number of burn-in iterations as reported by getBurnInIteration()
     //  of the active model.
     //  Both the model check and the training check run before the model is queried.
     inline std::size_t TopicModel::getBurnInIterations() const {
         constexpr auto functionName{"getBurnInIterations"};

         bool hdp{false};
         bool idf{false};

         // checkModel() reports the kind of the active model via its out-parameters
         this->checkModel(functionName, hdp, idf);
         this->checkTrained(functionName);

         //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
         DATA_TOPICMODEL_RETURN(hdp, idf, getBurnInIteration);
     }


     // Returns the number of iterations as reported by getGlobalStep() of the active model.
     //  Both the model check and the training check run before the model is queried.
     inline std::size_t TopicModel::getIterations() const {
         constexpr auto functionName{"getIterations"};

         bool hdp{false};
         bool idf{false};

         // checkModel() reports the kind of the active model via its out-parameters
         this->checkModel(functionName, hdp, idf);
         this->checkTrained(functionName);

         //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
         DATA_TOPICMODEL_RETURN(hdp, idf, getGlobalStep);
     }


     // Returns the parameter optimization interval as reported by getOptimInterval()
     //  of the active model.
     //  Both the model check and the training check run before the model is queried.
     inline std::size_t TopicModel::getParameterOptimizationInterval() const {
         constexpr auto functionName{"getParameterOptimizationInterval"};

         bool hdp{false};
         bool idf{false};

         // checkModel() reports the kind of the active model via its out-parameters
         this->checkModel(functionName, hdp, idf);
         this->checkTrained(functionName);

         //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
         DATA_TOPICMODEL_RETURN(hdp, idf, getOptimInterval);
     }


     // Returns the seed stored in this object (not a value queried from the model).
     //  The model check and the training check still run first, i.e. the seed is
     //  only handed out after both have passed.
     inline std::size_t TopicModel::getRandomNumberGenerationSeed() const {
         constexpr auto functionName{"getRandomNumberGenerationSeed"};

         // the kind of model is not needed here, but checkModel() requires the out-parameters
         bool hdp{false};
         bool idf{false};

         this->checkModel(functionName, hdp, idf);
         this->checkTrained(functionName);

         return this->seed;
     }


     // Returns the name of the active model: hdpModelName for HDP, ldaModelName for LDA.
     //  Only the model check is performed, the model does not need to be trained.
     inline std::string_view TopicModel::getModelName() const {
         bool hdp{false};
         bool idf{false};

         // only the HDP flag is of interest here, the IDF flag is ignored
         this->checkModel("getModelName", hdp, idf);

         return hdp ? hdpModelName : ldaModelName;
     }


     // Returns the term weighting of the active model as a string, i.e. the result
     //  of termWeightToString() for the IDF flag reported by checkModel().
     //  Only the model check is performed, the model does not need to be trained.
     inline std::string_view TopicModel::getTermWeighting() const {
         bool hdp{false};
         bool idf{false};

         // only the IDF flag is of interest here, the HDP flag is ignored
         this->checkModel("getTermWeighting", hdp, idf);

         const auto termWeighting{TopicModel::termWeightToString(idf)};

         return termWeighting;
     }


     // Returns the ID of the document with the given name, as reported by
     //  getDocIdByUid() of the active model.
     //  Throws an Exception if the model reports the maximum value of std::size_t,
     //  which is treated as "no such document".
     //  Only the model check is performed, the model does not need to be trained.
     inline std::size_t TopicModel::getDocumentId(const std::string& name) const {
         constexpr auto notFound{std::numeric_limits<std::size_t>::max()};

         bool hdp{false};
         bool idf{false};

         this->checkModel("getDocumentId", hdp, idf);

         std::size_t documentId{};

         //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
         DATA_TOPICMODEL_RETRIEVE(documentId, hdp, idf, getDocIdByUid, name);

         if(documentId != notFound) {
             return documentId;
         }

         throw Exception(
                 "getDocumentId():"
                 " No document named '"
                 + name
                 + "' has been added to the model"
         );
     }


     // Returns the tokens that have been removed as top tokens, i.e. the last
     //  'removeTopNTokens' entries of the dictionary of the active model, in
     //  dictionary order.
     //  Both the model check and the training check run before the dictionary is accessed.
     inline std::vector<std::string> TopicModel::getRemovedTokens() const {
         bool isHdp{false};
         bool isIdf{false};

         this->checkModel("getRemovedTokens", isHdp, isIdf);
         this->checkTrained("getRemovedTokens");

         const auto& dict{
             this->getDict(isHdp, isIdf)
         };
         const auto size{dict.size()};

         // never go back further than the dictionary is long: without clamping,
         //  the unsigned subtraction below would wrap around if more tokens are
         //  set to be removed than the dictionary contains, and nothing would be returned
         const auto numRemoved{std::min<std::size_t>(this->removeTopNTokens, size)};

         std::vector<std::string> removed;

         removed.reserve(numRemoved);

         for(auto tokenIndex{size - numRemoved}; tokenIndex < size; ++tokenIndex) {
             removed.emplace_back(dict.toWord(tokenIndex));
         }

         return removed;
     }


     // Returns the number of topics: the result of getLiveK() for HDP, the fixed
     //  number of topics stored in this object for LDA.
     //  Both the model check and the training check run first.
     inline std::size_t TopicModel::getNumberOfTopics() const {
         constexpr auto functionName{"getNumberOfTopics"};

         bool hdp{false};
         bool idf{false};

         this->checkModel(functionName, hdp, idf);
         this->checkTrained(functionName);

         return hdp ? this->getLiveK(idf) : this->fixedNumberOfTopics;
     }


     // Returns the IDs of the topics in ascending order.
     //  LDA: all IDs from zero up to (excluding) the fixed number of topics.
     //  HDP: those IDs below getK() for which isLiveTopic() returns true.
     //  Both the model check and the training check run first.
     inline std::vector<std::size_t> TopicModel::getTopics() const {
         constexpr auto functionName{"getTopics"};

         bool hdp{false};
         bool idf{false};

         this->checkModel(functionName, hdp, idf);
         this->checkTrained(functionName);

         if(!hdp) {
             // LDA: simply enumerate the fixed number of topics
             std::vector<std::size_t> allTopics(this->fixedNumberOfTopics);

             std::iota(allTopics.begin(), allTopics.end(), std::size_t{});

             return allTopics;
         }

         // HDP: collect the topics that are still alive
         std::vector<std::size_t> liveTopics;

         liveTopics.reserve(this->getLiveK(idf));

         const auto total{this->getK(true, idf)};

         for(std::size_t topic{}; topic < total; ++topic) {
             if(this->isLiveTopic(idf, topic)) {
                 liveTopics.push_back(topic);
             }
         }

         return liveTopics;
     }


     inline std::vector<std::pair<std::size_t, std::uint64_t>> TopicModel::getTopicsSorted() const {
         bool isHdp{false};
         bool isIdf{false};

         this->checkModel("getTopicsSorted", isHdp, isIdf);
         this->checkTrained("getTopicsSorted");

         std::vector<std::pair<std::size_t, std::uint64_t>> topics;
         std::vector<std::uint64_t> counts;

         topics.reserve(this->getK(isHdp, isIdf));

         //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
         DATA_TOPICMODEL_RETRIEVE_NOARGS(counts, isHdp, isIdf, getCountByTopic);

         std::size_t topicIndex{};

         for(const auto count : counts) {
             if(!isHdp || this->isLiveTopic(isIdf, topicIndex)) {
                 topics.emplace_back(topicIndex, count);
             }

             ++topicIndex;
         }

         std::sort(topics.begin(), topics.end(), [](const auto& a, const auto& b) {
             return a.second > b.second;
         });

         return topics;
     }


     // Returns the log-likelihood per token as reported by getLLPerWord() of the
     //  active model.
     //  Both the model check and the training check run before the model is queried.
     inline double TopicModel::getLogLikelihoodPerToken() const {
         constexpr auto functionName{"getLogLikelihoodPerToken"};

         bool hdp{false};
         bool idf{false};

         // checkModel() reports the kind of the active model via its out-parameters
         this->checkModel(functionName, hdp, idf);
         this->checkTrained(functionName);

         //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
         DATA_TOPICMODEL_RETURN(hdp, idf, getLLPerWord);
     }


     // Returns the sum of p * log(p) over the tokens of the used vocabulary, where
     //  p is the frequency of a token (getVocabCf()) divided by the sum of the
     //  frequencies of the first getV() tokens.
     //
     // Tokens with a frequency of zero contribute nothing (instead of NaN), and
     //  zero is returned if there are no frequencies to sum up.
     //
     // NOTE(review): The result has the opposite sign of the Shannon entropy
     //  (-sum of p * log(p)), i.e. it is never positive — confirm whether callers
     //  rely on this sign before changing it.
     //
     // Both the model check and the training check run before the model is queried.
     inline double TopicModel::getTokenEntropy() const {
         bool isHdp{false};
         bool isIdf{false};

         this->checkModel("getTokenEntropy", isHdp, isIdf);
         this->checkTrained("getTokenEntropy");

         std::vector<std::uint64_t> vocabularyFrequencies;
         std::uint64_t vocabularyUsed{};

         // retrieve vocabulary frequencies
         //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
         DATA_TOPICMODEL_RETRIEVE_NOARGS(vocabularyFrequencies, isHdp, isIdf, getVocabCf);
         //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
         DATA_TOPICMODEL_RETRIEVE_NOARGS(vocabularyUsed, isHdp, isIdf, getV);

         // never read beyond the end of the retrieved frequencies
         const auto used{
             static_cast<std::size_t>(
                     std::min<std::uint64_t>(vocabularyUsed, vocabularyFrequencies.size())
             )
         };

         // sum up for normalization
         std::uint64_t frequencySum{};

         for(std::size_t index{}; index < used; ++index) {
             frequencySum += vocabularyFrequencies[index];
         }

         // avoid dividing by zero
         if(frequencySum == 0) {
             return 0.;
         }

         double result{};

         for(std::size_t index{}; index < used; ++index) {
             const auto frequency{vocabularyFrequencies[index]};

             // the limit of p * log(p) for p -> 0 is zero, while 0 * log(0) would be NaN
             if(frequency == 0) {
                 continue;
             }

             const auto normalized{static_cast<double>(frequency) / frequencySum};

             result += normalized * std::log(normalized);
         }

         return result;
     }


     // Returns the top tokens of the given topic together with the values reported
     //  for them by getWidsByTopicSorted() of the active model, in the order
     //  returned by the model. The token IDs are resolved via dictLookUp().
     //  Both the model check and the training check run before the model is queried.
     inline std::vector<std::pair<std::string, float>> TopicModel::getTopicTopNTokens(
             std::size_t topic,
             std::size_t n
     ) const {
         bool isHdp{false};
         bool isIdf{false};

         this->checkModel("getTopicTopNTokens", isHdp, isIdf);
         this->checkTrained("getTopicTopNTokens");

         std::vector<std::pair<tomoto::Vid, float>> tokenIds;

         //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
         DATA_TOPICMODEL_RETRIEVE(tokenIds, isHdp, isIdf, getWidsByTopicSorted, topic, n);

         std::vector<std::pair<std::string, float>> tokens;

         // reserve memory for what has actually been returned, not for n: the
         //  caller might pass a very large n, for which reserving would waste
         //  memory or even throw
         tokens.reserve(tokenIds.size());

         for(const auto& tokenId : tokenIds) {
             tokens.emplace_back(this->dictLookUp(tokenId.first), tokenId.second);
         }

         return tokens;
     }


     // Returns up to n labels for the given topic, as provided by the labeler.
     //  Returns an empty vector if n is zero, even if no labeler is available.
     //  Throws an Exception if n is not zero and no labeler is available.
     //  Both the model check and the training check run first.
     inline std::vector<std::pair<std::string, float>> TopicModel::getTopicTopNLabels(
             std::size_t topic,
             std::size_t n
     ) const {
         constexpr auto functionName{"getTopicTopNLabels"};

         bool hdp{false};
         bool idf{false};

         this->checkModel(functionName, hdp, idf);
         this->checkTrained(functionName);

         // nothing requested: the labeler is not needed
         if(n == 0) {
             return {};
         }

         if(this->labeler == nullptr) {
             throw Exception(
                     "getTopicTopNLabels():"
                     " Topics have not been labeled"
             );
         }

         return this->labeler->getLabels(topic, n);
     }


     inline std::vector<std::pair<std::string, std::vector<float>>> TopicModel::getDocumentsTopics(
             std::unordered_set<std::string>& done
     ) const {
         bool isHdp{false};
         bool isIdf{false};

         this->checkModel("getDocumentsTopics", isHdp, isIdf);
         this->checkTrained("getDocumentsTopics");

         std::vector<std::pair<std::string, std::vector<float>>> results;
         const auto total{this->getNumberOfDocuments()};

         for(std::size_t docId{}; docId < total; ++docId) {
             const tomoto::DocumentBase * doc{nullptr};

             DATA_TOPICMODEL_RETRIEVE(doc, isHdp, isIdf, getDoc, docId);

             if(doc->docUid.empty()) {
                 continue;
             }

             const auto inserted{done.insert(doc->docUid)};

             if(inserted.second) {
                 results.emplace_back(
                         doc->docUid,
                         this->getInferredTopics(isHdp, isIdf, doc)
                 );

                 // remove last results if all values are NaN
                 TopicModel::validateLastResults(results, done, inserted.first);
             }
         }

         return results;
     }


     inline std::vector<std::vector<float>> TopicModel::getDocumentsTopics(
             const std::vector<std::vector<std::string>>& documents,
             std::size_t maxIterations,
             std::size_t numberOfWorkers
     ) const {
         bool isHdp{false};
         bool isIdf{false};

         this->checkModel("getDocumentsTopics", isHdp, isIdf);
         this->checkTrained("getDocumentsTopics");

         // create documents
         std::vector<std::unique_ptr<tomoto::DocumentBase>> docUPtrs(documents.size());
         std::size_t docIndex{};

         for(const auto& tokens : documents) {
             //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
             DATA_TOPICMODEL_RETRIEVE(
                     docUPtrs[docIndex],
                     isHdp,
                     isIdf,
                     makeDoc,
                     TopicModel::createDocument(
                             "doc" + std::to_string(docIndex),
                             tokens,
                             0,
                             tokens.size()
                     )
             );

             if(!(docUPtrs[docIndex])) {
                 throw Exception(
                         "getDocumentsTopics():"
                         " Could not create document 'doc"
                         + std::to_string(docIndex)
                         + "'"
                 );
             }

             ++docIndex;
         }

         // get C-style pointers for underlying API
         std::vector<tomoto::DocumentBase *> docPtrs(documents.size(), nullptr);

         std::transform(docUPtrs.begin(), docUPtrs.end(), docPtrs.begin(), [](const auto& uPtr) {
             return uPtr.get();
         });

         // infer topic distributions for documents
         //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
         DATA_TOPICMODEL_CALL(
                 isHdp,
                 isIdf,
                 infer,
                 docPtrs,
                 maxIterations,
                 -1.F, /* currently not used */
                 numberOfWorkers,
                 tomoto::ParallelScheme::default_,
                 false
         );

         std::vector<std::vector<float>> results;

         results.reserve(documents.size());

         for(const auto * doc : docPtrs) {
             results.emplace_back(this->getInferredTopics(isHdp, isIdf, doc));
         }

         return results;
     }


     // Collects information about the trained model.
     //
     // Combines values retrieved from the underlying model with the settings
     //  stored in this instance. HDP models additionally report gamma and the
     //  number of tables, LDA models report one alpha per topic instead.
     //
     // Throws an Exception if checkModel() or checkTrained() fail.
     inline Struct::TopicModelInfo TopicModel::getModelInfo() const {
         bool isHdp{false};
         bool isIdf{false};

         this->checkModel("getModelInfo", isHdp, isIdf);
         this->checkTrained("getModelInfo");

         TopicModelInfo info;

         // general information about the model and its data
         info.modelName = this->getModelName();
         info.modelVersion = Helper::Versions::getTomotoVersion();
         info.numberOfDocuments = this->getNumberOfDocuments();
         info.numberOfTokens = this->getNumberOfTokens();
         info.sizeOfVocabulary = this->getOriginalVocabularySize();
         info.sizeOfVocabularyUsed = this->getVocabularySize();
         info.tokenEntropy = this->getTokenEntropy();
         info.removedTokens = this->getRemovedTokens();

         // state of the training
         info.numberOfIterations = this->getIterations();
         info.numberOfBurnInSteps = this->getBurnInIterations();
         info.optimizationInterval = this->getParameterOptimizationInterval();
         info.logLikelihoodPerToken = this->getLogLikelihoodPerToken();
         info.weighting = this->getTermWeighting();

         // settings stored in this instance
         info.minCollectionFrequency = this->minTokenCf;
         info.minDocumentFrequency = this->minTokenDf;
         info.numberOfTopTokensToBeRemoved = this->removeTopNTokens;
         info.initialAlpha = this->initialAlpha;
         info.initialEta = this->initialEta;
         info.seed = this->seed;
         info.trainedWithVersion = this->trainedWithVersion;
         info.numberOfTopics = this->getNumberOfTopics();

         // current hyperparameters of the model
         //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
         DATA_TOPICMODEL_RETRIEVE_NOARGS(info.alpha, isHdp, isIdf, getAlpha);
         //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
         DATA_TOPICMODEL_RETRIEVE_NOARGS(info.eta, isHdp, isIdf, getEta);

         if(isHdp) {
             // HDP only
             info.numberOfInitialTopics = this->numberOfInitialTopics;
             info.gamma = this->getGamma(isIdf);
             info.initialGamma = this->initialGamma;
             info.numberOfTables = this->getNumberOfTables(isIdf);

             return info;
         }

         // LDA only: get the alpha of each topic
         info.alphas.reserve(info.numberOfTopics);

         if(isIdf) {
             for(std::size_t topic{}; topic < info.numberOfTopics; ++topic) {
                 info.alphas.push_back(this->ldaModelIdf->getAlpha(topic));
             }
         }
         else {
             for(std::size_t topic{}; topic < info.numberOfTopics; ++topic) {
                 info.alphas.push_back(this->ldaModel->getAlpha(topic));
             }
         }

         return info;
     }

     /*
      * SETTERS
      */


     // Sets the fixed number of topics.
     //
     // If the number is zero when the model is initialized, a HDP model will
     //  be created, otherwise a LDA model (see initModel()).
     //
     // Throws an Exception if a model has already been initialized.
     inline void TopicModel::setFixedNumberOfTopics(std::size_t k) {
         this->checkNoModel("setFixedNumberOfTopics", "Fixed number of topics cannot be set");

         this->fixedNumberOfTopics = k;
     }


     // Sets whether the IDF variant of the model will be created on initialization.
     //
     // Throws an Exception if a model has already been initialized.
     inline void TopicModel::setUseIdf(bool idf) {
         this->checkNoModel("setUseIdf", "Term weighting cannot be set to IDF");

         this->isUseIdf = idf;
     }


     // Sets the number of burn-in iterations in the underlying model.
     //
     // Initializes the model first, if that has not happened yet.
     //
     // Throws an Exception if the model has already been trained.
     inline void TopicModel::setBurnInIteration(std::size_t skipIterations) {
         bool isHdp{false};
         bool isIdf{false};

         // creates the model if necessary and determines its type
         this->initModel(isHdp, isIdf);

         this->checkNotTrained("setBurnInIteration", "Iterations cannot be burned");

         //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
         DATA_TOPICMODEL_CALL(isHdp, isIdf, setBurnInIteration, skipIterations);
     }


     // Sets which tokens will be removed when the model is prepared:
     //  the minimum collection frequency, the minimum document frequency,
     //  and the number of top tokens to remove (see prepareModel()).
     //
     // Throws an Exception if the model has already been trained.
     inline void TopicModel::setTokenRemoval(
             std::size_t collectionFrequency,
             std::size_t documentFrequency,
             std::size_t fixedNumberOfTopTokens
     ) {
         this->checkNotTrained("setTokenRemoval", "Stopword settings cannot be changed");

         this->removeTopNTokens = fixedNumberOfTopTokens;
         this->minTokenDf = documentFrequency;
         this->minTokenCf = collectionFrequency;
     }


     // Sets the initial parameters that will be passed to the model on
     //  its creation: initial number of topics (HDP), alpha, eta, and
     //  gamma (HDP).
     //
     // Throws an Exception if a model has already been initialized.
     inline void TopicModel::setInitialParameters(
             std::size_t initialTopics,
             float alpha,
             float eta,
             float gamma
     ) {
         this->checkNoModel("setInitialParameters", "Cannot set initial parameters");

         // hyperparameters
         this->initialGamma = gamma;
         this->initialEta = eta;
         this->initialAlpha = alpha;

         // initial number of topics
         this->numberOfInitialTopics = initialTopics;
     }


     // Sets the interval used for the optimization of parameters.
     //
     // Throws an Exception if a model has already been initialized.
     inline void TopicModel::setParameterOptimizationInterval(std::size_t interval) {
         this->checkNoModel(
                 "setParameterOptimizationInterval", "Cannot set parameter optimization interval"
         );

         this->optimizationInterval = interval;
     }


     // Sets the seed that will be passed to the model on its creation.
     //
     // Throws an Exception if a model has already been initialized.
     inline void TopicModel::setRandomNumberGenerationSeed(std::size_t newSeed) {
         this->checkNoModel(
                 "setRandomNumberGenerationSeed", "Cannot set seed for random number generation"
         );

         this->seed = newSeed;
     }


     // Stores the options for automated topic labeling.
     //
     // If a labeler already exists, the topics are labeled again with the
     //  new options, using the number of workers of the previous labeling.
     inline void TopicModel::setLabelingOptions(
             bool activate,
             std::size_t minCf,
             std::size_t minDf,
             std::size_t minLength,
             std::size_t maxLength,
             std::size_t maxCandidates,
             float smoothing,
             float mu,
             std::uint64_t windowSize
     ) {
         this->isLabeling = activate;

         // options for the extraction of label candidates
         this->labelingMinCf = minCf;
         this->labelingMinDf = minDf;
         this->labelingMinLength = minLength;
         this->labelingMaxLength = maxLength;
         this->labelingMaxCandidates = maxCandidates;

         // options for the labeler itself
         this->labelingSmoothing = smoothing;
         this->labelingMu = mu;
         this->labelingWindowSize = windowSize;

         if(!(this->labeler)) {
             // nothing has been labeled so far, i.e. nothing needs to be updated
             return;
         }

         this->label(this->workersUsed);
     }

     /*
      * TOPIC MODELLING
      */


     // Adds a document to the model.
     //
     // The document consists of `numTokens` tokens, starting at index
     //  `firstToken` of `tokens`. Its name is additionally stored in the
     //  list of document names. Initializes the model, if necessary.
     //
     // Throws an Exception if the model has already been trained.
     inline void TopicModel::addDocument(
         const std::string& name,
         const std::vector<std::string>& tokens,
         std::size_t firstToken,
         std::size_t numTokens
     ) {
         bool isHdp{false};
         bool isIdf{false};

         this->initModel(isHdp, isIdf);
         this->checkNotTrained("addDocument", "Documents cannot be added");

         // remember the name and use the stored copy for the document
         this->docNames.emplace_back(name);

         const auto& storedName{this->docNames.back()};

         //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
         DATA_TOPICMODEL_CALL(
                 isHdp,
                 isIdf,
                 addDoc,
                 TopicModel::createDocument(storedName, tokens, firstToken, numTokens)
         );

         // the model has documents as soon as one non-empty document has been added
         this->hasDocs = this->hasDocs || numTokens > 0;
     }


     // Starts the training without running any iteration.
     //
     // Prepares the model, if necessary, calls the training with zero
     //  iterations and one thread, and stores the current version of tomoto
     //  as the version the model has been trained with.
     //
     // Throws an Exception if checkModel() fails.
     inline void TopicModel::startTraining() {
         constexpr std::size_t noIterations{0};
         constexpr std::size_t oneThread{1};

         bool isHdp{false};
         bool isIdf{false};

         this->checkModel("startTraining", isHdp, isIdf);
         this->prepareModel(isHdp, isIdf);
         this->trainModel(isHdp, isIdf, noIterations, oneThread);

         this->trainedWithVersion = Helper::Versions::getTomotoVersion();
     }


     // Trains the model for the given number of iterations, using the given
     //  number of threads. The model will be prepared first, if necessary.
     //
     // Throws an Exception if checkModel() fails.
     inline void TopicModel::train(std::size_t iterations, std::size_t threads) {
         bool isHdp{false};
         bool isIdf{false};

         // determine the type of the model
         this->checkModel("train", isHdp, isIdf);

         // prepare (only once) and train
         this->prepareModel(isHdp, isIdf);
         this->trainModel(isHdp, isIdf, iterations, threads);
     }


     inline void TopicModel::label(std::size_t threads) {
         if(!(this->isLabeling)) {
             this->labeler.reset();

             return;
         }

         bool isHdp{false};
         bool isIdf{false};

         this->checkModel("label", isHdp, isIdf);
         this->checkTrained("label");

         this->workersUsed = threads;

         // extract topic label candidates
         PMIExtractor extractor(
                 this->labelingMinCf,
                 this->labelingMinDf,
                 this->labelingMinLength,
                 this->labelingMaxLength,
                 this->labelingMaxCandidates
         );

         const auto * interfacePtr{
             static_cast<const ITopicModel *>(this->get(isHdp, isIdf))
         };

         auto labelCandidates{extractor.extract(interfacePtr)};

         // create labeler
         constexpr auto LAMBDA{0.2F};

         this->labeler = std::make_unique<FoRelevance>(
                 interfacePtr,
                 labelCandidates.begin(),
                 labelCandidates.end(),
                 this->labelingMinDf,
                 this->labelingSmoothing,
                 LAMBDA, /* not used yet */
                 this->labelingMu,
                 this->labelingWindowSize == 0  ?
                     std::numeric_limits<std::size_t>::max()
                     : this->labelingWindowSize,
                 threads
         );
     }

     /*
      * LOAD AND SAVE
      */


     // Loads a model from the given file.
     //
     // Clears the current model (keeping the labeling options) and checks the
     //  head, the term weighting scheme, and the type stored at the beginning
     //  of the file. As clearing resets the fixed number of topics to zero, the
     //  file is first loaded as HDP model; if that throws, everything is cleared
     //  again and the file is loaded once more as LDA model.
     //
     // Returns the position in the file after loading as best guess for the
     //  number of bytes read, or zero if this position could not be determined.
     //
     // Throws an Exception if the file could not be opened or its beginning is
     //  invalid. Exceptions thrown by the second loading attempt are passed on.
     inline std::size_t TopicModel::load(const std::string& fileName) {
         this->clear(false);

         bool isHdp{false};
         bool isIdf{false};

         // open the file
         std::ifstream in(fileName.c_str(), std::ios::binary);

         if(!in.is_open()) {
             throw Exception(
                     "TopicModel::load():"
                     " Could not read from '"
                     + fileName
                     + "'"
             );
         }

         // read the file head (= model type)
         TopicModel::readModelFileHead(in, fileName);

         // read and set the term weighting scheme
         TopicModel::readModelFileTermWeighting(in, fileName, isIdf);

         this->setUseIdf(isIdf);

         // read the file type
         TopicModel::readModelFileType(in, fileName);

         // return to the beginning of the file
         TopicModel::resetStream(in);

         // initialize and load the model
         std::vector<std::uint8_t> data;

         this->initModel(isHdp, isIdf);

         try {
             //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
             DATA_TOPICMODEL_CALL(
                     isHdp,
                     isIdf,
                     loadModel,
                     in,
                     &data
             );
         }
         catch(...) {
             // if loading of the model failed, clear it and try another algorithm
             this->clear(false);

             if(isHdp) { /* if the algorithm was set to HDP, set it to LDA */
                 this->fixedNumberOfTopics = defaultNumberOfInitialTopics;
             }

             // return to the beginning of the file
             TopicModel::resetStream(in);

             // initialize and load the model
             this->initModel(isHdp, isIdf);

             //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
             DATA_TOPICMODEL_CALL(
                     isHdp,
                     isIdf,
                     loadModel,
                     in,
                     &data
             );
         }

         // get number of bytes (best guess), tellg() returns -1 on failure
         const auto position{static_cast<std::streamoff>(in.tellg())};

         // close the file
         in.close();

         // retrieve additional information about the loaded model
         this->loadModelInformation(isHdp, isIdf, data);

         // do not convert a failed position into a huge number of bytes
         return position < 0 ? 0 : static_cast<std::size_t>(position);
     }


     // Writes the trained model to the given file.
     //
     // The settings stored in this instance are serialized by
     //  writeModelInformation() and passed to the underlying API together
     //  with the model; `full` is passed on to the API's saveModel().
     //
     // Returns the position in the file after writing as best guess for the
     //  number of bytes written, or zero if this position could not be
     //  determined.
     //
     // Throws an Exception if checkModel() or checkTrained() fail, or if the
     //  file could not be opened for writing.
     inline std::size_t TopicModel::save(const std::string& fileName, bool full) const {
         bool isHdp{false};
         bool isIdf{false};

         this->checkModel("save", isHdp, isIdf);
         this->checkTrained("save");

         // open file to write model to
         std::ofstream out(fileName.c_str(), std::ios::binary);

         if(!out.is_open()) {
             throw Exception(
                     "TopicModel::save():"
                     " Could not write to '"
                     + fileName
                     + "'"
             );
         }

         // add additional information to the saved model
         std::vector<std::uint8_t> data;

         this->writeModelInformation(isHdp, isIdf, data);

         // write model to file
         //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
         DATA_TOPICMODEL_CALL(isHdp, isIdf, saveModel, out, full, &data);

         // get number of written bytes (best guess), tellp() returns -1 on failure
         const auto position{static_cast<std::streamoff>(out.tellp())};

         // close file
         out.close();

         // do not convert a failed position into a huge number of bytes
         return position < 0 ? 0 : static_cast<std::size_t>(position);
     }

     /*
      * CLEANUP
      */


     // Removes the model and resets its settings to their defaults.
     //
     // A new random seed is drawn from std::random_device. The labeling
     //  options are only reset if `labelingOptions` is true; the labeler
     //  itself is always removed.
     inline void TopicModel::clear(bool labelingOptions) {
         // remove the model, whichever type it has
         this->hdpModel.reset();
         this->hdpModelIdf.reset();
         this->ldaModel.reset();
         this->ldaModelIdf.reset();

         // forget the documents and the state of the model
         Helper::Memory::free(this->docNames);

         this->hasDocs = false;
         this->isPrepared = false;

         // reset the number of topics and the initial parameters
         this->fixedNumberOfTopics = 0;
         this->numberOfInitialTopics = defaultNumberOfInitialTopics;
         this->initialAlpha = defaultAlpha;
         this->initialEta = defaultEta;
         this->initialGamma = defaultGamma;
         this->seed = std::random_device{}();

         // reset the removal of tokens and the optimization interval
         this->minTokenCf = 0;
         this->minTokenDf = 0;
         this->removeTopNTokens = 0;
         this->optimizationInterval = defaultOptimizationInterval;

         this->trainedWithVersion.clear();

         // remove the labeler
         this->labeler.reset();

         if(!labelingOptions) {
             return;
         }

         // reset the labeling options as well
         this->isLabeling = false;
         this->labelingMinCf = 0;
         this->labelingMinDf = 0;
         this->labelingMinLength = 0;
         this->labelingMaxLength = 0;
         this->labelingMaxCandidates = 0;
         this->labelingSmoothing = 0.F;
         this->labelingMu = 0.F;
         this->labelingWindowSize = 0;
     }

     /*
      * INTERNAL HELPER FUNCTIONS (private)
      */

     // initialize the model, if none exists yet, and report its type
     //
     // A HDP model is created if the fixed number of topics is zero,
     //  a LDA model otherwise; the IDF variant is used if IDF has been
     //  activated. An existing model is left untouched.
     //
     // isHdpTo and isIdfTo receive the type of the model in use.
     inline void TopicModel::initModel(bool& isHdpTo, bool& isIdfTo) {
         const bool modelExists{
             this->hdpModel
             || this->hdpModelIdf
             || this->ldaModel
             || this->ldaModelIdf
         };

         if(!modelExists) {
             const bool useLda{this->fixedNumberOfTopics != 0};

             if(useLda && this->isUseIdf) {
                 this->ldaModelIdf = std::make_unique<LDAModelIDF>(
                         this->fixedNumberOfTopics,
                         this->initialAlpha,
                         this->initialEta,
                         this->seed
                 );
             }
             else if(useLda) {
                 this->ldaModel = std::make_unique<LDAModel>(
                         this->fixedNumberOfTopics,
                         this->initialAlpha,
                         this->initialEta,
                         this->seed
                 );
             }
             else if(this->isUseIdf) {
                 this->hdpModelIdf = std::make_unique<HDPModelIDF>(
                         this->numberOfInitialTopics,
                         this->initialAlpha,
                         this->initialEta,
                         this->initialGamma,
                         this->seed
                 );
             }
             else {
                 this->hdpModel = std::make_unique<HDPModel>(
                         this->numberOfInitialTopics,
                         this->initialAlpha,
                         this->initialEta,
                         this->initialGamma,
                         this->seed
                 );
             }
         }

         // report the type of the model in use
         if(this->hdpModel) {
             isHdpTo = true;
             isIdfTo = false;

             return;
         }

         if(this->hdpModelIdf) {
             isHdpTo = true;
             isIdfTo = true;

             return;
         }

         if(this->ldaModel) {
             isHdpTo = false;
             isIdfTo = false;

             return;
         }

         if(this->ldaModelIdf) {
             isHdpTo = false;
             isIdfTo = true;

             return;
         }

         throw Exception(
                 "TopicModel::initModel():"
                 " No model has been loaded."
         );
     }

     // look up token ID in dictionary
     //
     // Returns the word that the dictionary of the model stores for the
     //  given token ID.
     //
     // Throws an Exception if checkModel() or checkTrained() fail.
     inline std::string TopicModel::dictLookUp(tomoto::Vid tokenId) const {
         bool isHdp{false};
         bool isIdf{false};

         this->checkModel("dictLookUp", isHdp, isIdf);
         this->checkTrained("dictLookUp");

         return this->getDict(isHdp, isIdf).toWord(tokenId);
     }

     // check that documents have been added and a model exists, and report its type
     //
     // isHdpTo and isIdfTo receive the type of the existing model.
     //
     // Throws an Exception, naming the given function, if no documents
     //  have been added or no model exists.
     inline void TopicModel::checkModel(
             const std::string& function,
             bool& isHdpTo,
             bool& isIdfTo
     ) const {
         if(this->hasDocs) {
             if(this->hdpModel || this->hdpModelIdf) {
                 isHdpTo = true;

                 // the model without IDF takes precedence
                 isIdfTo = !(this->hdpModel);

                 return;
             }

             if(this->ldaModel || this->ldaModelIdf) {
                 isHdpTo = false;

                 // the model without IDF takes precedence
                 isIdfTo = !(this->ldaModel);

                 return;
             }
         }

         throw Exception(
                 "TopicModel::"
                 + function
                 +   "(): No documents have been added"
                     " or the model has already been cleared"
         );
     }

     // make sure that no model has been initialized yet
     //
     // Throws an Exception, built from the name of the given function
     //  and the given error message, if any model exists.
     inline void TopicModel::checkNoModel(
             const std::string& function,
             const std::string& errorMsg
     ) const {
         const bool initialized{
             this->hdpModel
             || this->hdpModelIdf
             || this->ldaModel
             || this->ldaModelIdf
         };

         if(!initialized) {
             return;
         }

         throw Exception(
                 "TopicModel::"
                 + function
                 + "(): "
                 + errorMsg
                 + " after the model has been initialized"
         );
     }

     // make sure that the model has been prepared, i.e. training has been started
     //
     // Throws an Exception, naming the given function, if it has not.
     inline void TopicModel::checkTrained(const std::string& function) const {
         if(this->isPrepared) {
             return;
         }

         throw Exception(
                 "TopicModel::" + function + "(): The model has not yet been trained"
         );
     }

     // make sure that the model has not been prepared yet, i.e. training has not been started
     //
     // Throws an Exception, built from the name of the given function
     //  and the given error message, if it has.
     inline void TopicModel::checkNotTrained(
             const std::string& function,
             const std::string& errorMsg
     ) const {
         if(!(this->isPrepared)) {
             return;
         }

         throw Exception(
                 "TopicModel::" + function + "(): " + errorMsg
                 + " after the model has already been trained"
         );
     }

     // get dictionary (without further checking)
     //
     // Returns the result of getVocabDict(), retrieved from the model
     //  selected by isHdp and isIdf via the DATA_TOPICMODEL_RETURN macro.
     //  The caller needs to make sure that this model exists.
     inline const tomoto::Dictionary& TopicModel::getDict(bool isHdp, bool isIdf) const {
         //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
         DATA_TOPICMODEL_RETURN(isHdp, isIdf, getVocabDict);
     }

     // get number of live topics of the HDP model (without further checking)
     //
     // isIdf selects between the HDP model with and without IDF,
     //  the selected model needs to exist.
     inline std::size_t TopicModel::getLiveK(bool isIdf) const {
         return isIdf ?
                 this->hdpModelIdf->getLiveK()
                 : this->hdpModel->getLiveK();
     }

     // get number of topics (without further checking)
     //
     // For HDP models, the number is retrieved from the model itself,
     //  otherwise the fixed number of topics is returned.
     inline std::size_t TopicModel::getK(bool isHdp, bool isIdf) const {
         if(isHdp) {
             //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
             DATA_TOPICMODEL_RETURN(isHdp, isIdf, getK);
         }

         return this->fixedNumberOfTopics;
     }

     // check whether a topic of the HDP model is alive (without additional checking)
     //
     // isIdf selects between the HDP model with and without IDF,
     //  the selected model needs to exist.
     inline bool TopicModel::isLiveTopic(bool isIdf, std::size_t topic) const {
         return isIdf ?
                 this->hdpModelIdf->isLiveTopic(topic)
                 : this->hdpModel->isLiveTopic(topic);
     }

     // get the concentration coefficient of the Dirichlet Process for table-topic
     //  from the HDP model (without further checking)
     //
     // isIdf selects between the HDP model with and without IDF,
     //  the selected model needs to exist.
     inline float TopicModel::getGamma(bool isIdf) const {
         return isIdf ?
                 this->hdpModelIdf->getGamma()
                 : this->hdpModel->getGamma();
     }

     // get the total number of tables in the HDP model (without further checking)
     //
     // isIdf selects between the HDP model with and without IDF,
     //  the selected model needs to exist.
     inline std::size_t TopicModel::getNumberOfTables(bool isIdf) const {
         return isIdf ?
                 this->hdpModelIdf->getTotalTables()
                 : this->hdpModel->getTotalTables();
     }

     // prepare the model for training, if not done already (without further checking)
     //
     // Passes the settings for the removal of tokens to the underlying
     //  model and marks the model as prepared afterwards.
     inline void TopicModel::prepareModel(bool isHdp, bool isIdf) {
         if(this->isPrepared) {
             // the model is prepared only once
             return;
         }

         //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
         DATA_TOPICMODEL_CALL(
                 isHdp,
                 isIdf,
                 prepare,
                 true,
                 this->minTokenCf,
                 this->minTokenDf,
                 this->removeTopNTokens
         );

         this->isPrepared = true;
     }

     // train the model selected by isHdp and isIdf (without further checking)
     //
     // Runs the given number of iterations on the given number of threads,
     //  using the default parallelization scheme of the underlying API.
     inline void TopicModel::trainModel(
             bool isHdp,
             bool isIdf,
             std::size_t iterations,
             std::size_t threads
     ) {
         //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
         DATA_TOPICMODEL_CALL(isHdp, isIdf, train, iterations, threads, tomoto::ParallelScheme::default_);
     }

     // load model information after reading model from file
     //
     // The given data is read as Python pickle dictionary. Settings missing
     //  in the dictionary are reset to zero (or an empty string).
     //
     // If the loaded model has already been trained, i.e. its global step is
     //  larger than zero, it is marked as prepared and its training is
     //  initialized without running any iteration. The version of the library
     //  the model has been trained with, as read from the file, is kept.
     inline void TopicModel::loadModelInformation(
             bool isHdp,
             bool isIdf,
             const std::vector<std::uint8_t>& data
     ) {
         // get model information from a dictionary generated by reading Python pickle data
         PickleDict dict(data);

         TopicModel::numberFromDict(dict, "min_cf", this->minTokenCf);
         TopicModel::numberFromDict(dict, "min_df", this->minTokenDf);
         TopicModel::numberFromDict(dict, "rm_top", this->removeTopNTokens);
         TopicModel::numberFromDict(dict, "initial_k", this->numberOfInitialTopics); /* HDP only*/
         TopicModel::numberFromDict(dict, "k", this->fixedNumberOfTopics); /* LDA only */
         TopicModel::numberFromDict(dict, "seed", this->seed);

         TopicModel::floatFromDict(dict, "alpha", this->initialAlpha);
         TopicModel::floatFromDict(dict, "eta", this->initialEta);
         TopicModel::floatFromDict(dict, "gamma", this->initialGamma); /* HDP only */

         TopicModel::stringFromDict(dict, "version", this->trainedWithVersion);

         // check whether model has been trained
         std::size_t iterations{};

         //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
         DATA_TOPICMODEL_RETRIEVE_NOARGS(iterations, isHdp, isIdf, getGlobalStep);

         this->hasDocs = true;

         if(iterations > 0) {
             this->isPrepared = true;

             // initialize the training with zero iterations and one thread
             //  (startTraining() is not used, because it would overwrite the
             //  version read from the file with the current version of tomoto)
             this->trainModel(isHdp, isIdf, 0, 1);
         }
     }

     // write model information for writing the model to file
     //
     // Fills a dictionary with the term weighting scheme and the settings
     //  stored in this instance, and writes it as Python pickle data to
     //  dataTo. HDP models store "initial_k" and "gamma", LDA models "k".
     inline void TopicModel::writeModelInformation(
             bool isHdp,
             bool isIdf,
             std::vector<std::uint8_t>& dataTo
     ) const {
         PickleDict dict;

         // term weighting scheme
         const auto termWeight{
             isIdf ? tomoto::TermWeight::idf : tomoto::TermWeight::one
         };

         dict.setNumber("tw", static_cast<std::int64_t>(termWeight));

         // removal of tokens
         dict.setNumber("min_cf", this->minTokenCf);
         dict.setNumber("min_df", this->minTokenDf);
         dict.setNumber("rm_top", this->removeTopNTokens);

         // number of topics
         if(!isHdp) {
             dict.setNumber("k", this->fixedNumberOfTopics);
         }
         else {
             dict.setNumber("initial_k", this->numberOfInitialTopics);
         }

         dict.setNumber("seed", this->seed);

         // initial hyperparameters
         dict.setFloat("alpha", this->initialAlpha);
         dict.setFloat("eta", this->initialEta);

         if(isHdp) {
             dict.setFloat("gamma", this->initialGamma);
         }

         dict.setString("version", this->trainedWithVersion);

         // write dictionary as Python pickle data
         dict.writeTo(dataTo);
     }

     // get inferred topics from document pointer (without further checking)
     //
     // The document is cast to the document type that belongs to the model
     //  selected by isHdp and isIdf, and its topics are retrieved from that
     //  model. The topics of HDP models are passed through removeDeadTopics().
     //
     // NOTE: The results of the dynamic_casts are dereferenced without being
     //  checked, i.e. doc must not be null and needs to be a document of the
     //  selected model.
     inline std::vector<float> TopicModel::getInferredTopics(
             bool isHdp,
             bool isIdf,
             const tomoto::DocumentBase * doc
     ) const {
         if(isHdp) {
             if(isIdf) {
                 return TopicModel::removeDeadTopics(
                         this->hdpModelIdf->getTopicsByDoc(
                                 *dynamic_cast<const tomoto::DocumentHDP<tomoto::TermWeight::idf> *>(doc)
                         ),
                         this->hdpModelIdf
                 );
             }

             return TopicModel::removeDeadTopics(
                     this->hdpModel->getTopicsByDoc(
                             *dynamic_cast<const tomoto::DocumentHDP<tomoto::TermWeight::one> *>(doc)
                     ),
                     this->hdpModel
             );
         }

         if(isIdf) {
             return this->ldaModelIdf->getTopicsByDoc(
                     *dynamic_cast<const tomoto::DocumentLDA<tomoto::TermWeight::idf> *>(doc)
             );
         }

         return this->ldaModel->getTopicsByDoc(
                 *dynamic_cast<const tomoto::DocumentLDA<tomoto::TermWeight::one> *>(doc)
         );
     }

     // get an untyped const pointer to the model selected by isHdp and isIdf
     //
     // Returns a null pointer if the selected model does not exist.
     inline const void * TopicModel::get(bool isHdp, bool isIdf) const {
         if(isHdp && isIdf) {
             return this->hdpModelIdf.get();
         }

         if(isHdp) {
             return this->hdpModel.get();
         }

         if(isIdf) {
             return this->ldaModelIdf.get();
         }

         return this->ldaModel.get();
     }

     // create a raw document for the underlying API
     //
     // The document receives copies of numTokens tokens, starting at
     //  index firstToken, and the given name as its unique ID.
     //
     // Throws std::out_of_range if a token index is outside of tokens.
     inline tomoto::RawDoc TopicModel::createDocument(
             const std::string& name,
             const std::vector<std::string>& tokens,
             std::size_t firstToken,
             std::size_t numTokens
     ) {
         tomoto::RawDoc result;

         result.rawWords.reserve(numTokens);

         // copy the requested range of tokens (with bounds checking)
         const auto endOfRange{firstToken + numTokens};
         auto current{firstToken};

         while(current < endOfRange) {
             result.rawWords.emplace_back(tokens.at(current));

             ++current;
         }

         // share document name
         result.docUid = tomoto::SharedString(name);

         return result;
     }

     // check first bytes of the topic model file (indicating the type of the model)
     inline void TopicModel::readModelFileHead(std::istream& in, const std::string& fileName) {
         std::array<char, modelFileHead.size()> headBytes{};

         in.read(headBytes.data(), modelFileHead.size());

         if(!TopicModel::bytesEqual(headBytes, modelFileHead)) {
             throw Exception(
                     "TopicModel::load():"
                     " Invalid model file or unsupported model type in '"
                     + fileName
                     + "' (first bytes do not match tomoto's LDA model format: '"
                     + TopicModel::bytesToString(headBytes)
                     + "')"
             );
         }
     }

     // check and read term weighting scheme from topic model file
     //
     // Reads the bytes determining the term weighting scheme from the
     //  current position of the stream and sets isIdfTo accordingly.
     //
     // Throws an Exception, containing the file name and the bytes read,
     //  if the bytes match neither of the two supported schemes.
     inline void TopicModel::readModelFileTermWeighting(std::istream& in, const std::string& fileName, bool& isIdfTo) {
         std::array<char, modelFileTermWeightingLen> twBytes{};

         in.read(twBytes.data(), modelFileTermWeightingLen);

         if(TopicModel::bytesEqual(twBytes, modelFileTermWeightingOne)) {
             isIdfTo = false;
         }
         else if(TopicModel::bytesEqual(twBytes, modelFileTermWeightingIdf)) {
             isIdfTo = true;
         }
         else {
             // the bytes read are put in quotes, as in the other error messages
             throw Exception(
                     "TopicModel::load():"
                     " Invalid model file or unsupported term weighting scheme in '"
                     + fileName
                     + "' (term weighting scheme does not match 'one' or 'idf': '"
                     + TopicModel::bytesToString(twBytes)
                     + "')"
             );
         }
     }

     // check file type of topic model file
     inline void TopicModel::readModelFileType(std::istream& in, const std::string& fileName) {
         std::array<char, modelFileType.size()> typeBytes{};

         in.read(typeBytes.data(), modelFileType.size());

         if(!TopicModel::bytesEqual(typeBytes, modelFileType)) {
             throw Exception(
                     "TopicModel::load():"
                     " Invalid model file '"
                     + fileName
                     + "' (type does not match tomoto's model format: '"
                     + TopicModel::bytesToString(typeBytes)
                     + "')"
             );
         }
     }

     // reset an input stream and go back to its start
     //
     // The state flags are cleared before seeking, so that the stream
     //  is rewound even if a previous operation has set an error flag.
     inline void TopicModel::resetStream(std::istream& in) {
         in.clear();
         in.seekg(0, std::ios_base::beg);
     }

     // get a number from the Pickle dictionary
     //
     // valueTo is set to zero if the dictionary contains no number
     //  for the given key.
     inline void TopicModel::numberFromDict(const PickleDict& dict, const std::string& key, std::size_t& valueTo) {
         const auto number{dict.getNumber(key)};

         valueTo = number ? static_cast<std::size_t>(*number) : 0;
     }

     // get a floating-point number from the Pickle dictionary
     //
     // valueTo is set to zero if the dictionary contains no
     //  floating-point number for the given key.
     inline void TopicModel::floatFromDict(const PickleDict& dict, const std::string& key, float& valueTo) {
         const auto number{dict.getFloat(key)};

         valueTo = number ? static_cast<float>(*number) : 0.F;
     }

     // get a string from the Pickle dictionary
     //
     // The memory of valueTo is freed if the dictionary contains no
     //  string for the given key.
     inline void TopicModel::stringFromDict(const PickleDict& dict, const std::string& key, std::string& valueTo) {
         auto found{dict.getString(key)};

         if(!found) {
             Helper::Memory::free(valueTo);

             return;
         }

         // the local copy is not needed anymore and can be moved from
         valueTo = std::move(*found);
     }

     // validate the results added last
     //
     // If the values of the last entry in results are all NaN (or if there
     //  are no values at all), the entry is removed from the results and
     //  the given iterator is erased from the set of finished documents.
     //  The results must not be empty.
     inline void TopicModel::validateLastResults(
             std::vector<std::pair<std::string, std::vector<float>>>& results,
             std::unordered_set<std::string>& done,
             const std::unordered_set<std::string>::const_iterator& inserted
     ) {
         const auto& values{results.back().second};

         const bool hasNumber{
             std::any_of(
                     values.begin(),
                     values.end(),
                     [](const auto value) {
                         return !std::isnan(value);
                     }
             )
         };

         if(hasNumber) {
             return;
         }

         // nothing but NaN: discard the results
         results.pop_back();
         done.erase(inserted);
     }

 } /* namespace crawlservpp::Data */

 #endif /* DATA_TOPICMODEL_HPP_ */
crawlservpp::Data::TopicModel::getDocuments
std::unordered_map< std::string, std::size_t > getDocuments() const
Gets a map with the documents and their indices from the model.
Definition: TopicModel.hpp:607

crawlservpp::Data::modelFileTermWeightingIdf
constexpr auto modelFileTermWeightingIdf
The term weighting scheme IDF (tf-idf) as saved in a model file.
Definition: TopicModel.hpp:216

crawlservpp::Struct::TopicModelInfo::minCollectionFrequency
std::size_t minCollectionFrequency
Minimum collection frequency of tokens.
Definition: TopicModelInfo.hpp:124

crawlservpp::Data::modelFileTermWeightingLen
constexpr auto modelFileTermWeightingLen
The number of bytes determining the term weighting scheme in a model file.
Definition: TopicModel.hpp:210

crawlservpp::Data::TopicModel::save
std::size_t save(const std::string &fileName, bool full) const
Writes the model to a file.
Definition: TopicModel.hpp:1904

crawlservpp::Data::TopicModel::getNumberOfTokens
std::size_t getNumberOfTokens() const
Gets the number of tokens after training has begun.
Definition: TopicModel.hpp:700

crawlservpp::Data::TopicModel::train
void train(std::size_t iterations, std::size_t threads)
Trains the underlying HDP or LDA model.
Definition: TopicModel.hpp:1707

crawlservpp::Struct::TopicModelInfo::numberOfBurnInSteps
std::size_t numberOfBurnInSteps
The number of initially skipped, i.e. burn-in, steps.
Definition: TopicModelInfo.hpp:108

crawlservpp::Struct::TopicModelInfo::numberOfTopics
std::size_t numberOfTopics
The number of topics.
Definition: TopicModelInfo.hpp:171

crawlservpp::Struct::TopicModelInfo::tokenEntropy
double tokenEntropy
The entropy of tokens in the model.
Definition: TopicModelInfo.hpp:95

crawlservpp::Struct::TopicModelInfo::sizeOfVocabulary
std::size_t sizeOfVocabulary
Definition: TopicModelInfo.hpp:89

crawlservpp::Data::TopicModel::getTopicTopNLabels
std::vector< std::pair< std::string, float > > getTopicTopNLabels(std::size_t topic, std::size_t n) const
Gets the top N labels for the specified topic.
Definition: TopicModel.hpp:1134

crawlservpp::Data::PickleDict::getFloat
std::optional< double > getFloat(const std::string &key) const
Gets a floating-point number from the dictionary, if available.
Definition: PickleDict.hpp:426

crawlservpp::Data::TopicModel::setLabelingOptions
void setLabelingOptions(bool activate, std::size_t minCf, std::size_t minDf, std::size_t minLength, std::size_t maxLength, std::size_t maxCandidates, float smoothing, float mu, std::size_t windowSize)
Sets the options for automated topic labeling.
Definition: TopicModel.hpp:1582

crawlservpp::Data::TopicModel::setFixedNumberOfTopics
void setFixedNumberOfTopics(std::size_t k)
Sets the fixed number of topics.
Definition: TopicModel.hpp:1394

crawlservpp::Struct::TopicModelInfo
Structure containing information about the currently trained Hierarchical Dirichlet Process (HDP) mod...
Definition: TopicModelInfo.hpp:72

crawlservpp::Data::TopicModel::getModelInfo
TopicModelInfo getModelInfo() const
Gets information about the model after training.
Definition: TopicModel.hpp:1322

crawlservpp::Data::PickleDict
Simple Python pickle dictionary.
Definition: PickleDict.hpp:136

crawlservpp::Data::TopicModel::setParameterOptimizationInterval
void setParameterOptimizationInterval(std::size_t interval)
Sets the interval for parameter optimization, in iterations.
Definition: TopicModel.hpp:1521

crawlservpp::Struct::TopicModelInfo::numberOfTopTokensToBeRemoved
std::size_t numberOfTopTokensToBeRemoved
The number of top tokens to be removed.
Definition: TopicModelInfo.hpp:130

MAIN_EXCEPTION_CLASS
#define MAIN_EXCEPTION_CLASS()
Macro used to easily define classes for general exceptions.
Definition: Exception.hpp:50

crawlservpp::Data::TopicModel::getIterations
std::size_t getIterations() const
Get the number of training iterations performed so far.
Definition: TopicModel.hpp:740

crawlservpp::Data::TopicModel::getTopics
std::vector< std::size_t > getTopics() const
Gets the IDs of the topics.
Definition: TopicModel.hpp:931

crawlservpp::Struct::TopicModelInfo::sizeOfVocabularyUsed
std::size_t sizeOfVocabularyUsed
Definition: TopicModelInfo.hpp:92

crawlservpp::Data::modelFileHead
constexpr auto modelFileHead
The beginning of a valid model file containing a LDA (or HDP) model.
Definition: TopicModel.hpp:207

crawlservpp::Data::TopicModel::label
void label(std::size_t threads)
Labels the resulting topics.
Definition: TopicModel.hpp:1736

crawlservpp::Data::TopicModel::setInitialParameters
void setInitialParameters(std::size_t initialTopics, float alpha, float eta, float gamma)
Sets the initial parameters for the model.
Definition: TopicModel.hpp:1495

crawlservpp::Data::TopicModel::startTraining
void startTraining()
Starts training without performing any iteration.
Definition: TopicModel.hpp:1680

crawlservpp::Struct::TopicModelInfo::initialAlpha
float initialAlpha
The initial concentration coefficient of the Dirichlet Process for document–table.
Definition: TopicModelInfo.hpp:136

crawlservpp::Data::PickleDict::setNumber
void setNumber(const std::string &key, std::int64_t value)
Adds or overwrites a number in the dictionary.
Definition: PickleDict.hpp:467

crawlservpp::Data::TopicModel::getDocumentId
std::size_t getDocumentId(const std::string &name) const
Gets the ID of the document with the specified name.
Definition: TopicModel.hpp:842

crawlservpp::Struct::TopicModelInfo::removedTokens
std::vector< std::string > removedTokens
The top tokens removed before training.
Definition: TopicModelInfo.hpp:98

crawlservpp::Data::PickleDict::setFloat
void setFloat(const std::string &key, double value)
Adds or overwrites a floating-point number in the dictionary.
Definition: PickleDict.hpp:480

crawlservpp::Struct::TopicModelInfo::alpha
float alpha
The concentration coefficient of the Dirichlet Process for document-table (HDP only).
Definition: TopicModelInfo.hpp:155

crawlservpp::Data::TopicModel::getNumberOfDocuments
std::size_t getNumberOfDocuments() const
Gets the number of added documents after training has begun.
Definition: TopicModel.hpp:587

crawlservpp::Data::TopicModel::getBurnInIterations
std::size_t getBurnInIterations() const
Get the number of skipped iterations.
Definition: TopicModel.hpp:720

crawlservpp::Data::TopicModel::Exception
Class for topic modelling-specific exceptions.
Definition: TopicModel.hpp:376

crawlservpp::Helper::Versions::getTomotoVersion
std::string getTomotoVersion()
Gets the version of the tomoto library if available.
Definition: Versions.hpp:341

crawlservpp::Struct::TopicModelInfo::numberOfTokens
std::size_t numberOfTokens
The number of tokens in the model.
Definition: TopicModelInfo.hpp:86

crawlservpp::Struct::TopicModelInfo::numberOfIterations
std::size_t numberOfIterations
The number of iterations performed.
Definition: TopicModelInfo.hpp:105

crawlservpp::Helper::Container::bytes
static T::size_type bytes(const T &container)
Returns the number of bytes in an iterable container.
Definition: Container.hpp:144

crawlservpp::Data::TopicModel::getParameterOptimizationInterval
std::size_t getParameterOptimizationInterval() const
Gets the interval for parameter optimization, in iterations.
Definition: TopicModel.hpp:760

crawlservpp::Data::PickleDict::writeTo
void writeTo(Bytes &dataTo) const
Writes dictionary to Python pickle data.
Definition: PickleDict.hpp:565

crawlservpp::Data::PickleDict::getNumber
std::optional< std::int64_t > getNumber(const std::string &key) const
Gets a number from the dictionary, if available.
Definition: PickleDict.hpp:406

crawlservpp::Data::defaultEta
constexpr auto defaultEta
The default hyperparameter for the Dirichlet distribution for topic-token.
Definition: TopicModel.hpp:194

crawlservpp::Struct::TopicModelInfo::minDocumentFrequency
std::size_t minDocumentFrequency
Minimum document frequency of tokens.
Definition: TopicModelInfo.hpp:127

crawlservpp::Data::TopicModel::getLogLikelihoodPerToken
double getLogLikelihoodPerToken() const
Gets the log-likelihood per token.
Definition: TopicModel.hpp:1012

crawlservpp::Data::TopicModel::setTokenRemoval
void setTokenRemoval(std::size_t collectionFrequency, std::size_t documentFrequency, std::size_t fixedNumberOfTopTokens)
Sets which (un)common tokens to remove before training.
Definition: TopicModel.hpp:1460

crawlservpp::Data::defaultOptimizationInterval
constexpr auto defaultOptimizationInterval
The default interval for optimizing the parameters, in iterations.
Definition: TopicModel.hpp:204

crawlservpp::Data::ldaModelName
constexpr auto ldaModelName
The name of the LDA model.
Definition: TopicModel.hpp:185

crawlservpp::Struct::TopicModelInfo::modelName
std::string modelName
The name of the model.
Definition: TopicModelInfo.hpp:77

crawlservpp::Data::TopicModel::setRandomNumberGenerationSeed
void setRandomNumberGenerationSeed(std::size_t newSeed)
Sets the seed for random number generation.
Definition: TopicModel.hpp:1538

crawlservpp::Struct::TopicModelInfo::numberOfTables
std::size_t numberOfTables
The number of tables.
Definition: TopicModelInfo.hpp:178

crawlservpp::Struct::TopicModelInfo::initialEta
float initialEta
The initial hyperparameter for the Dirichlet distribution for topic–token.
Definition: TopicModelInfo.hpp:139

crawlservpp::Data::TopicModel::getTokenEntropy
double getTokenEntropy() const
Gets the token entropy after training.
Definition: TopicModel.hpp:1031

crawlservpp::Struct::TopicModelInfo::weighting
std::string weighting
Term weighting mode as string.
Definition: TopicModelInfo.hpp:121

crawlservpp::Data::TopicModel::getVocabulary
const std::vector< std::string > & getVocabulary() const
Gets the complete dictionary used by the model.
Definition: TopicModel.hpp:680

DATA_TOPICMODEL_RETURN
#define DATA_TOPICMODEL_RETURN(isHdp, isIdf, function)
Definition: TopicModel.hpp:155

crawlservpp::Data::PickleDict::getString
std::optional< std::string > getString(const std::string &key) const
Gets a string from the dictionary, if available.
Definition: PickleDict.hpp:446

crawlservpp::Data::TopicModel::getVocabularySize
std::size_t getVocabularySize() const
Gets the number of distinct tokens after training has begun.
Definition: TopicModel.hpp:639

crawlservpp::Data::TopicModel::getModelName
std::string_view getModelName() const
Gets the name of the current model.
Definition: TopicModel.hpp:799

crawlservpp::Data::defaultNumberOfInitialTopics
constexpr auto defaultNumberOfInitialTopics
The initial number of topics by default.
Definition: TopicModel.hpp:188

crawlservpp::Data::defaultGamma
constexpr auto defaultGamma
The default concentration coefficient of the Dirichlet Process for table-topic.
Definition: TopicModel.hpp:201

DATA_TOPICMODEL_RETRIEVE
#define DATA_TOPICMODEL_RETRIEVE(x, isHdp, isIdf, function,...)
Definition: TopicModel.hpp:135

crawlservpp::Data::modelFileTermWeightingOne
constexpr auto modelFileTermWeightingOne
The term weighting scheme ONE as saved in a model file.
Definition: TopicModel.hpp:213

crawlservpp::Struct::TopicModelInfo::alphas
std::vector< float > alphas
The Dirichlet priors on the per-document topic distributions (LDA only).
Definition: TopicModelInfo.hpp:158

crawlservpp::Data::TopicModel::getDocumentsTopics
std::vector< std::pair< std::string, std::vector< float > > > getDocumentsTopics(std::unordered_set< std::string > &done) const
Gets the topic distributions of all documents the model has been trained on, if available.
Definition: TopicModel.hpp:1180

crawlservpp::Struct::TopicModelInfo::seed
std::size_t seed
The initial seed for random number generation.
Definition: TopicModelInfo.hpp:145

crawlservpp::Data::hdpModelName
constexpr auto hdpModelName
The name of the HDP model.
Definition: TopicModel.hpp:182

crawlservpp::Struct::TopicModelInfo::numberOfDocuments
std::size_t numberOfDocuments
The number of documents in the model.
Definition: TopicModelInfo.hpp:83

crawlservpp::Data::TopicModel::setBurnInIteration
void setBurnInIteration(std::size_t skipIterations)
Sets the number of iterations that will be skipped at the beginning of training.
Definition: TopicModel.hpp:1428

crawlservpp::Data::TopicModel::load
std::size_t load(const std::string &fileName)
Loads a model from a file.
Definition: TopicModel.hpp:1804

crawlservpp::Struct::TopicModelInfo::trainedWithVersion
std::string trainedWithVersion
The version of the modeller the model has been trained with.
Definition: TopicModelInfo.hpp:148

crawlservpp::Data::TopicModel
Topic modeller.
Definition: TopicModel.hpp:257

crawlservpp::Data::TopicModel::clear
void clear(bool labelingOptions)
Clears the model, resets its settings and frees memory.
Definition: TopicModel.hpp:1949

crawlservpp::Struct::TopicModelInfo::initialGamma
float initialGamma
The initial concentration coefficient of the Dirichlet Process for table–topic.
Definition: TopicModelInfo.hpp:142

crawlservpp::Struct::TopicModelInfo::logLikelihoodPerToken
double logLikelihoodPerToken
The log-likelihood per token.
Definition: TopicModelInfo.hpp:114

crawlservpp::Data::modelFileType
constexpr auto modelFileType
The tomoto file format as saved in a model file (after model head and term weighting scheme)...
Definition: TopicModel.hpp:219

PickleDict.hpp

crawlservpp::Data::TopicModel::getTopicsSorted
std::vector< std::pair< std::size_t, std::uint64_t > > getTopicsSorted() const
Gets the IDs and counts of the topics, sorted by count.
Definition: TopicModel.hpp:972

crawlservpp::Struct::TopicModelInfo::eta
float eta
The Dirichlet prior on the per-topic token distribution (HDP only).
Definition: TopicModelInfo.hpp:161

crawlservpp::Data::defaultAlpha
constexpr auto defaultAlpha
The default concentration coefficient of the Dirichlet Process for document-table. ...
Definition: TopicModel.hpp:191

crawlservpp::Struct::TopicModelInfo::modelVersion
std::string modelVersion
The version of the model (as string).
Definition: TopicModelInfo.hpp:80

crawlservpp::Data::TopicModel::getTermWeighting
std::string_view getTermWeighting() const
Gets the term weighting mode of the current model.
Definition: TopicModel.hpp:821

crawlservpp::Data::TopicModel::setUseIdf
void setUseIdf(bool idf)
Sets whether to use IDF term weighting.
Definition: TopicModel.hpp:1411

crawlservpp::Data
Namespace for different types of data.

DATA_TOPICMODEL_CALL
#define DATA_TOPICMODEL_CALL(isHdp, isIdf, function,...)
Definition: TopicModel.hpp:96

crawlservpp::Data::TopicModel::getRemovedTokens
std::vector< std::string > getRemovedTokens() const
Gets the most common tokens (i.e. stopwords) that have been removed.
Definition: TopicModel.hpp:875

crawlservpp::Struct::TopicModelInfo::gamma
float gamma
The concentration coefficient of the Dirichlet Process for table-topic.
Definition: TopicModelInfo.hpp:168

crawlservpp::Data::TopicModel::getOriginalVocabularySize
std::size_t getOriginalVocabularySize() const
Gets the number of distinct tokens before training.
Definition: TopicModel.hpp:659

crawlservpp::Helper::Memory::free
static void free(T &target)
Frees memory by swapping.
Definition: Memory.hpp:42

crawlservpp::Data::TopicModel::getNumberOfTopics
std::size_t getNumberOfTopics() const
Gets the number of topics.
Definition: TopicModel.hpp:906

crawlservpp::Data::TopicModel::getRandomNumberGenerationSeed
std::size_t getRandomNumberGenerationSeed() const
Gets the seed used for random number generation.
Definition: TopicModel.hpp:780

crawlservpp::Struct::TopicModelInfo::numberOfInitialTopics
std::size_t numberOfInitialTopics
The initial number of topics, which will be adjusted for the data during training.
Definition: TopicModelInfo.hpp:133

DATA_TOPICMODEL_RETRIEVE_NOARGS
#define DATA_TOPICMODEL_RETRIEVE_NOARGS(x, isHdp, isIdf, function)
Definition: TopicModel.hpp:116

crawlservpp::Struct::TopicModelInfo::optimizationInterval
std::size_t optimizationInterval
The optimization interval.
Definition: TopicModelInfo.hpp:111

crawlservpp::Data::TopicModel::getTopicTopNTokens
std::vector< std::pair< std::string, float > > getTopicTopNTokens(std::size_t topic, std::size_t n) const
Gets the top N tokens for the specified topic.
Definition: TopicModel.hpp:1092

crawlservpp::Data::TopicModel::addDocument
void addDocument(const std::string &name, const std::vector< std::string > &tokens, std::size_t firstToken, std::size_t numTokens)
Adds a document from a tokenized corpus.
Definition: TopicModel.hpp:1634

crawlservpp::Data::PickleDict::setString
void setString(const std::string &key, const std::string &value)
Adds or overwrites a string in the dictionary.
Definition: PickleDict.hpp:493