crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
TopicModelInfo.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2021 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * TopicModelInfo.hpp
24  *
25  * Structure with information about
26  * Hierarchical Dirichlet Process (HDP) models.
27  *
28  * Using tomoto, the underlying C++ API of tomotopy, see:
29  * https://bab2min.github.io/tomotopy/
30  *
31  * If you use the HDP topic modelling algorithm, please cite:
32  *
33  * Teh, Y. W., Jordan, M. I., Beal, M. J., & Blei, D. M. (2005). Sharing
34  * clusters among related groups: Hierarchical Dirichlet processes.
35  * In Advances in neural information processing systems, 1385–1392.
36  *
37  * Newman, D., Asuncion, A., Smyth, P., & Welling, M. (2009). Distributed
38  * algorithms for topic models. Journal of Machine Learning Research,
39  * 10 (Aug), 1801–1828.
40  *
41  * If you use the LDA topic modelling algorithm, please cite:
42  *
43  * Blei, D. M., Ng, A. Y., & Jordan, M. I. (2003). Latent dirichlet
44  * allocation. Journal of machine Learning research, 3(Jan), 993–1022.
45  *
46  * Newman, D., Asuncion, A., Smyth, P., & Welling, M. (2009). Distributed
47  * algorithms for topic models. Journal of Machine Learning Research,
48  * 10 (Aug), 1801–1828.
49  *
50  * If you use automated topic labeling, please cite:
51  *
52  * Mei, Q., Shen, X., & Zhai, C. (2007). Automatic labeling of multinomial
53  * topic models. In Proceedings of the 13th ACM SIGKDD International
54  * Conference on Knowledge Discovery and Data Mining, 490–499.
55  *
56  * Created on: Feb 2, 2021
57  * Author: ans
58  */
59 
60 #ifndef STRUCT_HDPMODELINFO_HPP_
61 #define STRUCT_HDPMODELINFO_HPP_
62 
#include <cstddef>	// std::size_t
#include <cstdint>	// std::uint8_t
#include <queue>	// std::queue
#include <sstream>	// std::ostringstream
#include <string>	// std::string, std::to_string
#include <vector>	// std::vector
68 
69 namespace crawlservpp::Struct {
70 
72  struct TopicModelInfo {
75 
77  std::string modelName;
78 
80  std::string modelVersion;
81 
83  std::size_t numberOfDocuments{};
84 
86  std::size_t numberOfTokens{};
87 
88  // The number of unique tokens in the model.
89  std::size_t sizeOfVocabulary{};
90 
91  // The number of unique tokens used in the model.
92  std::size_t sizeOfVocabularyUsed{};
93 
95  double tokenEntropy{};
96 
98  std::vector<std::string> removedTokens;
99 
103 
105  std::size_t numberOfIterations{};
106 
108  std::size_t numberOfBurnInSteps{};
109 
111  std::size_t optimizationInterval{};
112 
115 
119 
121  std::string weighting;
122 
124  std::size_t minCollectionFrequency{};
125 
127  std::size_t minDocumentFrequency{};
128 
131 
133  std::size_t numberOfInitialTopics{};
134 
136  float initialAlpha{};
137 
139  float initialEta{};
140 
142  float initialGamma{};
143 
145  std::size_t seed{};
146 
148  std::string trainedWithVersion{};
149 
153 
155  float alpha{};
156 
158  std::vector<float> alphas;
159 
161  float eta{};
162 
164 
168  float gamma{};
169 
171  std::size_t numberOfTopics{};
172 
174 
178  std::size_t numberOfTables{};
179 
183 
185  [[nodiscard]] std::queue<std::string> toQueueOfStrings() const {
186  std::queue<std::string> result;
187 
188  result.emplace("<Basic Info>");
189  result.emplace(
190  "| "
191  + this->modelName
192  + " (current version: "
193  + this->modelVersion
194  + ")"
195  );
196  result.emplace(
197  "| "
198  +
199  std::to_string(this->numberOfDocuments)
200  + " docs, "
201  + std::to_string(this->numberOfTokens)
202  + " tokens"
203  );
204  result.emplace(
205  "| Total Vocabs: "
206  + std::to_string(this->sizeOfVocabulary)
207  + ", Used Vocabs: "
208  + std::to_string(this->sizeOfVocabularyUsed)
209  );
210  result.emplace(
211  "| Entropy of tokens: "
212  + std::to_string(this->tokenEntropy)
213  );
214 
215  std::string removed{"| Removed Vocabs:"};
216 
217  if(this->removedTokens.empty()) {
218  removed += " <NA>";
219  }
220  else {
221  for(const auto& token : this->removedTokens) {
222  removed.push_back(' ');
223 
224  removed += token;
225  }
226  }
227 
228  result.emplace(removed);
229  result.emplace("|");
230  result.emplace("<Training Info>");
231  result.emplace(
232  "| Iterations: "
233  + std::to_string(this->numberOfIterations)
234  + ", Burn-in steps: "
235  + std::to_string(this->numberOfBurnInSteps)
236  );
237  result.emplace(
238  "| Optimization Interval: "
239  + std::to_string(this->optimizationInterval)
240  );
241  result.emplace(
242  "| Log-likelihood per token: "
243  + std::to_string(this->logLikelihoodPerToken)
244  );
245  result.emplace("|");
246  result.emplace("<Initial Parameters>");
247  result.emplace("| tw: " + this->weighting);
248  result.emplace(
249  "| min_cf: "
250  + std::to_string(this->minCollectionFrequency)
251  + " (minimum collection frequency of tokens)"
252  );
253  result.emplace(
254  "| min_df: "
255  + std::to_string(this->minDocumentFrequency)
256  + " (minimum document frequency of tokens)"
257  );
258  result.emplace(
259  "| rm_top: "
260  + std::to_string(this->numberOfTopTokensToBeRemoved)
261  + " (the number of top tokens to be removed)"
262  );
263  if(this->numberOfInitialTopics > 0) {
264  result.emplace(
265  "| initial_k: "
266  + std::to_string(this->numberOfInitialTopics)
267  + " (the initial number of topics between 2 ~ 32767,"
268  " which will be adjusted for data during training)"
269  );
270  }
271  else {
272  result.emplace(
273  "| k: "
274  + std::to_string(this->numberOfTopics)
275  + " (the number of topics between 1 ~ 32767)"
276  );
277  }
278  result.emplace(
279  "| alpha: "
280  + std::to_string(this->initialAlpha)
281  + " (concentration coeficient of Dirichlet Process for document-topic)"
282  );
283  result.emplace(
284  "| eta: "
285  + std::to_string(this->initialEta)
286  + " (hyperparameter of Dirichlet distribution for topic-token)"
287  );
288 
289  if(this->initialGamma > 0.) { /* only used by HDP */
290  result.emplace(
291  "| gamma: "
292  + std::to_string(this->initialGamma)
293  + " (concentration coeficient of Dirichlet Process for table-topic)"
294  );
295  }
296 
297  result.emplace(
298  "| seed: "
299  + std::to_string(this->seed)
300  + " (random seed)"
301  );
302  if(!(this->trainedWithVersion.empty())) {
303  result.emplace(
304  "| trained in version " + this->trainedWithVersion
305  );
306  }
307  result.emplace("|");
308  result.emplace("<Parameters>");
309  if(this->alphas.empty()) {
310  result.emplace("| alpha (concentration coeficient of Dirichlet Process for document-table)");
311  result.emplace("| " + std::to_string(this->alpha));
312  }
313  else { /* only used by LDA */
314  result.emplace("| alpha (Dirichlet prior on the per-document topic distributions)");
315 
316  constexpr uint8_t lineBreakAfter{6};
317  std::string line{"| ["};
318  std::uint8_t lineN{};
319 
320  for(const auto a : this->alphas) {
321  if(lineN == lineBreakAfter) {
322  // remove last space and add line
323  line.pop_back();
324 
325  result.emplace(line);
326 
327  line = "| ";
328 
329  lineN = 0;
330  }
331 
332  line += std::to_string(a) + " ";
333 
334  ++lineN;
335  }
336 
337  line.back() = ']';
338 
339  result.emplace(line);
340  }
341  result.emplace("| eta (Dirichlet prior on the per-topic token distribution)");
342  result.emplace("| " + std::to_string(this->eta));
343 
344  if(gamma > 0.) { /* only used by HDP */
345  result.emplace("| gamma (concentration coeficient of Dirichlet Process for table-topic)");
346  result.emplace("| " + std::to_string(this->gamma));
347  }
348 
349  result.emplace("|");
350  result.emplace("| Number of Topics: " + std::to_string(this->numberOfTopics));
351 
352  if(this->numberOfTables > 0) { /* only used by HDP */
353  result.emplace("| Number of Tables: " + std::to_string(this->numberOfTables));
354  }
355 
356  return result;
357  }
358  };
359 
360 } /* namespace crawlservpp::Struct */
361 
362 #endif /* STRUCT_HDPMODELINFO_HPP_ */
std::size_t minCollectionFrequency
Minimum collection frequency of tokens.
Definition: TopicModelInfo.hpp:124
std::queue< std::string > toQueueOfStrings() const
Return queue with strings describing the information contained in the structure.
Definition: TopicModelInfo.hpp:185
std::size_t numberOfBurnInSteps
The number of initially skipped, i.e. burn-in, steps.
Definition: TopicModelInfo.hpp:108
std::size_t numberOfTopics
The number of topics.
Definition: TopicModelInfo.hpp:171
double tokenEntropy
The entropy of tokens in the model.
Definition: TopicModelInfo.hpp:95
std::size_t sizeOfVocabulary
Definition: TopicModelInfo.hpp:89
Structure containing information about the currently trained Hierarchical Dirichlet Process (HDP) model.
Definition: TopicModelInfo.hpp:72
std::size_t numberOfTopTokensToBeRemoved
The number of top tokens to be removed.
Definition: TopicModelInfo.hpp:130
std::size_t sizeOfVocabularyUsed
Definition: TopicModelInfo.hpp:92
float initialAlpha
The initial concentration coefficient of the Dirichlet Process for document–table.
Definition: TopicModelInfo.hpp:136
std::vector< std::string > removedTokens
The top tokens removed before training.
Definition: TopicModelInfo.hpp:98
float alpha
The concentration coefficient of the Dirichlet Process for document-table (HDP only).
Definition: TopicModelInfo.hpp:155
std::size_t numberOfTokens
The number of tokens in the model.
Definition: TopicModelInfo.hpp:86
std::size_t numberOfIterations
The number of iterations performed.
Definition: TopicModelInfo.hpp:105
std::size_t minDocumentFrequency
Minimum document frequency of tokens.
Definition: TopicModelInfo.hpp:127
std::string modelName
The name of the model.
Definition: TopicModelInfo.hpp:77
std::size_t numberOfTables
The number of tables.
Definition: TopicModelInfo.hpp:178
float initialEta
The initial hyperparameter for the Dirichlet distribution for topic–token.
Definition: TopicModelInfo.hpp:139
std::string weighting
Term weighting mode as string.
Definition: TopicModelInfo.hpp:121
std::vector< float > alphas
The Dirichlet priors on the per-document topic distributions (LDA only).
Definition: TopicModelInfo.hpp:158
std::size_t seed
The initial seed for random number generation.
Definition: TopicModelInfo.hpp:145
std::size_t numberOfDocuments
The number of documents in the model.
Definition: TopicModelInfo.hpp:83
std::string trainedWithVersion
The version of the modeller the model has been trained with.
Definition: TopicModelInfo.hpp:148
float initialGamma
The initial concentration coefficient of the Dirichlet Process for table–topic.
Definition: TopicModelInfo.hpp:142
double logLikelihoodPerToken
The log-likelihood per token.
Definition: TopicModelInfo.hpp:114
Namespace for data structures.
Definition: AlgoThreadProperties.hpp:43
float eta
The Dirichlet prior on the per-topic token distribution (HDP only).
Definition: TopicModelInfo.hpp:161
std::string modelVersion
The version of the model (as string).
Definition: TopicModelInfo.hpp:80
float gamma
The concentration coefficient of the Dirichlet Process for table-topic.
Definition: TopicModelInfo.hpp:168
std::size_t numberOfInitialTopics
The initial number of topics, which will be adjusted for the data during training.
Definition: TopicModelInfo.hpp:133
std::size_t optimizationInterval
The optimization interval.
Definition: TopicModelInfo.hpp:111