63 #ifndef DATA_TOPICMODEL_HPP_ 64 #define DATA_TOPICMODEL_HPP_ 68 #include "../Helper/FileSystem.hpp" 69 #include "../Helper/Memory.hpp" 70 #include "../Helper/SilentInclude/EigenRand.h" 71 #include "../Helper/SilentInclude/tomoto.h" 72 #include "../Helper/Versions.hpp" 73 #include "../Main/Exception.hpp" 74 #include "../Struct/TopicModelInfo.hpp" 88 #include <string_view> 89 #include <unordered_map> 90 #include <unordered_set> 96 #define DATA_TOPICMODEL_CALL(isHdp, isIdf, function, ...) \ 99 this->hdpModelIdf->function(__VA_ARGS__); \ 102 this->hdpModel->function(__VA_ARGS__); \ 107 this->ldaModelIdf->function(__VA_ARGS__); \ 110 this->ldaModel->function(__VA_ARGS__); \ 116 #define DATA_TOPICMODEL_RETRIEVE_NOARGS(x, isHdp, isIdf, function) \ 119 (x) = this->hdpModelIdf->function(); \ 122 (x) = this->hdpModel->function(); \ 127 (x) = this->ldaModelIdf->function(); \ 130 (x) = this->ldaModel->function(); \ 135 #define DATA_TOPICMODEL_RETRIEVE(x, isHdp, isIdf, function, ...) \ 138 (x) = this->hdpModelIdf->function(__VA_ARGS__); \ 141 (x) = this->hdpModel->function(__VA_ARGS__); \ 146 (x) = this->ldaModelIdf->function(__VA_ARGS__); \ 149 (x) = this->ldaModel->function(__VA_ARGS__); \ 155 #define DATA_TOPICMODEL_RETURN(isHdp, isIdf, function) \ 158 return this->hdpModelIdf->function(); \ 161 return this->hdpModel->function(); \ 165 return this->ldaModelIdf->function(); \ 168 return this->ldaModel->function(); 172 using std::string_view_literals::operator
""sv;
261 using HDPModel = tomoto::HDPModel<tomoto::TermWeight::one, tomoto::RandGen>;
262 using HDPModelIDF = tomoto::HDPModel<tomoto::TermWeight::idf, tomoto::RandGen>;
263 using LDAModel = tomoto::LDAModel<tomoto::TermWeight::one, tomoto::RandGen>;
264 using LDAModelIDF = tomoto::LDAModel<tomoto::TermWeight::idf, tomoto::RandGen>;
266 using FoRelevance = tomoto::label::FoRelevance;
267 using ITopicModel = tomoto::ITopicModel;
268 using PMIExtractor = tomoto::label::PMIExtractor;
275 [[nodiscard]] std::unordered_map<std::string, std::size_t>
getDocuments()
const;
278 [[nodiscard]]
const std::vector<std::string>&
getVocabulary()
const;
286 [[nodiscard]] std::size_t
getDocumentId(
const std::string& name)
const;
289 [[nodiscard]] std::vector<std::size_t>
getTopics()
const;
290 [[nodiscard]] std::vector<std::pair<std::size_t, std::uint64_t>>
getTopicsSorted()
const;
301 [[nodiscard]] std::vector<std::pair<std::string, std::vector<float>>>
getDocumentsTopics(
302 std::unordered_set<std::string>& done
305 const std::vector<std::vector<std::string>>& documents,
306 std::size_t maxIterations,
307 std::size_t numberOfWorkers
319 std::size_t collectionFrequency,
320 std::size_t documentFrequency,
321 std::size_t fixedNumberOfTopTokens
324 std::size_t initialTopics,
335 std::size_t minLength,
336 std::size_t maxLength,
337 std::size_t maxCandidates,
340 std::size_t windowSize
348 const std::string& name,
349 const std::vector<std::string>& tokens,
350 std::size_t firstToken,
351 std::size_t numTokens
355 std::size_t iterations,
358 void label(std::size_t threads);
364 std::size_t
load(
const std::string& fileName);
365 std::size_t
save(
const std::string& fileName,
bool full)
const;
371 void clear(
bool labelingOptions);
380 std::unique_ptr<HDPModel> hdpModel;
381 std::unique_ptr<HDPModelIDF> hdpModelIdf;
382 std::unique_ptr<LDAModel> ldaModel;
383 std::unique_ptr<LDAModelIDF> ldaModelIdf;
386 std::vector<std::string> docNames;
390 bool isPrepared{
false};
391 std::size_t workersUsed{};
394 std::size_t fixedNumberOfTopics{};
395 bool isUseIdf{
false};
400 std::size_t seed{std::random_device{}()};
401 std::size_t minTokenCf{};
402 std::size_t minTokenDf{};
403 std::size_t removeTopNTokens{};
405 std::string trainedWithVersion{};
408 std::unique_ptr<FoRelevance> labeler;
409 bool isLabeling{
false};
410 std::size_t labelingMinCf{};
411 std::size_t labelingMinDf{};
412 std::size_t labelingMinLength{};
413 std::size_t labelingMaxLength{};
414 std::size_t labelingMaxCandidates{};
415 float labelingSmoothing{};
417 std::size_t labelingWindowSize{};
420 void initModel(
bool& isHdpTo,
bool& isIdfTo);
421 [[nodiscard]] std::string dictLookUp(tomoto::Vid tokenId)
const;
424 const std::string&
function,
429 const std::string&
function,
430 const std::string& errorMsg
432 void checkTrained(
const std::string&
function)
const;
433 void checkNotTrained(
434 const std::string&
function,
435 const std::string& errorMsg
438 [[nodiscard]]
const tomoto::Dictionary& getDict(
442 [[nodiscard]] std::size_t getLiveK(
bool isIdf)
const;
443 [[nodiscard]] std::size_t getK(
bool isHdp,
bool isIdf)
const;
444 [[nodiscard]]
bool isLiveTopic(
bool isIdf, std::size_t topic)
const;
445 [[nodiscard]]
float getGamma(
bool isIdf)
const;
446 [[nodiscard]] std::size_t getNumberOfTables(
bool isIdf)
const;
448 void prepareModel(
bool isHdp,
bool isIdf);
452 std::size_t iterations,
456 void loadModelInformation(
459 const std::vector<std::uint8_t>& data
461 void writeModelInformation(
464 std::vector<std::uint8_t>& dataTo
467 [[nodiscard]] std::vector<float> getInferredTopics(
470 const tomoto::DocumentBase * doc
473 [[nodiscard]]
const void *
get(
bool isHdp,
bool isIdf)
const;
476 [[nodiscard]]
static tomoto::RawDoc createDocument(
477 const std::string& name,
478 const std::vector<std::string>& tokens,
479 std::size_t firstToken,
480 std::size_t numTokens
482 static void readModelFileHead(std::istream& in,
const std::string& fileName);
483 static void readModelFileTermWeighting(
485 const std::string& fileName,
488 static void readModelFileType(std::istream& in,
const std::string& fileName);
489 static void resetStream(std::istream& in);
490 static void numberFromDict(
492 const std::string& key,
495 static void floatFromDict(
497 const std::string& key,
500 static void stringFromDict(
502 const std::string& key,
506 static void validateLastResults(
507 std::vector<std::pair<std::string, std::vector<float>>>& results,
508 std::unordered_set<std::string>& done,
509 const std::unordered_set<std::string>::const_iterator& inserted
513 [[nodiscard]]
static constexpr std::string_view termWeightToString(
bool isIdf) {
515 return "TermWeight.IDF";
518 return "TermWeight.ONE";
521 template<
typename T> [[nodiscard]]
static bool bytesEqual(
525 if(bytes.size() != s.size()) {
529 for(std::size_t index{}; index < bytes.size(); ++index) {
530 if(bytes[index] != s[index]) {
538 template<
typename T> [[nodiscard]]
static std::string bytesToString(
543 for(
const auto c : bytes) {
552 template<tomoto::TermWeight _tw,
typename _RandGen>
553 [[nodiscard]]
static std::vector<float> removeDeadTopics(
554 const std::vector<float>& results,
555 const std::unique_ptr<tomoto::HDPModel<_tw, _RandGen>>& model
557 std::vector<float> filtered;
559 filtered.reserve(results.size());
561 for(std::size_t topic{}; topic < results.size(); ++topic) {
562 if(model->isLiveTopic(topic)) {
563 filtered.push_back(results[topic]);
591 this->checkModel(
"getNumberOfDocuments", isHdp, isIdf);
592 this->checkTrained(
"getNumberOfDocuments");
611 this->checkModel(
"getDocuments", isHdp, isIdf);
612 this->checkTrained(
"getDocuments");
614 std::unordered_map<std::string, std::size_t> result;
617 const tomoto::DocumentBase * docPtr{
nullptr};
622 if(!(docPtr->docUid.empty())) {
623 result[docPtr->docUid] = index;
643 this->checkModel(
"getVocabularySize", isHdp, isIdf);
644 this->checkTrained(
"getVocabularySize");
663 this->checkModel(
"getOriginalVocabularySize", isHdp, isIdf);
664 this->checkTrained(
"getOriginalVocabularySize");
666 return this->getDict(isHdp, isIdf).size();
684 this->checkModel(
"getVocabulary", isHdp, isIdf);
685 this->checkTrained(
"getVocabulary");
687 return this->getDict(isHdp, isIdf).getRaw();
704 this->checkModel(
"getNumberOfTokens", isHdp, isIdf);
705 this->checkTrained(
"getNumberOfTokens");
724 this->checkModel(
"getBurnInIterations", isHdp, isIdf);
725 this->checkTrained(
"getBurnInIterations");
744 this->checkModel(
"getIterations", isHdp, isIdf);
745 this->checkTrained(
"getIterations");
764 this->checkModel(
"getParameterOptimizationInterval", isHdp, isIdf);
765 this->checkTrained(
"getParameterOptimizationInterval");
784 this->checkModel(
"getRandomNumberGenerationSeed", isHdp, isIdf);
785 this->checkTrained(
"getRandomNumberGenerationSeed");
803 this->checkModel(
"getModelName", isHdp, isIdf);
825 this->checkModel(
"getTermWeighting", isHdp, isIdf);
827 return TopicModel::termWeightToString(isIdf);
846 this->checkModel(
"getDocumentId", isHdp, isIdf);
853 if(
id == std::numeric_limits<std::size_t>::max()) {
856 " No document named '" 858 +
"' has been added to the model" 879 this->checkModel(
"getRemovedTokens", isHdp, isIdf);
880 this->checkTrained(
"getRemovedTokens");
883 this->getDict(isHdp, isIdf)
885 const auto& size{dict.size()};
886 std::vector<std::string> removed;
888 for(
auto tokendIndex{size - this->removeTopNTokens}; tokendIndex < size; ++tokendIndex) {
889 removed.emplace_back(dict.toWord(tokendIndex));
910 this->checkModel(
"getNumberOfTopics", isHdp, isIdf);
911 this->checkTrained(
"getNumberOfTopics");
914 return this->getLiveK(isIdf);
917 return this->fixedNumberOfTopics;
935 this->checkModel(
"getTopics", isHdp, isIdf);
936 this->checkTrained(
"getTopics");
938 std::vector<std::size_t> topicIds;
942 topicIds.reserve(this->getLiveK(isIdf));
944 maxK = this->getK(
true, isIdf);
946 for(std::size_t k{}; k < maxK; ++k) {
947 if(this->isLiveTopic(isIdf, k)) {
948 topicIds.emplace_back(k);
953 topicIds.reserve(this->fixedNumberOfTopics);
955 for(std::size_t k{}; k < this->fixedNumberOfTopics; ++k) {
956 topicIds.emplace_back(k);
976 this->checkModel(
"getTopicsSorted", isHdp, isIdf);
977 this->checkTrained(
"getTopicsSorted");
979 std::vector<std::pair<std::size_t, std::uint64_t>> topics;
980 std::vector<std::uint64_t> counts;
982 topics.reserve(this->getK(isHdp, isIdf));
987 std::size_t topicIndex{};
989 for(
const auto count : counts) {
990 if(!isHdp || this->isLiveTopic(isIdf, topicIndex)) {
991 topics.emplace_back(topicIndex, count);
997 std::sort(topics.begin(), topics.end(), [](
const auto& a,
const auto& b) {
998 return a.second > b.second;
1016 this->checkModel(
"getLogLikelihoodPerToken", isHdp, isIdf);
1017 this->checkTrained(
"getLogLikelihoodPerToken");
1035 this->checkModel(
"getTokenEntropy", isHdp, isIdf);
1036 this->checkTrained(
"getTokenEntropy");
1038 std::vector<std::uint64_t> vocabularyFrequencies;
1039 std::uint64_t vocabularyUsed{};
1048 const auto frequencySum{
1050 vocabularyFrequencies.begin(),
1051 vocabularyFrequencies.begin() + vocabularyUsed,
1056 std::vector<double> normalizedFrequencies;
1058 normalizedFrequencies.reserve(vocabularyUsed);
1061 auto it{vocabularyFrequencies.begin()};
1062 it < vocabularyFrequencies.begin() + vocabularyUsed;
1065 normalizedFrequencies.push_back(static_cast<double>(*it) / frequencySum);
1068 return std::accumulate(
1069 normalizedFrequencies.begin(),
1070 normalizedFrequencies.end(),
1072 [](
double a,
double b) {
1073 return a + b * std::log(b);
1099 this->checkModel(
"getTopicTopNTokens", isHdp, isIdf);
1100 this->checkTrained(
"getTopicTopNTokens");
1102 std::vector<std::pair<tomoto::Vid, float>> tokenIds;
1107 std::vector<std::pair<std::string, float>> tokens;
1111 for(
const auto& tokenId : tokenIds) {
1112 tokens.emplace_back(this->dictLookUp(tokenId.first), tokenId.second);
1141 this->checkModel(
"getTopicTopNLabels", isHdp, isIdf);
1142 this->checkTrained(
"getTopicTopNLabels");
1145 return std::vector<std::pair<std::string, float>>{};
1148 if(!(this->labeler)) {
1150 "getTopicTopNLabels():" 1151 " Topics have not been labeled" 1155 return this->labeler->getLabels(topic, n);
1181 std::unordered_set<std::string>& done
1186 this->checkModel(
"getDocumentsTopics", isHdp, isIdf);
1187 this->checkTrained(
"getDocumentsTopics");
1189 std::vector<std::pair<std::string, std::vector<float>>> results;
1192 for(std::size_t docId{}; docId < total; ++docId) {
1193 const tomoto::DocumentBase * doc{
nullptr};
1197 if(doc->docUid.empty()) {
1201 const auto inserted{done.insert(doc->docUid)};
1203 if(inserted.second) {
1204 results.emplace_back(
1206 this->getInferredTopics(isHdp, isIdf, doc)
1210 TopicModel::validateLastResults(results, done, inserted.first);
1240 const std::vector<std::vector<std::string>>& documents,
1241 std::size_t maxIterations,
1242 std::size_t numberOfWorkers
1247 this->checkModel(
"getDocumentsTopics", isHdp, isIdf);
1248 this->checkTrained(
"getDocumentsTopics");
1251 std::vector<std::unique_ptr<tomoto::DocumentBase>> docUPtrs(documents.size());
1252 std::size_t docIndex{};
1254 for(
const auto& tokens : documents) {
1261 TopicModel::createDocument(
1262 "doc" + std::to_string(docIndex),
1269 if(!(docUPtrs[docIndex])) {
1271 "getDocumentsTopics():" 1272 " Could not create document 'doc" 1273 + std::to_string(docIndex)
1282 std::vector<tomoto::DocumentBase *> docPtrs(documents.size(),
nullptr);
1284 std::transform(docUPtrs.begin(), docUPtrs.end(), docPtrs.begin(), [](
const auto& uPtr) {
1298 tomoto::ParallelScheme::default_,
1302 std::vector<std::vector<float>> results;
1304 results.reserve(documents.size());
1306 for(
const auto * doc : docPtrs) {
1307 results.emplace_back(this->getInferredTopics(isHdp, isIdf, doc));
1326 this->checkModel(
"getModelInfo", isHdp, isIdf);
1327 this->checkTrained(
"getModelInfo");
1349 information.
seed = this->seed;
1360 information.
gamma = this->getGamma(isIdf);
1368 for(std::size_t topic{}; topic < information.
numberOfTopics; ++topic) {
1370 information.
alphas.push_back(this->ldaModelIdf->getAlpha(topic));
1373 information.
alphas.push_back(this->ldaModel->getAlpha(topic));
1396 "setFixedNumberOfTopics",
1397 "Fixed number of topics cannot be set" 1400 this->fixedNumberOfTopics = k;
1414 "Term weighting cannot be set to IDF" 1417 this->isUseIdf = idf;
1432 this->initModel(isHdp, isIdf);
1433 this->checkNotTrained(
1434 "setBurnInIteration",
1435 "Iterations cannot be burned" 1461 std::size_t collectionFrequency,
1462 std::size_t documentFrequency,
1463 std::size_t fixedNumberOfTopTokens
1465 this->checkNotTrained(
1467 "Stopword settings cannot be changed" 1470 this->minTokenCf = collectionFrequency;
1471 this->minTokenDf = documentFrequency;
1472 this->removeTopNTokens = fixedNumberOfTopTokens;
1496 std::size_t initialTopics,
1502 "setInitialParameters",
1503 "Cannot set initial parameters" 1506 this->numberOfInitialTopics = initialTopics;
1507 this->initialAlpha = alpha;
1508 this->initialEta = eta;
1509 this->initialGamma = gamma;
1523 "setParameterOptimizationInterval",
1524 "Cannot set parameter optimization interval" 1527 this->optimizationInterval = interval;
1540 "setRandomNumberGenerationSeed",
1541 "Cannot set seed for random number generation" 1544 this->seed = newSeed;
1586 std::size_t minLength,
1587 std::size_t maxLength,
1588 std::size_t maxCandidates,
1591 std::uint64_t windowSize
1593 this->isLabeling = activate;
1594 this->labelingMinCf = minCf;
1595 this->labelingMinDf = minDf;
1596 this->labelingMinLength = minLength;
1597 this->labelingMaxLength = maxLength;
1598 this->labelingMaxCandidates = maxCandidates;
1599 this->labelingSmoothing = smoothing;
1600 this->labelingMu = mu;
1601 this->labelingWindowSize = windowSize;
1605 this->
label(this->workersUsed);
1635 const std::string& name,
1636 const std::vector<std::string>& tokens,
1637 std::size_t firstToken,
1638 std::size_t numTokens
1643 this->initModel(isHdp, isIdf);
1644 this->checkNotTrained(
1646 "Documents cannot be added" 1650 this->docNames.emplace_back(name);
1657 TopicModel::createDocument(
1658 this->docNames.back(),
1665 if(!(this->hasDocs)) {
1666 this->hasDocs = numTokens > 0;
1684 this->checkModel(
"startTraining", isHdp, isIdf);
1685 this->prepareModel(isHdp, isIdf);
1686 this->trainModel(isHdp, isIdf, 0, 1);
1708 std::size_t iterations,
1714 this->checkModel(
"train", isHdp, isIdf);
1715 this->prepareModel(isHdp, isIdf);
1716 this->trainModel(isHdp, isIdf, iterations, threads);
1737 if(!(this->isLabeling)) {
1738 this->labeler.reset();
1746 this->checkModel(
"label", isHdp, isIdf);
1747 this->checkTrained(
"label");
1749 this->workersUsed = threads;
1752 PMIExtractor extractor(
1753 this->labelingMinCf,
1754 this->labelingMinDf,
1755 this->labelingMinLength,
1756 this->labelingMaxLength,
1757 this->labelingMaxCandidates
1760 const auto * interfacePtr{
1761 static_cast<const ITopicModel *
>(this->
get(isHdp, isIdf))
1764 auto labelCandidates{extractor.extract(interfacePtr)};
1767 constexpr
auto LAMBDA{0.2F};
1769 this->labeler = std::make_unique<FoRelevance>(
1771 labelCandidates.begin(),
1772 labelCandidates.end(),
1773 this->labelingMinDf,
1774 this->labelingSmoothing,
1777 this->labelingWindowSize == 0 ?
1778 std::numeric_limits<std::size_t>::max()
1779 : this->labelingWindowSize,
1811 std::ifstream in(fileName.c_str(), std::ios::binary);
1815 "TopicModel::load():" 1816 " Could not read from '" 1823 TopicModel::readModelFileHead(in, fileName);
1826 TopicModel::readModelFileTermWeighting(in, fileName, isIdf);
1831 TopicModel::readModelFileType(in, fileName);
1834 TopicModel::resetStream(in);
1837 std::vector<uint8_t> data;
1839 this->initModel(isHdp, isIdf);
1860 TopicModel::resetStream(in);
1863 this->initModel(isHdp, isIdf);
1876 const auto bytesRead{in.tellg()};
1882 this->loadModelInformation(isHdp, isIdf, data);
1908 this->checkModel(
"save", isHdp, isIdf);
1909 this->checkTrained(
"save");
1912 std::ofstream out(fileName.c_str(), std::ios::binary);
1914 if(!out.is_open()) {
1916 "TopicModel::save():" 1917 " Could not write to '" 1924 std::vector<uint8_t> data;
1926 this->writeModelInformation(isHdp, isIdf, data);
1933 const auto bytesWritten{out.tellp()};
1938 return bytesWritten;
1950 this->hdpModel.reset();
1951 this->hdpModelIdf.reset();
1952 this->ldaModel.reset();
1953 this->ldaModelIdf.reset();
1957 this->hasDocs =
false;
1958 this->isPrepared =
false;
1960 this->fixedNumberOfTopics = 0;
1965 this->seed = std::random_device{}();
1966 this->minTokenCf = 0;
1967 this->minTokenDf = 0;
1968 this->removeTopNTokens = 0;
1971 this->trainedWithVersion.clear();
1973 this->labeler.reset();
1975 if(labelingOptions) {
1976 this->isLabeling =
false;
1977 this->labelingMinCf = 0;
1978 this->labelingMinDf = 0;
1979 this->labelingMinLength = 0;
1980 this->labelingMaxLength = 0;
1981 this->labelingMaxCandidates = 0;
1982 this->labelingSmoothing = 0.F;
1983 this->labelingMu = 0.F;
1984 this->labelingWindowSize = 0;
1993 inline void TopicModel::initModel(
bool& isHdpTo,
bool& isIdfTo) {
1996 && !(this->hdpModelIdf)
1997 && !(this->ldaModel)
1998 && !(this->ldaModelIdf)
2000 if(this->fixedNumberOfTopics == 0) {
2001 if(this->isUseIdf) {
2002 this->hdpModelIdf = std::make_unique<HDPModelIDF>(
2003 this->numberOfInitialTopics,
2011 this->hdpModel = std::make_unique<HDPModel>(
2012 this->numberOfInitialTopics,
2020 else if(this->isUseIdf) {
2021 this->ldaModelIdf = std::make_unique<LDAModelIDF>(
2022 this->fixedNumberOfTopics,
2029 this->ldaModel = std::make_unique<LDAModel>(
2030 this->fixedNumberOfTopics,
2038 if(this->hdpModel) {
2042 else if(this->hdpModelIdf) {
2046 else if(this->ldaModel) {
2050 else if(this->ldaModelIdf){
2056 "TopicModel::initModel():" 2057 " No model has been loaded." 2063 inline std::string TopicModel::dictLookUp(tomoto::Vid tokenId)
const {
2067 this->checkModel(
"dictLookUp", isHdp, isIdf);
2068 this->checkTrained(
"dictLookUp");
2072 return this->getDict(isHdp, isIdf).toWord(tokenId);
2076 inline void TopicModel::checkModel(
2077 const std::string&
function,
2082 if(this->hdpModel) {
2089 if(this->hdpModelIdf) {
2096 if(this->ldaModel) {
2103 if(this->ldaModelIdf) {
2114 +
"(): No documents have been added" 2115 " or the model has already been cleared" 2120 inline void TopicModel::checkNoModel(
2121 const std::string&
function,
2122 const std::string& errorMsg
2126 || this->hdpModelIdf
2128 || this->ldaModelIdf
2135 +
" after the model has been initialized" 2141 inline void TopicModel::checkTrained(
const std::string&
function)
const {
2142 if(!(this->isPrepared)) {
2146 +
"(): The model has not yet been trained" 2152 inline void TopicModel::checkNotTrained(
2153 const std::string&
function,
2154 const std::string& errorMsg
2156 if(this->isPrepared) {
2162 +
" after the model has already been trained" 2168 inline const tomoto::Dictionary& TopicModel::getDict(
bool isHdp,
bool isIdf)
const {
2174 inline std::size_t TopicModel::getLiveK(
bool isIdf)
const {
2176 return this->hdpModelIdf->getLiveK();
2179 return this->hdpModel->getLiveK();
2183 inline std::size_t TopicModel::getK(
bool isHdp,
bool isIdf)
const {
2185 return this->fixedNumberOfTopics;
2193 inline bool TopicModel::isLiveTopic(
bool isIdf, std::size_t topic)
const {
2195 return this->hdpModelIdf->isLiveTopic(topic);
2198 return this->hdpModel->isLiveTopic(topic);
2202 inline float TopicModel::getGamma(
bool isIdf)
const {
2204 return this->hdpModelIdf->getGamma();
2207 return this->hdpModel->getGamma();
2211 inline std::size_t TopicModel::getNumberOfTables(
bool isIdf)
const {
2213 return this->hdpModelIdf->getTotalTables();
2216 return this->hdpModel->getTotalTables();
2220 inline void TopicModel::prepareModel(
bool isHdp,
bool isIdf) {
2221 if(!(this->isPrepared)) {
2230 this->removeTopNTokens
2233 this->isPrepared =
true;
2238 inline void TopicModel::trainModel(
bool isHdp,
bool isIdf, std::size_t iterations, std::size_t threads) {
2246 tomoto::ParallelScheme::default_
2251 inline void TopicModel::loadModelInformation(
2254 const std::vector<std::uint8_t>& data
2259 TopicModel::numberFromDict(dict,
"min_cf", this->minTokenCf);
2260 TopicModel::numberFromDict(dict,
"min_df", this->minTokenDf);
2261 TopicModel::numberFromDict(dict,
"rm_top", this->removeTopNTokens);
2262 TopicModel::numberFromDict(dict,
"initial_k", this->numberOfInitialTopics);
2263 TopicModel::numberFromDict(dict,
"k", this->fixedNumberOfTopics);
2264 TopicModel::numberFromDict(dict,
"seed", this->seed);
2266 TopicModel::floatFromDict(dict,
"alpha", this->initialAlpha);
2267 TopicModel::floatFromDict(dict,
"eta", this->initialEta);
2268 TopicModel::floatFromDict(dict,
"gamma", this->initialGamma);
2270 TopicModel::stringFromDict(dict,
"version", this->trainedWithVersion);
2273 std::size_t iterations{};
2278 this->hasDocs =
true;
2280 if(iterations > 0) {
2281 this->isPrepared =
true;
2288 inline void TopicModel::writeModelInformation(
2291 std::vector<std::uint8_t>& dataTo
2298 static_cast<std::int64_t>(
2299 isIdf ? tomoto::TermWeight::idf : tomoto::TermWeight::one
2303 dict.
setNumber(
"min_cf", this->minTokenCf);
2304 dict.
setNumber(
"min_df", this->minTokenDf);
2305 dict.
setNumber(
"rm_top", this->removeTopNTokens);
2308 dict.
setNumber(
"initial_k", this->numberOfInitialTopics);
2311 dict.
setNumber(
"k", this->fixedNumberOfTopics);
2316 dict.
setFloat(
"alpha", this->initialAlpha);
2317 dict.
setFloat(
"eta", this->initialEta);
2320 dict.
setFloat(
"gamma", this->initialGamma);
2323 dict.
setString(
"version", this->trainedWithVersion);
2330 inline std::vector<float> TopicModel::getInferredTopics(
2333 const tomoto::DocumentBase * doc
2336 std::vector<float> topics;
2339 return TopicModel::removeDeadTopics(
2340 this->hdpModelIdf->getTopicsByDoc(
2341 *dynamic_cast<
const tomoto::DocumentHDP<tomoto::TermWeight::idf> *>(doc)
2347 return TopicModel::removeDeadTopics(
2348 this->hdpModel->getTopicsByDoc(
2349 *dynamic_cast<
const tomoto::DocumentHDP<tomoto::TermWeight::one> *>(doc)
2356 return this->ldaModelIdf->getTopicsByDoc(
2357 *
dynamic_cast<const tomoto::DocumentLDA<tomoto::TermWeight::idf> *
>(doc)
2361 return this->ldaModel->getTopicsByDoc(
2362 *
dynamic_cast<const tomoto::DocumentLDA<tomoto::TermWeight::one> *
>(doc)
2367 inline const void * TopicModel::get(
bool isHdp,
bool isIdf)
const {
2370 return this->hdpModelIdf.get();
2373 return this->hdpModel.get();
2377 return this->ldaModelIdf.get();
2380 return this->ldaModel.get();
2384 inline tomoto::RawDoc TopicModel::createDocument(
2385 const std::string& name,
2386 const std::vector<std::string>& tokens,
2387 std::size_t firstToken,
2388 std::size_t numTokens
2391 const auto documentEnd{firstToken + numTokens};
2393 doc.rawWords.reserve(numTokens);
2395 for(std::size_t tokenIndex{firstToken}; tokenIndex < documentEnd; ++tokenIndex) {
2396 doc.rawWords.emplace_back(tokens.at(tokenIndex));
2400 doc.docUid = tomoto::SharedString(name);
2406 inline void TopicModel::readModelFileHead(std::istream& in,
const std::string& fileName) {
2413 "TopicModel::load():" 2414 " Invalid model file or unsupported model type in '" 2416 +
"' (first bytes do not match tomoto's LDA model format: '" 2417 + TopicModel::bytesToString(headBytes)
2424 inline void TopicModel::readModelFileTermWeighting(std::istream& in,
const std::string& fileName,
bool& isIdfTo) {
2425 std::array<char, modelFileTermWeightingLen> twBytes{};
2437 "TopicModel::load():" 2438 " Invalid model file or unsupported term weighting scheme in '" 2440 +
"' (term weighting scheme does not match 'one' or 'idf':" 2441 + TopicModel::bytesToString(twBytes)
2448 inline void TopicModel::readModelFileType(std::istream& in,
const std::string& fileName) {
2455 "TopicModel::load():" 2456 " Invalid model file '" 2458 +
"' (type does not match tomoto's model format: '" 2459 + TopicModel::bytesToString(typeBytes)
2466 inline void TopicModel::resetStream(std::istream& in) {
2468 in.seekg(0, std::ios_base::beg);
2472 inline void TopicModel::numberFromDict(
const PickleDict& dict,
const std::string& key, std::size_t& valueTo) {
2478 valueTo =
static_cast<std::size_t
>(*entry);
2486 inline void TopicModel::floatFromDict(
const PickleDict& dict,
const std::string& key,
float& valueTo) {
2492 valueTo =
static_cast<float>(*entry);
2500 inline void TopicModel::stringFromDict(
const PickleDict& dict,
const std::string& key, std::string& valueTo) {
2506 valueTo = std::move(*entry);
2514 inline void TopicModel::validateLastResults(
2515 std::vector<std::pair<std::string, std::vector<float>>>& results,
2516 std::unordered_set<std::string>& done,
2517 const std::unordered_set<std::string>::const_iterator& inserted
2521 results.back().second.begin(),
2522 results.back().second.end(),
2523 [](
const auto value) {
2524 return std::isnan(value);
2529 done.erase(inserted);
std::unordered_map< std::string, std::size_t > getDocuments() const
Gets a map with the documents and their indices from the model.
Definition: TopicModel.hpp:607
constexpr auto modelFileTermWeightingIdf
The term weighting scheme IDF (tf-idf) as saved in a model file.
Definition: TopicModel.hpp:216
std::size_t minCollectionFrequency
Minimum collection frequency of tokens.
Definition: TopicModelInfo.hpp:124
constexpr auto modelFileTermWeightingLen
The number of bytes determining the term weighting scheme in a model file.
Definition: TopicModel.hpp:210
std::size_t save(const std::string &fileName, bool full) const
Writes the model to a file.
Definition: TopicModel.hpp:1904
std::size_t getNumberOfTokens() const
Gets the number of tokens after training has begun.
Definition: TopicModel.hpp:700
void train(std::size_t iterations, std::size_t threads)
Trains the underlying HLDA model.
Definition: TopicModel.hpp:1707
std::size_t numberOfBurnInSteps
The number of initially skipped, i.e. burn-in, steps.
Definition: TopicModelInfo.hpp:108
std::size_t numberOfTopics
The number of topics.
Definition: TopicModelInfo.hpp:171
double tokenEntropy
The entropy of tokens in the model.
Definition: TopicModelInfo.hpp:95
std::size_t sizeOfVocabulary
Definition: TopicModelInfo.hpp:89
std::vector< std::pair< std::string, float > > getTopicTopNLabels(std::size_t topic, std::size_t n) const
Gets the top N labels for the specified topic.
Definition: TopicModel.hpp:1134
std::optional< double > getFloat(const std::string &key) const
Gets a floating-point number from the dictionary, if avaible.
Definition: PickleDict.hpp:426
void setLabelingOptions(bool activate, std::size_t minCf, std::size_t minDf, std::size_t minLength, std::size_t maxLength, std::size_t maxCandidates, float smoothing, float mu, std::size_t windowSize)
Sets the options for automated topic labeling.
Definition: TopicModel.hpp:1582
void setFixedNumberOfTopics(std::size_t k)
Sets the fixed number of topics.
Definition: TopicModel.hpp:1394
Structure containing information about the currently trained Hierarchical Dirichlet Process (HDP) mod...
Definition: TopicModelInfo.hpp:72
TopicModelInfo getModelInfo() const
Gets information about the model after training.
Definition: TopicModel.hpp:1322
Simple Python pickle dictionary.
Definition: PickleDict.hpp:136
void setParameterOptimizationInterval(std::size_t interval)
Sets the interval for parameter optimization, in iterations.
Definition: TopicModel.hpp:1521
std::size_t numberOfTopTokensToBeRemoved
The number of top tokens to be removed.
Definition: TopicModelInfo.hpp:130
#define MAIN_EXCEPTION_CLASS()
Macro used to easily define classes for general exceptions.
Definition: Exception.hpp:50
std::size_t getIterations() const
Get the number of training iterations performed so far.
Definition: TopicModel.hpp:740
std::vector< std::size_t > getTopics() const
Gets the IDs of the topics.
Definition: TopicModel.hpp:931
std::size_t sizeOfVocabularyUsed
Definition: TopicModelInfo.hpp:92
constexpr auto modelFileHead
The beginning of a valid model file containing a LDA (or HDP) model.
Definition: TopicModel.hpp:207
void label(std::size_t threads)
Labels the resulting topics.
Definition: TopicModel.hpp:1736
void setInitialParameters(std::size_t initialTopics, float alpha, float eta, float gamma)
Sets the initial parameters for the model.
Definition: TopicModel.hpp:1495
void startTraining()
Starts training without performing any iteration.
Definition: TopicModel.hpp:1680
float initialAlpha
The initial concentration coefficient of the Dirichlet Process for document–table.
Definition: TopicModelInfo.hpp:136
void setNumber(const std::string &key, std::int64_t value)
Adds or overwrite a number in the dictionary.
Definition: PickleDict.hpp:467
std::size_t getDocumentId(const std::string &name) const
Gets the ID of the document with the specified name.
Definition: TopicModel.hpp:842
std::vector< std::string > removedTokens
The top tokens removed before training.
Definition: TopicModelInfo.hpp:98
void setFloat(const std::string &key, double value)
Adds or overwrites a floating-point number in the dictionary.
Definition: PickleDict.hpp:480
float alpha
The concentration coeficient of the Dirichlet Process for document-table (HDP only).
Definition: TopicModelInfo.hpp:155
std::size_t getNumberOfDocuments() const
Gets the number of added documents after training has begun.
Definition: TopicModel.hpp:587
std::size_t getBurnInIterations() const
Get the number of skipped iterations.
Definition: TopicModel.hpp:720
Class for topic modelling-specific exceptions.
Definition: TopicModel.hpp:376
std::string getTomotoVersion()
Gets the version of the tomoto library if available.
Definition: Versions.hpp:341
std::size_t numberOfTokens
The number of tokens in the model.
Definition: TopicModelInfo.hpp:86
std::size_t numberOfIterations
The number of iterations performed.
Definition: TopicModelInfo.hpp:105
static T::size_type bytes(const T &container)
Returns the number of bytes in an iterable container.
Definition: Container.hpp:144
std::size_t getParameterOptimizationInterval() const
Gets the interval for parameter optimization, in iterations.
Definition: TopicModel.hpp:760
void writeTo(Bytes &dataTo) const
Writes dictionary to Python pickle data.
Definition: PickleDict.hpp:565
std::optional< std::int64_t > getNumber(const std::string &key) const
Gets a number from the dictionary, if avaible.
Definition: PickleDict.hpp:406
constexpr auto defaultEta
The default hyperparameter for the Dirichlet distribution for topic-token.
Definition: TopicModel.hpp:194
std::size_t minDocumentFrequency
Minimum document frequency of tokens.
Definition: TopicModelInfo.hpp:127
double getLogLikelihoodPerToken() const
Gets the log-likelihood per token.
Definition: TopicModel.hpp:1012
void setTokenRemoval(std::size_t collectionFrequency, std::size_t documentFrequency, std::size_t fixedNumberOfTopTokens)
Sets which (un)common tokens to remove before training.
Definition: TopicModel.hpp:1460
constexpr auto defaultOptimizationInterval
The default interval for optimizing the parameters, in iterations.
Definition: TopicModel.hpp:204
constexpr auto ldaModelName
The name of the LDA model.
Definition: TopicModel.hpp:185
std::string modelName
The name of the model.
Definition: TopicModelInfo.hpp:77
void setRandomNumberGenerationSeed(std::size_t newSeed)
Sets the seed for random number generation.
Definition: TopicModel.hpp:1538
std::size_t numberOfTables
The number of tables.
Definition: TopicModelInfo.hpp:178
float initialEta
The initial hyperparameter for the Dirichlet distribution for topic-token.
Definition: TopicModelInfo.hpp:139
double getTokenEntropy() const
Gets the token entropy after training.
Definition: TopicModel.hpp:1031
std::string weighting
Term weighting mode as string.
Definition: TopicModelInfo.hpp:121
const std::vector< std::string > & getVocabulary() const
Gets the complete dictionary used by the model.
Definition: TopicModel.hpp:680
#define DATA_TOPICMODEL_RETURN(isHdp, isIdf, function)
Definition: TopicModel.hpp:155
std::optional< std::string > getString(const std::string &key) const
Gets a string from the dictionary, if available.
Definition: PickleDict.hpp:446
std::size_t getVocabularySize() const
Gets the number of distinct tokens after training has begun.
Definition: TopicModel.hpp:639
std::string_view getModelName() const
Gets the name of the current model.
Definition: TopicModel.hpp:799
constexpr auto defaultNumberOfInitialTopics
The initial number of topics by default.
Definition: TopicModel.hpp:188
constexpr auto defaultGamma
The default concentration coefficient of the Dirichlet Process for table-topic.
Definition: TopicModel.hpp:201
#define DATA_TOPICMODEL_RETRIEVE(x, isHdp, isIdf, function,...)
Definition: TopicModel.hpp:135
constexpr auto modelFileTermWeightingOne
The term weighting scheme ONE as saved in a model file.
Definition: TopicModel.hpp:213
std::vector< float > alphas
The Dirichlet priors on the per-document topic distributions (LDA only).
Definition: TopicModelInfo.hpp:158
std::vector< std::pair< std::string, std::vector< float > > > getDocumentsTopics(std::unordered_set< std::string > &done) const
Gets the topic distributions of all documents the model has been trained on, if available.
Definition: TopicModel.hpp:1180
std::size_t seed
The initial seed for random number generation.
Definition: TopicModelInfo.hpp:145
constexpr auto hdpModelName
The name of the HDP model.
Definition: TopicModel.hpp:182
std::size_t numberOfDocuments
The number of documents in the model.
Definition: TopicModelInfo.hpp:83
void setBurnInIteration(std::size_t skipIterations)
Sets the number of iterations that will be skipped at the beginning of training.
Definition: TopicModel.hpp:1428
std::size_t load(const std::string &fileName)
Loads a model from a file.
Definition: TopicModel.hpp:1804
std::string trainedWithVersion
The version of the modeller the model has been trained with.
Definition: TopicModelInfo.hpp:148
Topic modeller.
Definition: TopicModel.hpp:257
void clear(bool labelingOptions)
Clears the model, resets its settings and frees memory.
Definition: TopicModel.hpp:1949
float initialGamma
The initial concentration coefficient of the Dirichlet Process for table-topic.
Definition: TopicModelInfo.hpp:142
double logLikelihoodPerToken
The log-likelihood per token.
Definition: TopicModelInfo.hpp:114
constexpr auto modelFileType
The tomoto file format as saved in a model file (after model head and term weighting scheme)...
Definition: TopicModel.hpp:219
std::vector< std::pair< std::size_t, std::uint64_t > > getTopicsSorted() const
Gets the IDs and counts of the topics, sorted by count.
Definition: TopicModel.hpp:972
float eta
The Dirichlet prior on the per-topic token distribution (HDP only).
Definition: TopicModelInfo.hpp:161
constexpr auto defaultAlpha
The default concentration coefficient of the Dirichlet Process for document-table. ...
Definition: TopicModel.hpp:191
std::string modelVersion
The version of the model (as string).
Definition: TopicModelInfo.hpp:80
std::string_view getTermWeighting() const
Gets the term weighting mode of the current model.
Definition: TopicModel.hpp:821
void setUseIdf(bool idf)
Sets whether to use IDF term weighting.
Definition: TopicModel.hpp:1411
Namespace for different types of data.
#define DATA_TOPICMODEL_CALL(isHdp, isIdf, function,...)
Definition: TopicModel.hpp:96
std::vector< std::string > getRemovedTokens() const
Gets the most common tokens (i.e. stopwords) that have been removed.
Definition: TopicModel.hpp:875
float gamma
The concentration coefficient of the Dirichlet Process for table-topic.
Definition: TopicModelInfo.hpp:168
std::size_t getOriginalVocabularySize() const
Gets the number of distinct tokens before training.
Definition: TopicModel.hpp:659
static void free(T &target)
Frees memory by swapping.
Definition: Memory.hpp:42
std::size_t getNumberOfTopics() const
Gets the number of topics.
Definition: TopicModel.hpp:906
std::size_t getRandomNumberGenerationSeed() const
Gets the seed used for random number generation.
Definition: TopicModel.hpp:780
std::size_t numberOfInitialTopics
The initial number of topics, which will be adjusted for the data during training.
Definition: TopicModelInfo.hpp:133
#define DATA_TOPICMODEL_RETRIEVE_NOARGS(x, isHdp, isIdf, function)
Definition: TopicModel.hpp:116
std::size_t optimizationInterval
The optimization interval.
Definition: TopicModelInfo.hpp:111
std::vector< std::pair< std::string, float > > getTopicTopNTokens(std::size_t topic, std::size_t n) const
Gets the top N tokens for the specified topic.
Definition: TopicModel.hpp:1092
void addDocument(const std::string &name, const std::vector< std::string > &tokens, std::size_t firstToken, std::size_t numTokens)
Adds a document from a tokenized corpus.
Definition: TopicModel.hpp:1634
void setString(const std::string &key, const std::string &value)
Adds or overwrites a string in the dictionary.
Definition: PickleDict.hpp:493