crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
AllTokens.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2022 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * AllTokens.hpp
24  *
25  * Count all tokens in a corpus.
26  *
27  * Tokens will be counted by date and/or article, if possible.
28  *
29  * Created on: Mar 19, 2021
30  * Author: ans
31  */
32 
33 #ifndef MODULE_ANALYZER_ALGO_ALLTOKENS_HPP_
34 #define MODULE_ANALYZER_ALGO_ALLTOKENS_HPP_
35 
36 #include "../Thread.hpp"
37 
38 #include "../../../Data/Corpus.hpp"
39 #include "../../../Data/Data.hpp"
40 #include "../../../Helper/Memory.hpp"
41 #include "../../../Main/Database.hpp"
42 #include "../../../Struct/StatusSetter.hpp"
43 #include "../../../Struct/TextMap.hpp"
44 #include "../../../Struct/ThreadOptions.hpp"
45 #include "../../../Struct/ThreadStatus.hpp"
46 #include "../../../Timer/Simple.hpp"
47 
#include <cstddef> // std::size_t
#include <cstdint> // std::uint32_t
#include <map> // std::map
#include <string> // std::string, std::to_string
#include <string_view> // std::string_view
#include <utility> // std::pair
#include <vector> // std::vector
54 
namespace crawlservpp::Module::Analyzer::Algo {

57  /*
58  * CONSTANTS
59  */
60 
63 
//! The number of columns in the tokens table.
inline constexpr auto allTokensColumns = 2;

//! Indicates after how many dates the status will be updated, if a date map is available.
inline constexpr auto allTokensUpdateEveryDate = 100U;

//! Indicates after how many articles the status will be updated, if no date map, but an article map is available.
inline constexpr auto allTokensUpdateEveryArticle = 1'000U;

//! Indicates after how many tokens the status will be updated, if no date and no article map is available.
inline constexpr auto allTokensUpdateEveryToken = 10'000U;

//! Indicates after how many rows the status will be updated while saving the results to the database.
inline constexpr auto allTokensUpdateEveryRow = 1'000U;
78 
80 
81  /*
82  * DECLARATION
83  */
84 
86 
89  class AllTokens final : public Module::Analyzer::Thread {
90  // for convenience
92 
97 
98  using StringString = std::pair<std::string, std::string>;
99 
100  using TokenMap = std::map<std::string, std::size_t>;
101  using TokenCounts = std::map<std::size_t, std::size_t>;
102  using SingleMap = std::map<std::string, TokenCounts>;
103  using DoubleMap = std::map<std::string, SingleMap>;
104 
105  public:
108 
109  AllTokens(
110  Main::Database& dbBase,
111  const ThreadOptions& threadOptions,
112  const ThreadStatus& threadStatus
113  );
114  AllTokens(
115  Main::Database& dbBase,
116  const ThreadOptions& threadOptions
117  );
118 
122 
123  std::string_view getName() const override;
124 
128 
129  void onAlgoInitTarget() override;
130  void onAlgoInit() override;
131  void onAlgoTick() override;
132  void onAlgoPause() override;
133  void onAlgoUnpause() override;
134  void onAlgoClear() override;
135 
139 
140  void parseAlgoOption() override;
141  void checkAlgoOptions() override;
142  void resetAlgo() override;
143 
145 
146  private:
147  // algorithm options
148  struct Entries {
149  std::string countTable;
150  } algoConfig;
151 
152  // algorithm state
153  enum OrderBy {
154  NONE = 0,
155  ARTICLES,
156  DATES
157  } orderBy{NONE};
158 
159  std::size_t total{};
160  std::size_t articleCount{};
161  std::size_t count{};
162  std::size_t updateCount{};
163  std::size_t countsTable{};
164 
165  bool hasArticles{false};
166  bool done{false};
167  bool firstTick{true};
168 
169  // data
170  TokenMap tokens;
171  TokenCounts tokenCounts;
172  SingleMap singleMap;
173  DoubleMap doubleMap;
174 
175  // algorithm functions
176  void nextDate();
177  void nextArticle();
178  void nextToken();
179  void clearCorpus();
180  void saveData();
181 
182  // internal helper functions
183  void updateProgress(std::uint32_t every);
184  void saveTokens();
185  void saveCounts();
186  void saveDouble();
187  void saveSingle(const std::string& typeName);
188  void saveTokenCounts();
189  void initCountsTable();
190 
191  // static internal helper functions
192  static void processSingle(
193  const std::vector<std::string>& corpusTokens,
194  const TextMapEntry& entry,
195  TokenMap& tokenMap,
196  SingleMap& to
197  );
198  static void processDouble(
199  const TextMapEntry& entry,
200  SingleMap& from,
201  DoubleMap& to
202  );
203  static void processToken(
204  const std::string& token,
205  TokenMap& tokenMap,
206  TokenCounts& to
207  );
208  static void addTokenCounts(
209  const TokenCounts& from,
211  );
212 
213  // internal helper template
214  template<typename T> bool isDone(const T& container) {
215  if(this->count >= container.size()) {
216  this->done = true;
217 
218  return true;
219  }
220 
221  return false;
222  }
223  };
224 
225 } /* namespace crawlservpp::Module::Analyzer::Algo */
226 
227 #endif /* MODULE_ANALYZER_ALGO_ALLTOKENS_HPP_ */
void checkAlgoOptions() override
Checks the configuration options for the algorithm.
Definition: AllTokens.cpp:233
Namespace for algorithm classes.
Definition: All.cpp:52
void onAlgoPause() override
Does nothing.
Definition: AllTokens.cpp:214
constexpr auto allTokensUpdateEveryDate
Indicates after how many dates the status will be updated, if a date map is available.
Definition: AllTokens.hpp:68
Abstract class providing thread functionality to algorithm (child) classes.
Definition: Thread.hpp:84
Thread status containing its ID, status message, pause state, and progress.
Definition: ThreadStatus.hpp:54
Text map entry.
Definition: TextMap.hpp:49
void parseAlgoOption() override
Parses a configuration option for the algorithm.
Definition: AllTokens.cpp:222
Thread options containing the name of the module run, as well as the IDs of the website, URL list, and configuration used.
Definition: ThreadOptions.hpp:40
void onAlgoClear() override
Does nothing.
Definition: AllTokens.cpp:219
void resetAlgo() override
Resets the algorithm.
Definition: AllTokens.cpp:248
Class handling database access for the command-and-control and its threads.
Definition: Database.hpp:366
constexpr auto allTokensUpdateEveryArticle
Indicates after how many articles the status will be updated, if no date map, but an article map is available.
Definition: AllTokens.hpp:71
Class for analyzer exceptions to be used by algorithms.
Definition: Thread.hpp:242
void onAlgoInitTarget() override
Initializes the target table for the algorithm.
Definition: AllTokens.cpp:90
Structure for inserting multiple values of different types into a row.
Definition: Data.hpp:360
void onAlgoTick() override
Counts tokens in the current date, article, or token.
Definition: AllTokens.cpp:169
constexpr auto allTokensColumns
The number of columns in the tokens table.
Definition: AllTokens.hpp:65
Structure containing all the data needed to keep the status of a thread updated.
Definition: StatusSetter.hpp:57
constexpr auto allTokensUpdateEveryRow
Indicates after how many rows the status will be updated while saving the results to the database.
Definition: AllTokens.hpp:77
std::string_view getName() const override
Returns the name of the algorithm.
Definition: AllTokens.cpp:76
void onAlgoInit() override
Initializes the algorithm and processes its input.
Definition: AllTokens.cpp:106
void onAlgoUnpause() override
Unpauses the algorithm.
Definition: AllTokens.cpp:216
Counts all tokens in a corpus.
Definition: AllTokens.hpp:89
AllTokens(Main::Database &dbBase, const ThreadOptions &threadOptions, const ThreadStatus &threadStatus)
Continues a previously interrupted algorithm run.
Definition: AllTokens.cpp:45
constexpr auto allTokensUpdateEveryToken
Indicates after how many tokens the status will be updated, if no date and no article map is available.
Definition: AllTokens.hpp:74