crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
Config.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2022 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * Config.hpp
24  *
25  * Analyzing configuration.
26  *
27  * Created on: Oct 25, 2018
28  * Author: ans
29  */
30 
31 #ifndef MODULE_ANALYZER_CONFIG_HPP_
32 #define MODULE_ANALYZER_CONFIG_HPP_
33 
34 #include "../Config.hpp"
35 
36 #include <algorithm> // std::min
37 #include <cstdint> // std::int32_t, std::uint8_t, std::uint16_t, std::uint64_t
38 #include <string> // std::string
39 #include <vector> // std::vector
40 
43 
44  /*
45  * CONSTANTS
46  */
47 
50 
// An analyzer uses a parsing table as data source.
52  inline constexpr std::uint8_t generalInputSourcesParsing{0};
53 
// An analyzer uses an extracting table as data source.
55  inline constexpr std::uint8_t generalInputSourcesExtracting{1};
56 
// An analyzer uses an analyzing table as data source.
58  inline constexpr std::uint8_t generalInputSourcesAnalyzing{2};
59 
// An analyzer uses a crawling table as data source.
61  inline constexpr std::uint8_t generalInputSourcesCrawling{3};
62 
// Logging is disabled.
64  inline constexpr std::uint8_t generalLoggingSilent{0};
65 
// Default logging is enabled.
67  inline constexpr std::uint8_t generalLoggingDefault{1};
68 
// Extended logging is enabled.
70  inline constexpr std::uint8_t generalLoggingExtended{2};
71 
// Verbose logging is enabled.
73  inline constexpr std::uint8_t generalLoggingVerbose{3};
74 
// Default time (in s) after which to restart analysis once it has been completed (-1=deactivated).
76  inline constexpr std::int32_t defaultRestartAfter{-1};
77 
// Default time (in s) to wait before the last try to re-connect to the MySQL server.
79  inline constexpr std::uint64_t defaultSleepMySqlS{60};
80 
// Default time (in ms) to wait each tick when finished.
82  inline constexpr std::uint64_t defaultSleepWhenFinishedMs{5000};
83 
// Minimum percentage of the maximum length for corpus slices.
85  inline constexpr auto minPercentageCorpusSlices{1};
86 
// Maximum percentage of the maximum length for corpus slices.
88  inline constexpr auto maxPercentageCorpusSlices{99};
89 
// Default percentage of the maximum length for corpus slices.
91  inline constexpr auto defaultPercentageCorpusSlices{30};
92 
// Default number of processed bytes in a continuous corpus after which memory will be freed.
94  inline constexpr auto defaultFreeMemoryEvery{100000000};
95 
97 
98  /*
99  * DECLARATION
100  */
101 
103  class Config : protected Module::Config {
104  public:
105  /*
106  * needs virtual (i.e. overridden) default destructor, because of virtual member functions
107  * -> needs deleted copy and move contructors/operators
108  * -> needs default constructor
109  */
110 
113 
115  Config() = default;
116 
118  ~Config() override = default;
119 
123 
125 
130  struct Entries {
133 
136 
139 
141  std::vector<std::string> generalInputFields;
142 
144 
150  std::vector<std::uint8_t> generalInputSources;
151 
153  std::vector<std::string> generalInputTables;
154 
157 
160 
163 
166 
168  std::string generalTargetTable;
169 
173 
175  bool groupDateFillGaps{true};
176 
178 
187  std::uint8_t groupDateResolution{};
188 
192 
194  bool filterDateEnable{false};
195 
197  std::string filterDateFrom;
198 
200  std::string filterDateTo;
201 
205 
207 
210  std::vector<std::uint64_t> filterQueryQueries;
211 
213  bool filterQueryAll{false};
214 
218 
220 
227  std::vector<std::string> tokenizerDicts;
228 
230 
235 
237 
244  std::vector<std::string> tokenizerLanguages;
245 
247 
257  std::vector<std::uint16_t> tokenizerManipulators;
258 
260 
267  std::vector<std::string> tokenizerModels;
268 
270 
279  std::vector<std::uint16_t> tokenizerSavePoints{};
280 
282 
289  std::string uploadFTP;
290 
292 
295  std::string uploadProxy;
296 
298 
301  std::string uploadTargetColumn;
302 
304  bool uploadVerbose{false};
305 
307  }
308 
310  config;
311 
313 
317 
320  Config(Config&) = delete;
321 
323  Config& operator=(Config&) = delete;
324 
326  Config(Config&&) = delete;
327 
329  Config& operator=(Config&&) = delete;
330 
332 
333  protected:
336 
337  void parseOption() override;
338  void checkOptions() override;
339  void reset() override;
340 
344 
346 
350  virtual void parseAlgoOption() = 0;
351 
353 
357  virtual void checkAlgoOptions() = 0;
358 
360 
364  virtual void resetAlgo() = 0;
365 
367  };
368 
369  /*
370  * IMPLEMENTATION
371  */
372 
373  /*
374  * ANALYZER-SPECIFIC CONFIGURATION PARSING
375  */
376 
// Parses an analyzer-specific configuration option.
//
// Sets each category in turn and checks for the option names belonging to it,
// binding every name to its member of 'config'. The algorithm-specific parser
// runs last. The three name-like options ('input.fields', 'input.tables',
// 'target.table') are additionally passed StringParsingOption::SQL, which is
// presumably a restriction to SQL-safe names (defined in the base class, not
// visible here).
378  inline void Config::parseOption() {
379  // general options
380  this->category("general");
381  this->option("corpus.checks", this->config.generalCorpusChecks);
382  this->option("corpus.slicing", this->config.generalCorpusSlicing);
383  this->option(
384  "input.fields",
385  this->config.generalInputFields,
386  StringParsingOption::SQL
387  );
388  this->option("input.sources", this->config.generalInputSources);
389  this->option(
390  "input.tables",
391  this->config.generalInputTables,
392  StringParsingOption::SQL
393  );
394  this->option("logging", this->config.generalLogging);
395  this->option("restart.after", this->config.generalRestartAfter);
396  this->option("sleep.mysql", this->config.generalSleepMySql);
397  this->option("sleep.when.finished", this->config.generalSleepWhenFinished);
398  this->option(
399  "target.table",
400  this->config.generalTargetTable,
401  StringParsingOption::SQL
402  );
403 
404  // group by date option
405  this->category("group-date");
406  this->option("fill.gaps", this->config.groupDateFillGaps);
407  this->option("resolution", this->config.groupDateResolution);
408 
409  // filter by date options
410  this->category("filter-date");
411  this->option("enable", this->config.filterDateEnable);
412  this->option("from", this->config.filterDateFrom);
413  this->option("to", this->config.filterDateTo);
414 
415  // filter by query option
416  this->category("filter-query");
417  this->option("queries", this->config.filterQueryQueries);
418  this->option("all", this->config.filterQueryAll);
419 
420  // corpus tokenization options
421  this->category("tokenizer");
422  this->option("dicts", this->config.tokenizerDicts);
423  this->option("free.memory.every", this->config.tokenizerFreeMemoryEvery);
424  this->option("languages", this->config.tokenizerLanguages);
425  this->option("manipulators", this->config.tokenizerManipulators);
426  this->option("models", this->config.tokenizerModels);
427  this->option("savepoints", this->config.tokenizerSavePoints);
428 
429  // upload options
430  this->category("upload");
431  this->option("ftp", this->config.uploadFTP);
432  this->option("proxy", this->config.uploadProxy);
433  this->option("target.column", this->config.uploadTargetColumn);
434  this->option("verbose", this->config.uploadVerbose);
435 
436  // parse algo options
437  this->parseAlgoOption();
438  }
439 
441  inline void Config::checkOptions() {
442  // check corpus chunk size (in percent of the maximum packet size allowed by the MySQL server)
443  if(
445  || this->config.generalCorpusSlicing > maxPercentageCorpusSlices
446  ) {
448 
449  this->warning(
450  "Invalid corpus chunk size reset to "
451  + std::to_string(defaultPercentageCorpusSlices)
452  + "% of the maximum packet size allowed by the MySQL server."
453  );
454  }
455 
456  // check properties of input fields
457  const auto completeInputs{
458  std::min({ // number of complete inputs (= min. size of all arrays)
459  this->config.generalInputFields.size(),
460  this->config.generalInputSources.size(),
461  this->config.generalInputTables.size()
462  })
463  };
464 
465  bool incompleteInputs{false};
466 
467  // remove field names that are not used
468  if(this->config.generalInputFields.size() > completeInputs) {
469  this->config.generalInputFields.resize(completeInputs);
470 
471  incompleteInputs = true;
472  }
473 
474  // remove field sources that are not used
475  if(this->config.generalInputSources.size() > completeInputs) {
476  this->config.generalInputSources.resize(completeInputs);
477 
478  incompleteInputs = true;
479  }
480 
481  // remove field source tables that are not used
482  if(this->config.generalInputTables.size() > completeInputs) {
483  // remove sources of incomplete datetime queries
484  this->config.generalInputTables.resize(completeInputs);
485 
486  incompleteInputs = true;
487  }
488 
489  // warn about incomplete input fields
490  if(incompleteInputs) {
491  this->warning(
492  "'input.fields', '.sources' and '.tables'"
493  " should have the same number of elements."
494  );
495 
496  this->warning("Incomplete input field(s) removed from configuration.");
497  }
498 
499  // check number of manipulators and their dictionaries, models, and languages
500  if(
501  this->config.tokenizerDicts.size()
502  > this->config.tokenizerManipulators.size()
503  ) {
504  this->warning(
505  "The configuration contains"
506  " more dictionaries than manipulators,"
507  " redundant dictionaries will be ignored."
508  );
509  }
510 
511  if(
512  this->config.tokenizerModels.size()
513  > this->config.tokenizerManipulators.size()
514  ) {
515  this->warning(
516  "The configuration contains"
517  " more models than manipulators,"
518  " redundant models will be ignored."
519  );
520  }
521 
522  if(
523  this->config.tokenizerLanguages.size()
524  > this->config.tokenizerManipulators.size()
525  ) {
526  this->warning(
527  "The configuration contains"
528  " more languages than manipulators,"
529  " redundant languages will be ignored."
530  );
531  }
532 
533  // resize so that the numbers of models equals the numbers of manipulators
534  this->config.tokenizerDicts.resize(
535  this->config.tokenizerManipulators.size()
536  );
537 
538  this->config.tokenizerModels.resize(
539  this->config.tokenizerManipulators.size()
540  );
541 
542  this->config.tokenizerLanguages.resize(
543  this->config.tokenizerManipulators.size()
544  );
545 
546  // check algo options
547  this->checkAlgoOptions();
548  }
549 
// Resets the analyzer-specific configuration options, followed by the
// algorithm-specific ones.
551  inline void Config::reset() {
552  // a freshly value-initialized set of entries carries the in-class defaults
553  this->config = Entries{};
554 
555  this->resetAlgo(); // let the algorithm reset its own options
556  }
557 
558 } /* namespace crawlservpp::Module::Analyzer */
559 
560 #endif /* MODULE_ANALYZER_CONFIG_HPP_ */
void option(const std::string &name, bool &target)
Checks for a configuration option of type bool.
Definition: Config.hpp:573
constexpr std::uint8_t generalInputSourcesParsing
An analyzer uses a parsing table as data source.
Definition: Config.hpp:52
Namespace for analyzer classes.
constexpr auto maxPercentageCorpusSlices
Maximum percentage of the maximum length for corpus slices.
Definition: Config.hpp:88
std::vector< std::string > generalInputTables
Names of tables to be used as input.
Definition: Config.hpp:153
bool uploadVerbose
Specifies whether FTP network information will be printed to the server console while uploading the r...
Definition: Config.hpp:304
bool groupDateFillGaps
Enables filling the gaps in between dates.
Definition: Config.hpp:175
std::uint64_t tokenizerFreeMemoryEvery
Number of processed bytes in a continuous corpus after which memory will be freed.
Definition: Config.hpp:234
Config & operator=(Config &)=delete
Deleted copy assignment operator.
Abstract class as base for module-specific configurations.
Definition: Config.hpp:122
std::string uploadFTP
URL to upload a JSON file containing the results to.
Definition: Config.hpp:289
std::uint64_t generalSleepWhenFinished
Time (in ms) to wait each tick when finished.
Definition: Config.hpp:165
std::uint8_t generalCorpusSlicing
Corpus chunk size in percent of the maximum packet size allowed by the MySQL server.
Definition: Config.hpp:138
std::uint8_t groupDateResolution
The resolution to be used when grouping dates.
Definition: Config.hpp:187
std::vector< std::string > tokenizerLanguages
Language for the (token-based aspell) manipulator with the same array index.
Definition: Config.hpp:244
constexpr std::uint8_t generalLoggingVerbose
Verbose logging is enabled.
Definition: Config.hpp:73
~Config() override=default
Default destructor.
constexpr std::uint8_t generalLoggingSilent
Logging is disabled.
Definition: Config.hpp:64
constexpr auto minPercentageCorpusSlices
Minimum percentage of the maximum length for corpus slices.
Definition: Config.hpp:85
constexpr auto defaultPercentageCorpusSlices
Default percentage of the maximum length for corpus slices.
Definition: Config.hpp:91
std::vector< std::string > generalInputFields
Columns to be used from the input tables.
Definition: Config.hpp:141
constexpr std::uint64_t defaultSleepMySqlS
Default time (in s) to wait before last try to re-connect to MySQL server.
Definition: Config.hpp:79
bool generalCorpusChecks
Check the consistency of text corpora.
Definition: Config.hpp:135
void parseOption() override
Parses an analyzer-specific configuration option.
Definition: Config.hpp:378
void reset() override
Resets the analyzer-specific configuration options.
Definition: Config.hpp:551
constexpr std::uint64_t defaultSleepWhenFinishedMs
Default time (in ms) to wait each tick when finished.
Definition: Config.hpp:82
constexpr std::uint8_t generalLoggingDefault
Default logging is enabled.
Definition: Config.hpp:67
std::vector< std::uint16_t > tokenizerSavePoints
Steps after which the corpus will be stored in the database.
Definition: Config.hpp:279
std::vector< std::uint64_t > filterQueryQueries
Queries which need to be fulfilled for at least one token in an article in order to keep it...
Definition: Config.hpp:210
bool filterQueryAll
Specifies whether articles must contain a word fulfilling all of the queries instead of only one of...
Definition: Config.hpp:213
constexpr std::uint8_t generalInputSourcesExtracting
An analyzer uses an extracting table as data source.
Definition: Config.hpp:55
std::string uploadProxy
URL of proxy to use while uploading a JSON file containing the results.
Definition: Config.hpp:295
std::vector< std::uint8_t > generalInputSources
Types of tables to be used as input.
Definition: Config.hpp:150
constexpr std::uint8_t generalInputSourcesCrawling
An analyzer uses a crawling table as data source.
Definition: Config.hpp:61
void category(const std::string &category)
Sets the category of the subsequent configuration items to be checked for.
Definition: Config.hpp:527
std::vector< std::string > tokenizerModels
Model for the (sentence-based) manipulator with the same array index.
Definition: Config.hpp:267
std::string generalTargetTable
Table name to save analyzed data to.
Definition: Config.hpp:168
void checkOptions() override
Checks the analyzer-specific configuration options.
Definition: Config.hpp:441
struct crawlservpp::Module::Analyzer::Config::Entries config
Configuration of the analyzer.
std::uint8_t generalLogging
Level of logging activity.
Definition: Config.hpp:156
virtual void checkAlgoOptions()=0
Checks the algorithm-specific configuration.
std::uint64_t generalSleepMySql
Time (in s) to wait before the last try to re-connect to the MySQL server.
Definition: Config.hpp:162
virtual void parseAlgoOption()=0
Parses an algorithm-specific configuration entry.
constexpr std::int32_t defaultRestartAfter
Default time (in s) after which to restart analysis once it has been completed (-1=deactivated).
Definition: Config.hpp:76
constexpr std::uint8_t generalLoggingExtended
Extended logging is enabled.
Definition: Config.hpp:70
constexpr auto defaultFreeMemoryEvery
Default number of processed bytes in a continuous corpus after which memory will be freed...
Definition: Config.hpp:94
std::string filterDateFrom
The date from which to filter the parsed data.
Definition: Config.hpp:197
Abstract configuration for analyzers, to be implemented by algorithm classes.
Definition: Config.hpp:103
std::vector< std::string > tokenizerDicts
Dictionary for the (token-based) manipulator with the same array index.
Definition: Config.hpp:227
std::string uploadTargetColumn
Name of the column in the target table to create the JSON file for uploading from.
Definition: Config.hpp:301
std::string filterDateTo
The date until which to filter the parsed data.
Definition: Config.hpp:200
Config()=default
Default constructor.
constexpr std::uint8_t generalInputSourcesAnalyzing
An analyzer uses an analyzing table as data source.
Definition: Config.hpp:58
std::int32_t generalRestartAfter
Time (in s) after which to restart analysis once it has been completed (-1=deactivated).
Definition: Config.hpp:159
void warning(const std::string &warning)
Adds a warning to the logging queue.
Definition: Config.hpp:2427
Configuration entries for analyzer threads.
Definition: Config.hpp:130
bool filterDateEnable
Enable filtering source data by date (only applies to parsed data).
Definition: Config.hpp:194
std::vector< std::uint16_t > tokenizerManipulators
Manipulators used on the text corpus.
Definition: Config.hpp:257
virtual void resetAlgo()=0
Resets the algorithm-specific configuration.