crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
Config.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2020 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * Config.hpp
24  *
25  * Parsing configuration.
26  *
27  * Created on: Oct 25, 2018
28  * Author: ans
29  */
30 
31 #ifndef MODULE_PARSER_CONFIG_HPP_
32 #define MODULE_PARSER_CONFIG_HPP_
33 
34 #include "../../Main/Exception.hpp"
35 #include "../../Module/Config.hpp"
36 
37 #include <algorithm> // std::min, std::replace_if
38 #include <cstdint> // std::uint8_t, std::uint16_t, std::uint32_t, std::uint64_t
39 #include <string> // std::string
40 #include <vector> // std::vector
41 
44 
45  /*
46  * CONSTANTS
47  */
48 
51 
//! Logging is disabled.
/*!
 * NOTE(review): the identifier says "crawler" and "verbose", but the value
 *  is 0 and it sits next to the other general logging levels of the parser.
 *  The name is kept for compatibility — confirm whether it should have been
 *  named like its siblings (general logging, silent).
 */
inline constexpr std::uint8_t crawlerLoggingVerbose{0};

//! Default logging is enabled.
inline constexpr std::uint8_t generalLoggingDefault{1};

//! Extended logging is enabled.
inline constexpr std::uint8_t generalLoggingExtended{2};

//! Verbose logging is enabled.
inline constexpr std::uint8_t generalLoggingVerbose{3};

//! Parse data from the URL of a crawled web page.
inline constexpr std::uint8_t parsingSourceUrl{0};

//! Parse data from the content of a crawled web page.
inline constexpr std::uint8_t parsingSourceContent{1};

//! Default cache size.
inline constexpr std::uint64_t defaultCacheSize{2500};

//! Default URL locking time, in seconds.
inline constexpr std::uint32_t defaultLockS{300};

//! Default maximum number of URLs to be processed in one MySQL query.
inline constexpr std::uint16_t defaultMaxBatchSize{500};

//! Default time to wait before checking for new URLs when all URLs have been parsed, in milliseconds.
inline constexpr std::uint64_t defaultSleepIdleMs{5000};

//! Default time to wait before last try to re-connect to MySQL server, in seconds.
inline constexpr std::uint64_t defaultSleepMySqlS{60};
84 
86 
87  /*
88  * DECLARATION
89  */
90 
//! Configuration for parsers.
/*!
 * NOTE(review): this block is a numbered source listing; gaps in the
 *  numbering mark lines (doc comments and some member declarations)
 *  that are not visible here. Missing members are pointed out below.
 */
92  class Config : protected Module::Config {
93  public:
96 
98 
//! Configuration entries for parser threads.
103  struct Entries {
106 
108 
// NOTE(review): listing line 111 is missing — the std::uint64_t member
//  generalCacheSize (number of URLs fetched and parsed before saving
//  results) used by parseOption() should be declared here.
112 
//! Timeout on MySQL query execution, in milliseconds.
114  std::uint64_t generalDbTimeOut{};
115 
//! URL locking time, in seconds.
117  std::uint32_t generalLock{defaultLockS};
118 
// NOTE(review): listing line 120 is missing — the std::uint8_t member
//  generalLogging (level of logging activity) should be declared here.
121 
// NOTE(review): listing line 123 is missing — the std::uint16_t member
//  generalMaxBatchSize (maximum number of URLs processed in one MySQL
//  query) should be declared here.
124 
//! Specifies whether to parse only the newest content for each URL.
126  bool generalNewestOnly{true};
127 
//! Specifies whether to include custom URLs when parsing.
129  bool generalParseCustom{false};
130 
//! Specifies whether to re-parse already parsed URLs.
132  bool generalReParse{false};
133 
//! Table name to save parsed data to.
135  std::string generalResultTable;
136 
//! Queries on URLs that will not be parsed.
138  std::vector<std::uint64_t> generalSkip;
139 
// NOTE(review): listing line 141 is missing — the std::uint64_t member
//  generalSleepIdle (time to wait before checking for new URLs when all
//  URLs have been parsed, in milliseconds) should be declared here.
142 
// NOTE(review): listing line 144 is missing — the std::uint64_t member
//  generalSleepMySql (time to wait before last try to re-connect to the
//  MySQL server, in seconds) should be declared here.
145 
//! Specifies whether to calculate timing statistics.
147  bool generalTiming{false};
148 
152 
//! Content matching one of these queries will be excluded from parsing.
154  std::vector<std::uint64_t> parsingContentIgnoreQueries;
155 
157 
//! Format of the date/time to be parsed by the date/time query with the same array index.
175  std::vector<std::string> parsingDateTimeFormats;
176 
178 
//! Locale to be used by the date/time query with the same array index.
184  std::vector<std::string> parsingDateTimeLocales;
185 
187 
//! Queries used for parsing the date/time.
192  std::vector<std::uint64_t> parsingDateTimeQueries;
193 
195 
//! Where to parse the date/time from – the URL itself, or the crawled content belonging to the URL.
// NOTE(review): element type is std::uint16_t here, while the field and
//  ID sources below use std::uint8_t — confirm whether this is intended.
200  std::vector<std::uint16_t> parsingDateTimeSources;
201 
203 
// NOTE(review): listing line 207 is missing — the bool member
//  parsingDateTimeWarningEmpty (warn if no date/time could be parsed
//  although a query is specified) should be declared here.
208 
210 
//! Date/time format of the field with the same array index.
226  std::vector<std::string> parsingFieldDateTimeFormats;
227 
229 
//! Locale to be used by the query with the same array index.
234  std::vector<std::string> parsingFieldDateTimeLocales;
235 
237 
//! Delimiter between multiple results for the field with the same array index, if not saved as JSON.
241  std::vector<char> parsingFieldDelimiters;
242 
244 
//! Specifies whether to ignore empty values when parsing multiple results for the field with the same array index.
247  std::vector<bool> parsingFieldIgnoreEmpty;
248 
//! Specifies whether to save the value of the field with the same array index as a JSON array.
250  std::vector<bool> parsingFieldJSON;
251 
//! Name of the field with the same array index.
253  std::vector<std::string> parsingFieldNames;
254 
//! Query for the field with the same array index.
256  std::vector<std::uint64_t> parsingFieldQueries;
257 
259 
//! Source of the field with the same array index – the URL itself, or the crawled content belonging to the URL.
264  std::vector<std::uint8_t> parsingFieldSources;
265 
//! Specifies whether to remove line breaks and unnecessary whitespaces when parsing the field with the same array index.
267  std::vector<bool> parsingFieldTidyTexts;
268 
270 
//! Specifies whether to write a warning to the log if the field with the same array index is empty.
274  std::vector<bool> parsingFieldWarningsEmpty;
275 
//! Parsed IDs to be ignored.
277  std::vector<std::string> parsingIdIgnore;
278 
280 
//! Queries to parse the ID.
287  std::vector<std::uint64_t> parsingIdQueries;
288 
290 
//! Where to parse the ID from when using the ID query with the same array index – the URL itself, or the crawled content belonging to the URL.
295  std::vector<std::uint8_t> parsingIdSources;
296 
//! Specifies whether to (try to) repair CData when parsing HTML/XML.
298  bool parsingRepairCData{true};
299 
// NOTE(review): listing line 301 is missing — the bool member
//  parsingRepairComments (try to repair broken HTML/XML comments)
//  should be declared here.
302 
// NOTE(review): listing line 304 is missing — the bool member
//  parsingRemoveXmlInstructions (remove XML processing instructions
//  before parsing HTML content) should be declared here.
305 
307 
//! Number of tidyhtml errors to write to the log.
311  std::uint16_t parsingTidyErrors{};
312 
314 
//! Specifies whether to write tidyhtml warnings to the log.
318  bool parsingTidyWarnings{false};
319 
321  }
322 
//! Configuration of the parser.
324  config;
325 
327 
// NOTE(review): listing line 329 is missing — the class for parser
//  configuration exceptions (thrown as Exception by checkOptions())
//  should be defined here.
330 
331  protected:
334 
//! Parses a parser-specific configuration option.
335  void parseOption() override;
//! Checks the parser-specific configuration options.
336  void checkOptions() override;
//! Resets the parser-specific configuration options.
337  void reset() override;
338 
340  };
341 
342  /*
343  * IMPLEMENTATION
344  */
345 
346  /*
347  * PARSER-SPECIFIC CONFIGURATION PARSING
348  */
349 
351  inline void Config::parseOption() {
352  this->category("general");
353  this->option("cache.size", this->config.generalCacheSize);
354  this->option("db.timeout", this->config.generalDbTimeOut);
355  this->option("logging", this->config.generalLogging);
356  this->option("max.batch.size", this->config.generalMaxBatchSize);
357  this->option("newest.only", this->config.generalNewestOnly);
358  this->option("parse.custom", this->config.generalParseCustom);
359  this->option("reparse", this->config.generalReParse);
360  this->option("skip", this->config.generalSkip);
361  this->option("sleep.idle", this->config.generalSleepIdle);
362  this->option("sleep.mysql", this->config.generalSleepMySql);
363  this->option(
364  "target.table",
366  StringParsingOption::SQL
367  );
368  this->option("timing", this->config.generalTiming);
369 
370  this->category("parser");
371  this->option("content.ignore.queries", this->config.parsingContentIgnoreQueries);
372  this->option("datetime.formats", this->config.parsingDateTimeFormats);
373  this->option("datetime.locales", this->config.parsingDateTimeLocales);
374  this->option("datetime.queries", this->config.parsingDateTimeQueries);
375  this->option("datetime.sources", this->config.parsingDateTimeSources);
376  this->option("datetime.warning.empty", this->config.parsingDateTimeWarningEmpty);
377  this->option("field.datetime.formats", this->config.parsingFieldDateTimeFormats);
378  this->option("field.datetime.locales", this->config.parsingFieldDateTimeLocales);
379  this->option(
380  "field.delimiters",
382  CharParsingOption::FromString
383  );
384  this->option("field.ignore.empty", this->config.parsingFieldIgnoreEmpty);
385  this->option("field.json", this->config.parsingFieldJSON);
386  this->option(
387  "field.names",
389  StringParsingOption::SQL
390  );
391  this->option("field.queries", this->config.parsingFieldQueries);
392  this->option("field.sources", this->config.parsingFieldSources);
393  this->option("field.tidy.texts", this->config.parsingFieldTidyTexts);
394  this->option("field.warnings.empty", this->config.parsingFieldWarningsEmpty);
395  this->option("id.ignore", this->config.parsingIdIgnore);
396  this->option("id.queries", this->config.parsingIdQueries);
397  this->option("id.sources", this->config.parsingIdSources);
398  this->option("remove.xml.instructions", this->config.parsingRemoveXmlInstructions);
399  this->option("repair.cdata", this->config.parsingRepairCData);
400  this->option("repair.comments", this->config.parsingRepairComments);
401  this->option("tidy.errors", this->config.parsingTidyErrors);
402  this->option("tidy.warnings", this->config.parsingTidyWarnings);
403  }
404 
406 
410  inline void Config::checkOptions() {
411  // check for target table
412  if(this->config.generalResultTable.empty()) {
413  throw Exception(
414  "Parser::Config::checkOptions():"
415  " No target table has been specified."
416  );
417  }
418 
419  // check properties of date/time queries
420  const auto completeDateTimes{
421  std::min( // number of complete date/time queries (= min. size of all arrays)
422  this->config.parsingDateTimeQueries.size(),
423  this->config.parsingDateTimeSources.size()
424  )
425  };
426 
427  bool incompleteDateTimes{false};
428 
429  // remove date/time queries or sources that are not used
430  if(this->config.parsingDateTimeQueries.size() > completeDateTimes) {
431  this->config.parsingDateTimeQueries.resize(completeDateTimes);
432 
433  incompleteDateTimes = true;
434  }
435  else if(this->config.parsingDateTimeSources.size() > completeDateTimes) {
436  this->config.parsingDateTimeSources.resize(completeDateTimes);
437 
438  incompleteDateTimes = true;
439  }
440 
441  // warn about incomplete date/time queries
442  if(incompleteDateTimes) {
443  this->warning(
444  "'datetime.queries', '.sources'"
445  " should have the same number of elements."
446  );
447 
448  this->warning(
449  "Incomplete date/time queries removed from configuration."
450  );
451 
452  incompleteDateTimes = false;
453  }
454 
455  // remove date/time formats that are not used, add empty format where none is specified
456  if(this->config.parsingDateTimeFormats.size() > completeDateTimes) {
457  incompleteDateTimes = true;
458  }
459 
460  this->config.parsingDateTimeFormats.resize(completeDateTimes);
461 
462  // replace empty date/time formats with "%F %T"
463  std::replace_if(
464  this->config.parsingDateTimeFormats.begin(),
465  this->config.parsingDateTimeFormats.end(),
466  [](const auto& str) {
467  return str.empty();
468  },
469  "%F %T"
470  );
471 
472  // remove date/time locales that are not used, add empty locale where none is specified
473  if(this->config.parsingDateTimeLocales.size() > completeDateTimes) {
474  incompleteDateTimes = true;
475  }
476 
477  this->config.parsingDateTimeLocales.resize(completeDateTimes);
478 
479  // warn about unused properties
480  if(incompleteDateTimes) {
481  this->warning("Unused date/time properties removed from configuration.");
482  }
483 
484  // check properties of parsing fields
485  const auto completeFields{
486  std::min({ // number of complete fields (= min. size of all arrays)
487  this->config.parsingFieldNames.size(),
488  this->config.parsingFieldQueries.size(),
489  this->config.parsingFieldSources.size()
490  })
491  };
492 
493  bool incompleteFields{false};
494 
495  // remove names of incomplete parsing fields
496  if(this->config.parsingFieldNames.size() > completeFields) {
497  this->config.parsingFieldNames.resize(completeFields);
498 
499  incompleteFields = true;
500  }
501 
502  // remove queries of incomplete parsing fields
503  if(this->config.parsingFieldQueries.size() > completeFields) {
504  this->config.parsingFieldQueries.resize(completeFields);
505 
506  incompleteFields = true;
507  }
508 
509  // remove sources of incomplete parsing fields
510  if(this->config.parsingFieldSources.size() > completeFields) {
511  this->config.parsingFieldSources.resize(completeFields);
512 
513  incompleteFields = true;
514  }
515 
516  // warn about incomplete parsing fields
517  if(incompleteFields) {
518  this->warning(
519  "'field.names', '.queries' and '.sources'"
520  " should have the same number of elements."
521  );
522 
523  this->warning("Incomplete field(s) removed from configuration.");
524 
525  incompleteFields = false;
526  }
527 
528  // remove date/time formats that are not used, add empty format where none is specified
529  if(this->config.parsingFieldDateTimeFormats.size() > completeFields) {
530  incompleteFields = true;
531  }
532 
533  this->config.parsingFieldDateTimeFormats.resize(completeFields);
534 
535  // remove date/time locales that are not used, add empty locale where none is specified
536  if(this->config.parsingFieldDateTimeLocales.size() > completeFields) {
537  incompleteFields = true;
538  }
539 
540  this->config.parsingFieldDateTimeLocales.resize(completeFields);
541 
542  // remove field delimiters that are not used, add empty delimiter (\0) where none is specified
543  if(this->config.parsingFieldDelimiters.size() > completeFields) {
544  incompleteFields = true;
545  }
546 
547  this->config.parsingFieldDelimiters.resize(completeFields, '\0');
548 
549  // replace all empty field delimiters with '\n'
550  std::replace_if(
551  this->config.parsingFieldDelimiters.begin(),
552  this->config.parsingFieldDelimiters.end(),
553  [](char c) {
554  return c == '\0';
555  },
556  '\n'
557  );
558 
559  // remove 'ignore empty values' properties that are not used, set to 'true' where none is specified
560  if(this->config.parsingFieldIgnoreEmpty.size() > completeFields) {
561  incompleteFields = true;
562  }
563 
564  this->config.parsingFieldIgnoreEmpty.resize(completeFields, true);
565 
566  // remove 'save field as JSON' properties that are not used, set to 'false' where none is specified
567  if(this->config.parsingFieldJSON.size() > completeFields) {
568  incompleteFields = true;
569  }
570 
571  this->config.parsingFieldJSON.resize(completeFields, false);
572 
573  // remove 'tidy text' properties that are not used, set to 'false' where none is specified
574  if(this->config.parsingFieldTidyTexts.size() > completeFields) {
575  incompleteFields = true;
576  }
577 
578  this->config.parsingFieldTidyTexts.resize(completeFields, false);
579 
580  // remove 'warning if empty' properties that are not used, set to 'false' where none is specified
581  if(this->config.parsingFieldWarningsEmpty.size() > completeFields) {
582  incompleteFields = true;
583  }
584 
585  this->config.parsingFieldWarningsEmpty.resize(completeFields, false);
586 
587  // warn about unused properties
588  if(incompleteFields) {
589  this->warning("Unused field properties removed from configuration.");
590  }
591 
592  // check properties of ID queries
593  const auto completeIds{
594  std::min( // number of complete ID queries (= min. size of all arrays)
595  this->config.parsingIdQueries.size(),
596  this->config.parsingIdSources.size()
597  )
598  };
599 
600  bool incompleteIds{false};
601 
602  // remove ID queries or sources that are not used
603  if(this->config.parsingIdQueries.size() > completeIds) {
604  this->config.parsingIdQueries.resize(completeIds);
605 
606  incompleteIds = true;
607  }
608  else if(this->config.parsingIdSources.size() > completeIds) {
609  this->config.parsingIdSources.resize(completeIds);
610 
611  incompleteIds = true;
612  }
613 
614  // warn about incomplete ID queries
615  if(incompleteIds) {
616  this->warning(
617  "'id.queries' and '.sources'"
618  " should have the same number of elements."
619  );
620 
621  this->warning("Incomplete ID queries removed from configuration.");
622  }
623  }
624 
626  inline void Config::reset() {
627  this->config = {};
628  }
629 
630 } /* namespace crawlservpp::Module::Parser */
631 
632 #endif /* MODULE_PARSER_CONFIG_HPP_ */
std::vector< char > parsingFieldDelimiters
Delimiter between multiple results for the field with the same array index, if not saved as JSON...
Definition: Config.hpp:241
std::vector< std::string > parsingFieldDateTimeLocales
Locale to be used by the query with the same array index.
Definition: Config.hpp:234
std::vector< std::uint64_t > parsingDateTimeQueries
Queries used for parsing the date/time.
Definition: Config.hpp:192
bool generalNewestOnly
Specifies whether to parse only the newest content for each URL.
Definition: Config.hpp:126
constexpr std::uint8_t generalLoggingVerbose
Verbose logging is enabled.
Definition: Config.hpp:62
void option(const std::string &name, bool &target)
Checks for a configuration option of type bool.
Definition: Config.hpp:573
void checkOptions() override
Checks the parser-specific configuration options.
Definition: Config.hpp:410
std::vector< std::uint8_t > parsingFieldSources
Source of the field with the same array index – the URL itself, or the crawled content belonging to ...
Definition: Config.hpp:264
std::vector< std::uint64_t > parsingContentIgnoreQueries
Content matching one of these queries will be excluded from parsing.
Definition: Config.hpp:154
bool parsingRepairComments
Specifies whether to (try to) repair broken HTML/XML comments.
Definition: Config.hpp:301
std::string generalResultTable
Table name to save parsed data to.
Definition: Config.hpp:135
constexpr std::uint8_t generalLoggingExtended
Extended logging is enabled.
Definition: Config.hpp:59
std::uint8_t generalLogging
Level of logging activity.
Definition: Config.hpp:120
bool parsingRepairCData
Specifies whether to (try to) repair CData when parsing HTML/XML.
Definition: Config.hpp:298
std::uint16_t generalMaxBatchSize
Maximum number of URLs processed in one MySQL query.
Definition: Config.hpp:123
bool generalReParse
Specifies whether to re-parse already parsed URLs.
Definition: Config.hpp:132
std::uint16_t parsingTidyErrors
Number of tidyhtml errors to write to the log.
Definition: Config.hpp:311
Class for parser configuration exceptions.
Definition: Config.hpp:329
Abstract class as base for module-specific configurations.
Definition: Config.hpp:122
constexpr std::uint8_t crawlerLoggingVerbose
Logging is disabled.
Definition: Config.hpp:53
std::vector< std::string > parsingDateTimeLocales
Locale to be used by the date/time query with the same array index.
Definition: Config.hpp:184
constexpr std::uint32_t defaultLockS
Default URL locking time, in seconds.
Definition: Config.hpp:74
Namespace for parser classes.
Definition: Config.hpp:43
void parseOption() override
Parses a parser-specific configuration option.
Definition: Config.hpp:351
#define MAIN_EXCEPTION_CLASS()
Macro used to easily define classes for general exceptions.
Definition: Exception.hpp:50
std::uint32_t generalLock
URL locking time, in seconds.
Definition: Config.hpp:117
Configuration for parsers.
Definition: Config.hpp:92
std::uint64_t generalSleepMySql
Time to wait before last try to re-connect to MySQL server, in seconds.
Definition: Config.hpp:144
constexpr std::uint8_t generalLoggingDefault
Default logging is enabled.
Definition: Config.hpp:56
void reset() override
Resets the parser-specific configuration options.
Definition: Config.hpp:626
bool generalParseCustom
Specifies whether to include custom URLs when parsing.
Definition: Config.hpp:129
std::vector< std::uint16_t > parsingDateTimeSources
Where to parse the date/time from – the URL itself, or the crawled content belonging to the URL...
Definition: Config.hpp:200
std::vector< std::uint8_t > parsingIdSources
Where to parse the ID from when using the ID query with the same array index – the URL itself...
Definition: Config.hpp:295
std::vector< std::string > parsingFieldNames
Name of the field with the same array index.
Definition: Config.hpp:253
constexpr std::uint64_t defaultSleepMySqlS
Default time to wait before last try to re-connect to MySQL server, in seconds.
Definition: Config.hpp:83
std::uint64_t generalCacheSize
Number of URLs fetched and parsed before saving results.
Definition: Config.hpp:111
constexpr std::uint8_t parsingSourceUrl
Parse data from the URL of a crawled web page.
Definition: Config.hpp:65
bool generalTiming
Specifies whether to calculate timing statistics.
Definition: Config.hpp:147
std::vector< std::string > parsingFieldDateTimeFormats
Date/time format of the field with the same array index.
Definition: Config.hpp:226
void category(const std::string &category)
Sets the category of the subsequent configuration items to be checked for.
Definition: Config.hpp:527
std::vector< bool > parsingFieldIgnoreEmpty
Specifies whether to ignore empty values when parsing multiple results for the field with the same ar...
Definition: Config.hpp:247
bool parsingDateTimeWarningEmpty
Specifies whether to write a warning to the log if no date/time could be parsed although a query is s...
Definition: Config.hpp:207
std::vector< std::string > parsingIdIgnore
Parsed IDs to be ignored.
Definition: Config.hpp:277
struct crawlservpp::Module::Parser::Config::Entries config
Configuration of the parser.
std::vector< bool > parsingFieldWarningsEmpty
Specifies whether to write a warning to the log if the field with the same array index is empty...
Definition: Config.hpp:274
std::vector< bool > parsingFieldJSON
Specifies whether to save the value of the field with the same array index as a JSON array...
Definition: Config.hpp:250
std::vector< std::uint64_t > generalSkip
Queries on URLs that will not be parsed.
Definition: Config.hpp:138
bool parsingTidyWarnings
Specifies whether to write tidyhtml warnings to the log.
Definition: Config.hpp:318
std::vector< std::string > parsingDateTimeFormats
Format of the date/time to be parsed by the date/time query with the same array index.
Definition: Config.hpp:175
std::vector< std::uint64_t > parsingFieldQueries
Query for the field with the same array index.
Definition: Config.hpp:256
std::vector< std::uint64_t > parsingIdQueries
Queries to parse the ID.
Definition: Config.hpp:287
constexpr std::uint8_t parsingSourceContent
Parse data from the content of a crawled web page.
Definition: Config.hpp:68
constexpr std::uint64_t defaultSleepIdleMs
Default time to wait before checking for new URLs when all URLs have been parsed, in milliseconds...
Definition: Config.hpp:80
constexpr std::uint16_t defaultMaxBatchSize
Default maximum number of URLs to be processed in one MySQL query.
Definition: Config.hpp:77
constexpr std::uint64_t defaultCacheSize
Default cache size.
Definition: Config.hpp:71
bool parsingRemoveXmlInstructions
Specifies whether to remove XML processing instructions (<?xml:...>) before parsing HTML content...
Definition: Config.hpp:304
Configuration entries for parser threads.
Definition: Config.hpp:103
std::uint64_t generalDbTimeOut
Timeout on MySQL query execution, in milliseconds.
Definition: Config.hpp:114
void warning(const std::string &warning)
Adds a warning to the logging queue.
Definition: Config.hpp:2427
std::vector< bool > parsingFieldTidyTexts
Specifies whether to remove line breaks and unnecessary whitespaces when parsing the field with the s...
Definition: Config.hpp:267
std::uint64_t generalSleepIdle
Time to wait before checking for new URLs when all URLs have been parsed, in milliseconds.
Definition: Config.hpp:141