31 #ifndef MODULE_PARSER_CONFIG_HPP_ 32 #define MODULE_PARSER_CONFIG_HPP_ 34 #include "../../Main/Exception.hpp" 35 #include "../../Module/Config.hpp" 337 void reset()
override;
366 StringParsingOption::SQL
382 CharParsingOption::FromString
389 StringParsingOption::SQL
414 "Parser::Config::checkOptions():" 415 " No target table has been specified." 420 const auto completeDateTimes{
427 bool incompleteDateTimes{
false};
433 incompleteDateTimes =
true;
438 incompleteDateTimes =
true;
442 if(incompleteDateTimes) {
444 "'datetime.queries', '.sources'" 445 " should have the same number of elements." 449 "Incomplete date/time queries removed from configuration." 452 incompleteDateTimes =
false;
457 incompleteDateTimes =
true;
466 [](
const auto& str) {
474 incompleteDateTimes =
true;
480 if(incompleteDateTimes) {
481 this->
warning(
"Unused date/time properties removed from configuration.");
485 const auto completeFields{
493 bool incompleteFields{
false};
499 incompleteFields =
true;
506 incompleteFields =
true;
513 incompleteFields =
true;
517 if(incompleteFields) {
519 "'field.names', '.queries' and '.sources'" 520 " should have the same number of elements." 523 this->
warning(
"Incomplete field(s) removed from configuration.");
525 incompleteFields =
false;
530 incompleteFields =
true;
537 incompleteFields =
true;
544 incompleteFields =
true;
561 incompleteFields =
true;
568 incompleteFields =
true;
575 incompleteFields =
true;
582 incompleteFields =
true;
588 if(incompleteFields) {
589 this->
warning(
"Unused field properties removed from configuration.");
593 const auto completeIds{
600 bool incompleteIds{
false};
606 incompleteIds =
true;
611 incompleteIds =
true;
617 "'id.queries' and '.sources'" 618 " should have the same number of elements." 621 this->
warning(
"Incomplete ID queries removed from configuration.");
std::vector< char > parsingFieldDelimiters
Delimiter between multiple results for the field with the same array index, if not saved as JSON...
Definition: Config.hpp:241
std::vector< std::string > parsingFieldDateTimeLocales
Locale to be used by the query with the same array index.
Definition: Config.hpp:234
std::vector< std::uint64_t > parsingDateTimeQueries
Queries used for parsing the date/time.
Definition: Config.hpp:192
bool generalNewestOnly
Specifies whether to parse only the newest content for each URL.
Definition: Config.hpp:126
constexpr std::uint8_t generalLoggingVerbose
Verbose logging is enabled.
Definition: Config.hpp:62
void option(const std::string &name, bool &target)
Checks for a configuration option of type bool.
Definition: Config.hpp:573
void checkOptions() override
Checks the parser-specific configuration options.
Definition: Config.hpp:410
std::vector< std::uint8_t > parsingFieldSources
Source of the field with the same array index – the URL itself, or the crawled content belonging to ...
Definition: Config.hpp:264
std::vector< std::uint64_t > parsingContentIgnoreQueries
Content matching one of these queries will be excluded from parsing.
Definition: Config.hpp:154
bool parsingRepairComments
Specifies whether to (try to) repair broken HTML/XML comments.
Definition: Config.hpp:301
std::string generalResultTable
Table name to save parsed data to.
Definition: Config.hpp:135
constexpr std::uint8_t generalLoggingExtended
Extended logging is enabled.
Definition: Config.hpp:59
std::uint8_t generalLogging
Level of logging activity.
Definition: Config.hpp:120
bool parsingRepairCData
Specifies whether to (try to) repair CData when parsing HTML/XML.
Definition: Config.hpp:298
std::uint16_t generalMaxBatchSize
Maximum number of URLs processed in one MySQL query.
Definition: Config.hpp:123
bool generalReParse
Specifies whether to re-parse already parsed URLs.
Definition: Config.hpp:132
std::uint16_t parsingTidyErrors
Number of tidyhtml errors to write to the log.
Definition: Config.hpp:311
Class for parser configuration exceptions.
Definition: Config.hpp:329
Abstract class as base for module-specific configurations.
Definition: Config.hpp:122
constexpr std::uint8_t crawlerLoggingVerbose
Logging is disabled.
Definition: Config.hpp:53
std::vector< std::string > parsingDateTimeLocales
Locale to be used by the date/time query with the same array index.
Definition: Config.hpp:184
constexpr std::uint32_t defaultLockS
Default URL locking time, in seconds.
Definition: Config.hpp:74
Namespace for parser classes.
Definition: Config.hpp:43
void parseOption() override
Parses a parser-specific configuration option.
Definition: Config.hpp:351
#define MAIN_EXCEPTION_CLASS()
Macro used to easily define classes for general exceptions.
Definition: Exception.hpp:50
std::uint32_t generalLock
URL locking time, in seconds.
Definition: Config.hpp:117
Configuration for parsers.
Definition: Config.hpp:92
std::uint64_t generalSleepMySql
Time to wait before the last attempt to re-connect to the MySQL server, in seconds.
Definition: Config.hpp:144
constexpr std::uint8_t generalLoggingDefault
Default logging is enabled.
Definition: Config.hpp:56
void reset() override
Resets the parser-specific configuration options.
Definition: Config.hpp:626
bool generalParseCustom
Specifies whether to include custom URLs when parsing.
Definition: Config.hpp:129
std::vector< std::uint16_t > parsingDateTimeSources
Where to parse the date/time from – the URL itself, or the crawled content belonging to the URL...
Definition: Config.hpp:200
std::vector< std::uint8_t > parsingIdSources
Where to parse the ID from when using the ID query with the same array index – the URL itself...
Definition: Config.hpp:295
std::vector< std::string > parsingFieldNames
Name of the field with the same array index.
Definition: Config.hpp:253
constexpr std::uint64_t defaultSleepMySqlS
Default time to wait before the last attempt to re-connect to the MySQL server, in seconds.
Definition: Config.hpp:83
std::uint64_t generalCacheSize
Number of URLs fetched and parsed before saving results.
Definition: Config.hpp:111
constexpr std::uint8_t parsingSourceUrl
Parse data from the URL of a crawled web page.
Definition: Config.hpp:65
bool generalTiming
Specifies whether to calculate timing statistics.
Definition: Config.hpp:147
std::vector< std::string > parsingFieldDateTimeFormats
Date/time format of the field with the same array index.
Definition: Config.hpp:226
void category(const std::string &category)
Sets the category of the subsequent configuration items to be checked for.
Definition: Config.hpp:527
std::vector< bool > parsingFieldIgnoreEmpty
Specifies whether to ignore empty values when parsing multiple results for the field with the same ar...
Definition: Config.hpp:247
bool parsingDateTimeWarningEmpty
Specifies whether to write a warning to the log if no date/time could be parsed although a query is s...
Definition: Config.hpp:207
std::vector< std::string > parsingIdIgnore
Parsed IDs to be ignored.
Definition: Config.hpp:277
struct crawlservpp::Module::Parser::Config::Entries config
Configuration of the parser.
std::vector< bool > parsingFieldWarningsEmpty
Specifies whether to write a warning to the log if the field with the same array index is empty...
Definition: Config.hpp:274
std::vector< bool > parsingFieldJSON
Specifies whether to save the value of the field with the same array index as a JSON array...
Definition: Config.hpp:250
std::vector< std::uint64_t > generalSkip
Queries on URLs that will not be parsed.
Definition: Config.hpp:138
bool parsingTidyWarnings
Specifies whether to write tidyhtml warnings to the log.
Definition: Config.hpp:318
std::vector< std::string > parsingDateTimeFormats
Format of the date/time to be parsed by the date/time query with the same array index.
Definition: Config.hpp:175
std::vector< std::uint64_t > parsingFieldQueries
Query for the field with the same array index.
Definition: Config.hpp:256
std::vector< std::uint64_t > parsingIdQueries
Queries to parse the ID.
Definition: Config.hpp:287
constexpr std::uint8_t parsingSourceContent
Parse data from the content of a crawled web page.
Definition: Config.hpp:68
constexpr std::uint64_t defaultSleepIdleMs
Default time to wait before checking for new URLs when all URLs have been parsed, in milliseconds...
Definition: Config.hpp:80
constexpr std::uint16_t defaultMaxBatchSize
Default maximum number of URLs to be processed in one MySQL query.
Definition: Config.hpp:77
constexpr std::uint64_t defaultCacheSize
Default cache size.
Definition: Config.hpp:71
bool parsingRemoveXmlInstructions
Specifies whether to remove XML processing instructions (<?xml:...>) before parsing HTML content...
Definition: Config.hpp:304
Configuration entries for parser threads.
Definition: Config.hpp:103
std::uint64_t generalDbTimeOut
Timeout on MySQL query execution, in milliseconds.
Definition: Config.hpp:114
void warning(const std::string &warning)
Adds a warning to the logging queue.
Definition: Config.hpp:2427
std::vector< bool > parsingFieldTidyTexts
Specifies whether to remove line breaks and unnecessary whitespaces when parsing the field with the s...
Definition: Config.hpp:267
std::uint64_t generalSleepIdle
Time to wait before checking for new URLs when all URLs have been parsed, in milliseconds.
Definition: Config.hpp:141