31 #ifndef MODULE_CRAWLER_CONFIG_HPP_ 32 #define MODULE_CRAWLER_CONFIG_HPP_ 34 #include "../../Main/Exception.hpp" 35 #include "../../Network/Config.hpp" 529 void reset()
override;
534 bool crossDomain{
false};
551 this->crossDomain = isCrossDomain;
566 "archives.urls.memento",
568 StringParsingOption::Trim
610 StringParsingOption::URL : StringParsingOption::SubURL
647 StringParsingOption::URL : StringParsingOption::SubURL
680 "Crawler::Config::checkOptions():" 681 " No link extraction query has been specified" 690 "Invalid value for 'url.chunks' ignored (was zero)," 696 bool incompleteArchives{
false};
698 const auto completeArchives{
710 incompleteArchives =
true;
717 incompleteArchives =
true;
724 incompleteArchives =
true;
728 if(incompleteArchives) {
730 "'archives.names', '.urls.memento' and '.urls.timemap'" 731 " should have the same number of elements." 734 this->
warning(
"Incomplete archive(s) removed from configuration.");
738 bool incompleteCounters{
false};
740 const auto completeCounters{
753 incompleteCounters =
true;
760 incompleteCounters =
true;
767 incompleteCounters =
true;
771 if(incompleteCounters) {
773 "'custom.counters', '.counters.start'," 774 " '.counters.end' and '.counters.step'" 775 " should have the same number of elements." 778 this->
warning(
"Incomplete counter(s) removed from configuration.");
780 incompleteCounters =
false;
785 incompleteCounters =
true;
792 incompleteCounters =
true;
799 incompleteCounters =
true;
805 if(incompleteCounters) {
806 this->
warning(
"Unused counter properties removed from configuration.");
812 const auto index{n - 1};
842 +
"' would be infinite, counter removed." 848 bool incompleteTokens{
false};
850 const auto completeTokens{
862 incompleteTokens =
true;
869 incompleteTokens =
true;
876 incompleteTokens =
true;
880 if(incompleteTokens) {
882 "'custom.tokens', '.tokens.source' and '.tokens.query'" 883 " should have the same number of elements." 886 this->
warning(
"Incomplete token(s) removed from configuration.");
888 incompleteTokens =
false;
893 incompleteTokens =
true;
900 incompleteTokens =
true;
907 incompleteTokens =
true;
914 incompleteTokens =
true;
920 if(incompleteTokens) {
921 this->
warning(
"Unused token properties removed from configuration.");
925 bool incompleteVars{
false};
927 const auto completeVars{
939 incompleteVars =
true;
946 incompleteVars =
true;
953 incompleteVars =
true;
959 "'redirect.var.names', '.var.sources' and '.var.queries'" 960 " should have the same number of elements." 963 this->
warning(
"Incomplete variable(s) removed form configuration.");
bool customCountersGlobal
Specifies whether to use every counter for all custom URLs.
Definition: Config.hpp:408
std::vector< std::uint64_t > crawlerQueriesWhiteListTypes
If not empty, only content types matching one of these queries will be crawled.
Definition: Config.hpp:287
std::uint64_t crawlerSleepError
Sleeping time on connection errors, in milliseconds.
Definition: Config.hpp:331
std::vector< std::int64_t > customCountersStart
Start value for the counter with the same array index.
Definition: Config.hpp:411
bool crawlerUrlStartupCheck
Specifies whether to check the URL list before starting to crawl.
Definition: Config.hpp:380
std::vector< std::string > crawlerReCrawlAlways
List of URLs that will always be re-crawled.
Definition: Config.hpp:301
void option(const std::string &name, bool &target)
Checks for a configuration option of type bool.
Definition: Config.hpp:573
Configuration entries for crawler threads.
Definition: Config.hpp:130
std::vector< std::uint64_t > crawlerQueriesLinksWhiteListTypes
If not empty, only content types matching one of these queries will be used for link extraction...
Definition: Config.hpp:263
std::vector< std::string > customTokenHeaders
Custom HTTP headers to be used for ALL tokens.
Definition: Config.hpp:423
std::vector< std::uint64_t > customTokensQuery
Query to extract the token with the same array index.
Definition: Config.hpp:435
constexpr std::uint16_t defaultUrlMaxLength
Default maximum length of URLs to add.
Definition: Config.hpp:102
std::vector< bool > customTokensRequired
Determines whether an error occurs if the token with the same array index is empty.
Definition: Config.hpp:438
std::vector< std::string > crawlerParamsAdd
URL parameters that will be added shortly before retrieving the content.
Definition: Config.hpp:182
std::vector< std::string > crawlerParamsBlackList
Parameters in URLs that will be ignored.
Definition: Config.hpp:189
bool crawlerUrlDebug
Specifies whether to perform an additional check for duplicates after URL insertion. ...
Definition: Config.hpp:374
std::vector< std::uint64_t > crawlerQueriesLinks
Queries on content to find URLs.
Definition: Config.hpp:223
Class for crawler configuration exceptions.
Definition: Config.hpp:521
std::vector< std::string > crawlerArchivesUrlsMemento
Memento URI templates for archives to crawl.
Definition: Config.hpp:152
std::vector< std::string > crawlerArchivesUrlsSkip
Memento URIs that will always be skipped.
Definition: Config.hpp:155
constexpr std::uint32_t defaultCrawlerLockS
Default time to lock URLs that are being processed, in seconds.
Definition: Config.hpp:72
std::vector< std::string > crawlerParamsWhiteList
Parameters in URLs that will not be ignored.
Definition: Config.hpp:196
std::vector< std::uint64_t > crawlerQueriesLinksWhiteListContent
If not empty, only content matching one of these queries will be used for link extraction.
Definition: Config.hpp:255
bool crawlerArchivesOnly
Specifies whether to crawl archived pages ONLY.
Definition: Config.hpp:145
bool customRobots
Specifies whether to add the sitemaps specified in robots.txt as custom URLs.
Definition: Config.hpp:420
std::vector< std::uint64_t > crawlerQueriesBlackListTypes
Content types matching one of these queries will not be crawled.
Definition: Config.hpp:212
std::vector< std::string > customTokensCookies
Custom HTTP Cookie header for the token with the same array index.
Definition: Config.hpp:429
Abstract class containing the network-specific configuration for threads.
Definition: Config.hpp:121
std::uint32_t crawlerTidyErrors
Number of tidyhtml errors to log (if logging is enabled).
Definition: Config.hpp:353
std::uint64_t redirectQueryUrl
Query on URL that specifies whether to redirect to different URL.
Definition: Config.hpp:485
std::uint8_t crawlerLogging
Level of logging activity.
Definition: Config.hpp:172
std::vector< bool > customTokensUsePost
Use HTTP POST instead of GET for the token with the same array index.
Definition: Config.hpp:444
bool crawlerStartIgnore
Specifies whether to not crawl the start page.
Definition: Config.hpp:350
#define MAIN_EXCEPTION_CLASS()
Macro used to easily define classes for general exceptions.
Definition: Exception.hpp:50
constexpr std::array defaultRetryHttp
HTTP errors that will be handled like connection errors by default.
Definition: Config.hpp:84
Namespace for crawler classes.
Definition: Config.hpp:44
constexpr std::uint8_t crawlerLoggingSilent
Logging is disabled.
Definition: Config.hpp:54
std::vector< std::string > customTokens
List of token variables to be replaced in custom URLs.
Definition: Config.hpp:426
void reset() override
Resets the crawler-specific configuration options.
Definition: Config.hpp:968
std::uint64_t crawlerSleepMySql
Time to wait before trying to re-connect to the MySQL server, in seconds.
Definition: Config.hpp:344
bool crawlerRepairCData
Specifies whether to (try to) repair CData when parsing HTML/XML.
Definition: Config.hpp:307
std::vector< std::string > crawlerArchivesUrlsTimemap
Timemap URI template for archives to crawl.
Definition: Config.hpp:162
constexpr std::uint64_t defaultUrlChunks
Default number of crawled URLs to be processed at once without possible interruption.
Definition: Config.hpp:99
std::vector< std::uint64_t > crawlerQueriesWhiteListContent
If not empty, only content matching one of these queries will be crawled.
Definition: Config.hpp:279
std::string redirectTo
Sub-URL (for cross-domain websites: URL without protocol) to redirect to.
Definition: Config.hpp:488
std::vector< std::uint64_t > crawlerQueriesBlackListUrls
URLs matching one of these queries will not be crawled.
Definition: Config.hpp:220
bool crawlerRemoveXmlInstructions
Specifies whether to remove XML processing instructions (<?xml:...>) before parsing HTML content...
Definition: Config.hpp:313
std::vector< std::string > customUrls
Custom URLs for crawling (should all start with / except for cross-domain websites).
Definition: Config.hpp:447
std::vector< std::int64_t > customCountersStep
Step value for the counter with the same array index.
Definition: Config.hpp:414
constexpr std::uint8_t crawlerLoggingDefault
Default logging is enabled.
Definition: Config.hpp:57
constexpr std::uint8_t redirectSourceContent
Performing a query on the content of a crawled web page to determine whether to redirect.
Definition: Config.hpp:69
std::vector< std::uint8_t > redirectVarSources
Source type of the variable with the same index.
Definition: Config.hpp:510
bool crawlerRetryArchive
Specifies whether to re-try when retrieving the archived pages fails.
Definition: Config.hpp:322
bool redirectUsePost
Specifies whether to use HTTP POST instead of HTTP GET to retrieve a URL after redirect.
Definition: Config.hpp:491
constexpr std::int32_t defaultRestartAfter
Default time (in s) after which to re-crawl custom URLs once crawling has been completed (-1=deactiva...
Definition: Config.hpp:78
std::uint16_t crawlerUrlMaxLength
Maximum length of URLs to add.
Definition: Config.hpp:377
std::vector< std::uint64_t > crawlerQueriesLinksBlackListUrls
URLs matching one of these queries will not be used for link extraction.
Definition: Config.hpp:247
void parseOption() override
Parses a crawler-specific configuration option.
Definition: Config.hpp:559
bool crawlerUrlCaseSensitive
Specifies whether URLs are case-sensitive.
Definition: Config.hpp:365
std::string crawlerStart
Starting point for crawling (should start with / except for cross-domain websites).
Definition: Config.hpp:347
std::uint64_t redirectQueryContent
Query on content that specifies whether to redirect to different URL.
Definition: Config.hpp:482
void category(const std::string &category)
Sets the category of the subsequent configuration items to be checked for.
Definition: Config.hpp:527
std::uint64_t crawlerSleepIdle
Time that will be waited before checking for new URLs when all URLs have been crawled, in milliseconds.
Definition: Config.hpp:337
std::vector< std::uint64_t > crawlerQueriesLinksBlackListContent
Content matching one of these queries will not be used for link extraction.
Definition: Config.hpp:231
std::vector< std::uint64_t > crawlerQueriesLinksBlackListTypes
Content types matching one of these queries will not be used for link extraction. ...
Definition: Config.hpp:239
std::vector< std::uint32_t > customTokensKeep
Time until the token with the same array index gets invalid, in seconds.
Definition: Config.hpp:432
std::uint32_t crawlerLock
Time for which to lock URLs that are currently being processed, in seconds.
Definition: Config.hpp:165
std::uint64_t crawlerSleepHttp
Time that will be waited between HTTP requests, in milliseconds.
Definition: Config.hpp:334
bool crawlerTidyWarnings
Specifies whether to log tidyhtml warnings (if logging is enabled).
Definition: Config.hpp:356
std::string redirectCookies
Custom HTTP Cookie header on dynamic redirect.
Definition: Config.hpp:476
std::uint64_t crawlerUrlChunks
Number of crawled URLs to be processed at once without possible interruption.
Definition: Config.hpp:368
constexpr std::uint64_t defaultSleepErrorMs
Default sleeping time on connection errors, in milliseconds.
Definition: Config.hpp:87
bool crawlerTiming
Specifies whether to calculate timing statistics for the crawler.
Definition: Config.hpp:359
std::vector< std::string > customCountersAlias
Alias for the counter variable with the same array index.
Definition: Config.hpp:396
std::vector< std::uint64_t > redirectVarQueries
Query on variable source to retrieve the value of the variable with the same index.
Definition: Config.hpp:503
bool customReCrawl
Specifies whether to always re-crawl custom URLs.
Definition: Config.hpp:417
constexpr std::int64_t defaultReTries
Default number of re-tries on connection errors.
Definition: Config.hpp:81
std::vector< std::uint64_t > crawlerQueriesBlackListContent
Content matching one of these queries will not be crawled.
Definition: Config.hpp:204
bool crawlerXml
Specifies whether to always save crawled content as cleaned XML.
Definition: Config.hpp:386
constexpr std::uint8_t redirectSourceUrl
Performing a query on the URL of a crawled web page to determine whether to redirect.
Definition: Config.hpp:66
bool crawlerArchives
Specifies whether to crawl archived pages.
Definition: Config.hpp:135
std::vector< std::uint32_t > crawlerRetryHttp
HTTP errors that will be handled like connection errors.
Definition: Config.hpp:328
constexpr std::uint8_t crawlerLoggingVerbose
Verbose logging is enabled.
Definition: Config.hpp:63
struct crawlservpp::Module::Crawler::Config::Entries config
Configuration of the crawler.
constexpr std::uint64_t defaultSleepHttpMs
Default time that will be waited between HTTP requests, in milliseconds.
Definition: Config.hpp:90
bool expectedErrorIfSmaller
Specifies whether to throw an exception when the number of expected URLs is not reached.
Definition: Config.hpp:466
bool crawlerReCrawlStart
Specifies whether to re-crawl the start page every time to keep the URL list up-to-date.
Definition: Config.hpp:304
std::vector< std::string > redirectHeaders
Custom HTTP headers on dynamic redirect.
Definition: Config.hpp:479
bool customUsePost
Specifies whether to use HTTP POST instead of HTTP GET to retrieve custom URLs.
Definition: Config.hpp:455
bool crawlerRetryEmpty
Specifies whether empty responses will be handled like connection errors.
Definition: Config.hpp:325
bool expectedErrorIfLarger
Specifies whether to throw an exception when the number of expected URLs is exceeded. ...
Definition: Config.hpp:463
std::uint64_t expectedQuery
Query to be performed on content to retrieve the expected number of URLs.
Definition: Config.hpp:469
bool crawlerRepairComments
Specifies whether to (try to) repair broken HTML/XML comments.
Definition: Config.hpp:310
bool crawlerReCrawl
Specifies whether to re-crawl already crawled URLs.
Definition: Config.hpp:298
std::vector< std::string > redirectVarNames
Variable names to be replaced on redirect.
Definition: Config.hpp:500
void setCrossDomain(bool isCrossDomain)
Sets whether the corresponding website is cross-domain.
Definition: Config.hpp:550
std::vector< std::int64_t > customCountersEnd
End value for the counter with the same array index.
Definition: Config.hpp:402
std::vector< std::uint64_t > crawlerQueriesLinksWhiteListUrls
If not empty, only URLs matching one of these queries will be used for link extraction.
Definition: Config.hpp:271
std::vector< std::uint64_t > customCountersAliasAdd
Value to add to the counter alias with the same array index.
Definition: Config.hpp:399
Configuration for crawlers.
Definition: Config.hpp:111
void checkOptions() override
Checks the crawler-specific configuration options.
Definition: Config.hpp:676
constexpr std::uint16_t defaultMaxBatchSize
Default number of URLs to be processed in one MySQL query.
Definition: Config.hpp:75
std::int64_t crawlerReTries
Number of re-tries on connection errors (-1=infinite).
Definition: Config.hpp:319
bool crawlerWarningsFile
Specifies whether to warn when files are found (as opposed to folders).
Definition: Config.hpp:383
std::uint16_t crawlerMaxBatchSize
Maximum number of URLs processed in one MySQL query.
Definition: Config.hpp:175
std::vector< std::string > customTokensSource
Source URL for the token with the same array index (absolute link without protocol).
Definition: Config.hpp:441
void warning(const std::string &warning)
Adds a warning to the logging queue.
Definition: Config.hpp:2427
std::vector< std::string > customCounters
List of counter variables to be replaced in custom URLs.
Definition: Config.hpp:393
constexpr std::uint64_t defaultSleepIdleMs
Default time that will be waited before checking for new URLs when all URLs have been crawled...
Definition: Config.hpp:93
std::vector< std::uint64_t > crawlerQueriesWhiteListUrls
If not empty, only URLs matching one of these queries will be crawled.
Definition: Config.hpp:295
std::int32_t crawlerRestartAfter
Time (in s) after which to re-crawl custom URLs once crawling has been completed (-1=deactivated).
Definition: Config.hpp:316
constexpr std::uint64_t defaultSleepMySqlS
Default time to wait before the first try to re-connect to the MySQL server, in seconds.
Definition: Config.hpp:96
constexpr std::uint8_t crawlerLoggingExtended
Extended logging is enabled.
Definition: Config.hpp:60
std::vector< std::string > crawlerArchivesNames
Names of archives to crawl.
Definition: Config.hpp:138