Configuration entries for crawler threads. More...

#include <Config.hpp>

Crawler Configuration
bool	crawlerArchives {false}
	Specifies whether to crawl archived pages. More...

std::vector< std::string >	crawlerArchivesNames {"archives.org"}
	Names of archives to crawl. More...

bool	crawlerArchivesOnly {false}
	Specifies whether to crawl archived pages ONLY. More...

std::vector< std::string >	crawlerArchivesUrlsMemento {"http://web.archive.org/web/"}
	Memento URI templates for archives to crawl. More...

std::vector< std::string >	crawlerArchivesUrlsSkip
	Memento URIs that will always be skipped. More...

std::vector< std::string >	crawlerArchivesUrlsTimemap {"http://web.archive.org/web/timemap/link/"}
	Timemap URI template for archives to crawl. More...

std::uint32_t	crawlerLock {defaultCrawlerLockS}
	Time for which to lock URLs that are currently being processed, in seconds. More...

std::uint8_t	crawlerLogging {crawlerLoggingDefault}
	Level of logging activity. More...

std::uint16_t	crawlerMaxBatchSize {defaultMaxBatchSize}
	Maximum number of URLs processed in one MySQL query. More...

std::vector< std::string >	crawlerParamsAdd
	URL parameters that will be added shortly before retrieving the content. More...

std::vector< std::string >	crawlerParamsBlackList
	Parameters in URLs that will be ignored. More...

std::vector< std::string >	crawlerParamsWhiteList
	Parameters in URLs that will not be ignored. More...

std::vector< std::uint64_t >	crawlerQueriesBlackListContent
	Content matching one of these queries will not be crawled. More...

std::vector< std::uint64_t >	crawlerQueriesBlackListTypes
	Content types matching one of these queries will not be crawled. More...

std::vector< std::uint64_t >	crawlerQueriesBlackListUrls
	URLs matching one of these queries will not be crawled. More...

std::vector< std::uint64_t >	crawlerQueriesLinks
	Queries on content to find URLs. More...

std::vector< std::uint64_t >	crawlerQueriesLinksBlackListContent
	Content matching one of these queries will not be used for link extraction. More...

std::vector< std::uint64_t >	crawlerQueriesLinksBlackListTypes
	Content types matching one of these queries will not be used for link extraction. More...

std::vector< std::uint64_t >	crawlerQueriesLinksBlackListUrls
	URLs matching one of these queries will not be used for link extraction. More...

std::vector< std::uint64_t >	crawlerQueriesLinksWhiteListContent
	If not empty, only content matching one of these queries will be used for link extraction. More...

std::vector< std::uint64_t >	crawlerQueriesLinksWhiteListTypes
	If not empty, only content types matching one of these queries will be used for link extraction. More...

std::vector< std::uint64_t >	crawlerQueriesLinksWhiteListUrls
	If not empty, only URLs matching one of these queries will be used for link extraction. More...

std::vector< std::uint64_t >	crawlerQueriesWhiteListContent
	If not empty, only content matching one of these queries will be crawled. More...

std::vector< std::uint64_t >	crawlerQueriesWhiteListTypes
	If not empty, only content types matching one of these queries will be crawled. More...

std::vector< std::uint64_t >	crawlerQueriesWhiteListUrls
	If not empty, only URLs matching one of these queries will be crawled. More...

bool	crawlerReCrawl {false}
	Specifies whether to re-crawl already crawled URLs. More...

std::vector< std::string >	crawlerReCrawlAlways
	List of URLs that will always be re-crawled. More...

bool	crawlerReCrawlStart {true}
	Specifies whether to re-crawl the start page every time to keep the URL list up-to-date. More...

bool	crawlerRepairCData {true}
	Specifies whether to (try to) repair CData when parsing HTML/XML. More...

bool	crawlerRepairComments {true}
	Specifies whether to (try to) repair broken HTML/XML comments. More...

bool	crawlerRemoveXmlInstructions {true}
	Specifies whether to remove XML processing instructions (`<?xml:...>`) before parsing HTML content. More...

std::int32_t	crawlerRestartAfter {defaultRestartAfter}
	Time (in s) after which to re-crawl custom URLs once crawling has been completed (-1=deactivated). More...

std::int64_t	crawlerReTries {defaultReTries}
	Number of re-tries on connection errors (-1=infinite). More...

bool	crawlerRetryArchive {true}
	Specifies whether to re-try when retrieving the archived pages fails. More...

bool	crawlerRetryEmpty {true}
	Specifies whether empty responses will be handled like connection errors. More...

std::vector< std::uint32_t >	crawlerRetryHttp {defaultRetryHttp.cbegin(), defaultRetryHttp.cend()}
	HTTP errors that will be handled like connection errors. More...

std::uint64_t	crawlerSleepError {defaultSleepErrorMs}
	Sleeping time on connection errors, in milliseconds. More...

std::uint64_t	crawlerSleepHttp {defaultSleepHttpMs}
	Time that will be waited between HTTP requests, in milliseconds. More...

std::uint64_t	crawlerSleepIdle {defaultSleepIdleMs}
	Time that will be waited before checking for new URLs when all URLs have been crawled, in milliseconds. More...

std::uint64_t	crawlerSleepMySql {defaultSleepMySqlS}
	Time to wait before trying to re-connect to the MySQL server, in seconds. More...

std::string	crawlerStart {"/"}
	Starting point for crawling (should start with / except for cross-domain websites). More...

bool	crawlerStartIgnore {false}
	Specifies whether to not crawl the start page. More...

std::uint32_t	crawlerTidyErrors {}
	Number of `tidyhtml` errors to log (if logging is enabled). More...

bool	crawlerTidyWarnings {false}
	Specifies whether to log `tidyhtml` warnings (if logging is enabled). More...

bool	crawlerTiming {false}
	Specifies whether to calculate timing statistics for the crawler. More...

bool	crawlerUrlCaseSensitive {true}
	Specifies whether URLs are case-sensitive. More...

std::uint64_t	crawlerUrlChunks {defaultUrlChunks}
	Number of crawled URLs to be processed at once without possible interruption. More...

bool	crawlerUrlDebug {false}
	Specifies whether to perform an additional check for duplicates after URL insertion. More...

std::uint16_t	crawlerUrlMaxLength {defaultUrlMaxLength}
	Maximum length of URLs to add. More...

bool	crawlerUrlStartupCheck {true}
	Specifies whether to check the URL list before starting to crawl. More...

bool	crawlerWarningsFile {false}
	Specifies whether to warn when files are found (as opposed to folders). More...

bool	crawlerXml {false}
	Specifies whether to always save crawled content as cleaned XML. More...

Custom URLs
std::vector< std::string >	customCounters
	List of counter variables to be replaced in custom URLs. More...

std::vector< std::string >	customCountersAlias
	Alias for the counter variable with the same array index. More...

std::vector< std::uint64_t >	customCountersAliasAdd
	Value to add to the counter alias with the same array index. More...

std::vector< std::int64_t >	customCountersEnd
	End value for the counter with the same array index. More...

bool	customCountersGlobal {true}
	Specifies whether to use every counter for all custom URLs. More...

std::vector< std::int64_t >	customCountersStart
	Start value for the counter with the same array index. More...

std::vector< std::int64_t >	customCountersStep
	Step value for the counter with the same array index. More...

bool	customReCrawl {true}
	Specifies whether to always re-crawl custom URLs. More...

bool	customRobots {false}
	Specifies whether to add the sitemaps specified in `robots.txt` as custom URLs. More...

std::vector< std::string >	customTokenHeaders
	Custom HTTP headers to be used for ALL tokens. More...

std::vector< std::string >	customTokens
	List of token variables to be replaced in custom URLs. More...

std::vector< std::string >	customTokensCookies
	Custom HTTP `Cookie` header for the token with the same array index. More...

std::vector< std::uint32_t >	customTokensKeep
	Time until the token with the same array index becomes invalid, in seconds. More...

std::vector< std::uint64_t >	customTokensQuery
	Query to extract the token with the same array index. More...

std::vector< bool >	customTokensRequired
	Determines whether an error occurs if the token with the same array index is empty. More...

std::vector< std::string >	customTokensSource
	Source URL for the token with the same array index (absolute link without protocol). More...

std::vector< bool >	customTokensUsePost
	Use HTTP POST instead of GET for the token with the same array index. More...

std::vector< std::string >	customUrls
	Custom URLs for crawling (should all start with `/` except for cross-domain websites). More...

bool	customUsePost {false}
	Specifies whether to use HTTP POST instead of HTTP GET to retrieve custom URLs. More...

Expected Number of Results
bool	expectedErrorIfLarger {false}
	Specifies whether to throw an exception when number of expected URLs is exceeded. More...

bool	expectedErrorIfSmaller {false}
	Specifies whether to throw an exception when the number of expected URLs is not reached. More...

std::uint64_t	expectedQuery {}
	Query to be performed on content to retrieve the expected number of URLs. More...

Dynamic Redirect
std::string	redirectCookies
	Custom HTTP `Cookie` header on dynamic redirect. More...

std::vector< std::string >	redirectHeaders
	Custom HTTP headers on dynamic redirect. More...

std::uint64_t	redirectQueryContent {}
	Query on content that specifies whether to redirect to different URL. More...

std::uint64_t	redirectQueryUrl {}
	Query on URL that specifies whether to redirect to different URL. More...

std::string	redirectTo
	Sub-URL (for cross-domain websites: URL without protocol) to redirect to. More...

bool	redirectUsePost {false}
	Specifies whether to use HTTP POST instead of HTTP GET to retrieve a URL after redirect. More...

std::vector< std::string >	redirectVarNames
	Variable names to be replaced on redirect. More...

std::vector< std::uint64_t >	redirectVarQueries
	Query on variable source to retrieve the value of the variable with the same index. More...

std::vector< std::uint8_t >	redirectVarSources
	Source type of the variable with the same index. More...

Detailed Description

Configuration entries for crawler threads.

Warning: Changing the configuration requires updating json/crawler.json in crawlserv_frontend!

Member Data Documentation

◆ crawlerArchives

bool crawlservpp::Module::Crawler::Config::Entries::crawlerArchives {false}

Specifies whether to crawl archived pages.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerArchivesNames

std::vector<std::string> crawlservpp::Module::Crawler::Config::Entries::crawlerArchivesNames {"archives.org"}

Names of archives to crawl.

Referenced by crawlservpp::Module::Crawler::Config::checkOptions(), crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerArchivesOnly

bool crawlservpp::Module::Crawler::Config::Entries::crawlerArchivesOnly {false}

Specifies whether to crawl archived pages ONLY.

Note: Will be ignored if crawling archived pages is disabled.

Referenced by crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerArchivesUrlsMemento

std::vector<std::string> crawlservpp::Module::Crawler::Config::Entries::crawlerArchivesUrlsMemento {"http://web.archive.org/web/"}

Memento URI templates for archives to crawl.

To be followed by YYYYMMDDHHMMSS/URI (date and URI) of the memento to crawl.

Referenced by crawlservpp::Module::Crawler::Config::checkOptions(), crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerArchivesUrlsSkip

std::vector<std::string> crawlservpp::Module::Crawler::Config::Entries::crawlerArchivesUrlsSkip

Memento URIs that will always be skipped.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerArchivesUrlsTimemap

std::vector<std::string> crawlservpp::Module::Crawler::Config::Entries::crawlerArchivesUrlsTimemap {"http://web.archive.org/web/timemap/link/"}

Timemap URI template for archives to crawl.

To be followed by the URI of the page to crawl.

Referenced by crawlservpp::Module::Crawler::Config::checkOptions(), crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerLock

std::uint32_t crawlservpp::Module::Crawler::Config::Entries::crawlerLock {defaultCrawlerLockS}

Time for which to lock URLs that are currently being processed, in seconds.

Referenced by crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerLogging

std::uint8_t crawlservpp::Module::Crawler::Config::Entries::crawlerLogging {crawlerLoggingDefault}

Level of logging activity.

See also: crawlerLoggingSilent, crawlerLoggingDefault, crawlerLoggingExtended, crawlerLoggingVerbose

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerMaxBatchSize

std::uint16_t crawlservpp::Module::Crawler::Config::Entries::crawlerMaxBatchSize {defaultMaxBatchSize}

Maximum number of URLs processed in one MySQL query.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerParamsAdd

std::vector<std::string> crawlservpp::Module::Crawler::Config::Entries::crawlerParamsAdd

URL parameters that will be added shortly before retrieving the content.

Note: These parameters will not be saved in the URL list, i.e. in the database.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerParamsBlackList

std::vector<std::string> crawlservpp::Module::Crawler::Config::Entries::crawlerParamsBlackList

Parameters in URLs that will be ignored.

Note: This option will be ignored, if Entries::crawlerParamsWhiteList is used.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerParamsWhiteList

std::vector<std::string> crawlservpp::Module::Crawler::Config::Entries::crawlerParamsWhiteList

Parameters in URLs that will not be ignored.

Note: If this option is used, Entries::crawlerParamsBlackList will be ignored.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerQueriesBlackListContent

std::vector<std::uint64_t> crawlservpp::Module::Crawler::Config::Entries::crawlerQueriesBlackListContent

Content matching one of these queries will not be crawled.

Note: This option will be ignored, if Entries::crawlerQueriesWhiteListContent is used.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerQueriesBlackListTypes

std::vector<std::uint64_t> crawlservpp::Module::Crawler::Config::Entries::crawlerQueriesBlackListTypes

Content types matching one of these queries will not be crawled.

Note: This option will be ignored, if Entries::crawlerQueriesWhiteListTypes is used.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerQueriesBlackListUrls

std::vector<std::uint64_t> crawlservpp::Module::Crawler::Config::Entries::crawlerQueriesBlackListUrls

URLs matching one of these queries will not be crawled.

Note: This option will be ignored, if Entries::crawlerQueriesWhiteListUrls is used.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerQueriesLinks

std::vector<std::uint64_t> crawlservpp::Module::Crawler::Config::Entries::crawlerQueriesLinks

Queries on content to find URLs.

Referenced by crawlservpp::Module::Crawler::Config::checkOptions(), crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerQueriesLinksBlackListContent

std::vector<std::uint64_t> crawlservpp::Module::Crawler::Config::Entries::crawlerQueriesLinksBlackListContent

Content matching one of these queries will not be used for link extraction.

Note: This option will be ignored, if Entries::crawlerQueriesLinksWhiteListContent is used.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerQueriesLinksBlackListTypes

std::vector<std::uint64_t> crawlservpp::Module::Crawler::Config::Entries::crawlerQueriesLinksBlackListTypes

Content types matching one of these queries will not be used for link extraction.

Note: This option will be ignored, if Entries::crawlerQueriesLinksWhiteListTypes is used.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerQueriesLinksBlackListUrls

std::vector<std::uint64_t> crawlservpp::Module::Crawler::Config::Entries::crawlerQueriesLinksBlackListUrls

URLs matching one of these queries will not be used for link extraction.

Note: This option will be ignored, if Entries::crawlerQueriesLinksWhiteListUrls is used.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerQueriesLinksWhiteListContent

std::vector<std::uint64_t> crawlservpp::Module::Crawler::Config::Entries::crawlerQueriesLinksWhiteListContent

If not empty, only content matching one of these queries will be used for link extraction.

Note: If this option is used, Entries::crawlerQueriesLinksBlackListContent will be ignored.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerQueriesLinksWhiteListTypes

std::vector<std::uint64_t> crawlservpp::Module::Crawler::Config::Entries::crawlerQueriesLinksWhiteListTypes

If not empty, only content types matching one of these queries will be used for link extraction.

Note: If this option is used, Entries::crawlerQueriesLinksBlackListTypes will be ignored.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerQueriesLinksWhiteListUrls

std::vector<std::uint64_t> crawlservpp::Module::Crawler::Config::Entries::crawlerQueriesLinksWhiteListUrls

If not empty, only URLs matching one of these queries will be used for link extraction.

Note: If this option is used, Entries::crawlerQueriesLinksBlackListUrls will be ignored.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerQueriesWhiteListContent

std::vector<std::uint64_t> crawlservpp::Module::Crawler::Config::Entries::crawlerQueriesWhiteListContent

If not empty, only content matching one of these queries will be crawled.

Note: If this option is used, Entries::crawlerQueriesBlackListContent will be ignored.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerQueriesWhiteListTypes

std::vector<std::uint64_t> crawlservpp::Module::Crawler::Config::Entries::crawlerQueriesWhiteListTypes

If not empty, only content types matching one of these queries will be crawled.

Note: If this option is used, Entries::crawlerQueriesBlackListTypes will be ignored.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerQueriesWhiteListUrls

std::vector<std::uint64_t> crawlservpp::Module::Crawler::Config::Entries::crawlerQueriesWhiteListUrls

If not empty, only URLs matching one of these queries will be crawled.

Note: If this option is used, Entries::crawlerQueriesBlackListUrls will be ignored.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerReCrawl

bool crawlservpp::Module::Crawler::Config::Entries::crawlerReCrawl {false}

Specifies whether to re-crawl already crawled URLs.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerReCrawlAlways

std::vector<std::string> crawlservpp::Module::Crawler::Config::Entries::crawlerReCrawlAlways

List of URLs that will always be re-crawled.

Referenced by crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerReCrawlStart

bool crawlservpp::Module::Crawler::Config::Entries::crawlerReCrawlStart {true}

Specifies whether to re-crawl the start page every time to keep the URL list up-to-date.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerRemoveXmlInstructions

bool crawlservpp::Module::Crawler::Config::Entries::crawlerRemoveXmlInstructions {true}

Specifies whether to remove XML processing instructions (<?xml:...>) before parsing HTML content.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerRepairCData

bool crawlservpp::Module::Crawler::Config::Entries::crawlerRepairCData {true}

Specifies whether to (try to) repair CData when parsing HTML/XML.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerRepairComments

bool crawlservpp::Module::Crawler::Config::Entries::crawlerRepairComments {true}

Specifies whether to (try to) repair broken HTML/XML comments.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerRestartAfter

std::int32_t crawlservpp::Module::Crawler::Config::Entries::crawlerRestartAfter {defaultRestartAfter}

Time (in s) after which to re-crawl custom URLs once crawling has been completed (-1=deactivated).

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerReTries

std::int64_t crawlservpp::Module::Crawler::Config::Entries::crawlerReTries {defaultReTries}

Number of re-tries on connection errors (-1=infinite).

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerRetryArchive

bool crawlservpp::Module::Crawler::Config::Entries::crawlerRetryArchive {true}

Specifies whether to re-try when retrieving the archived pages fails.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerRetryEmpty

bool crawlservpp::Module::Crawler::Config::Entries::crawlerRetryEmpty {true}

Specifies whether empty responses will be handled like connection errors.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerRetryHttp

std::vector<std::uint32_t> crawlservpp::Module::Crawler::Config::Entries::crawlerRetryHttp {defaultRetryHttp.cbegin(), defaultRetryHttp.cend()}

HTTP errors that will be handled like connection errors.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerSleepError

std::uint64_t crawlservpp::Module::Crawler::Config::Entries::crawlerSleepError {defaultSleepErrorMs}

Sleeping time on connection errors, in milliseconds.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerSleepHttp

std::uint64_t crawlservpp::Module::Crawler::Config::Entries::crawlerSleepHttp {defaultSleepHttpMs}

Time that will be waited between HTTP requests, in milliseconds.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerSleepIdle

std::uint64_t crawlservpp::Module::Crawler::Config::Entries::crawlerSleepIdle {defaultSleepIdleMs}

Time that will be waited before checking for new URLs when all URLs have been crawled, in milliseconds.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerSleepMySql

std::uint64_t crawlservpp::Module::Crawler::Config::Entries::crawlerSleepMySql {defaultSleepMySqlS}

Time to wait before trying to re-connect to the MySQL server, in seconds.

Note: After that time, the current database operation will be lost.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerStart

std::string crawlservpp::Module::Crawler::Config::Entries::crawlerStart {"/"}

Starting point for crawling (should start with / except for cross-domain websites).

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerStartIgnore

bool crawlservpp::Module::Crawler::Config::Entries::crawlerStartIgnore {false}

Specifies whether to not crawl the start page.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerTidyErrors

std::uint32_t crawlservpp::Module::Crawler::Config::Entries::crawlerTidyErrors {}

Number of tidyhtml errors to log (if logging is enabled).

Referenced by crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerTidyWarnings

bool crawlservpp::Module::Crawler::Config::Entries::crawlerTidyWarnings {false}

Specifies whether to log tidyhtml warnings (if logging is enabled).

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerTiming

bool crawlservpp::Module::Crawler::Config::Entries::crawlerTiming {false}

Specifies whether to calculate timing statistics for the crawler.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), crawlservpp::Module::Crawler::Thread::onTick(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerUrlCaseSensitive

bool crawlservpp::Module::Crawler::Config::Entries::crawlerUrlCaseSensitive {true}

Specifies whether URLs are case-sensitive.

Warning: Changes invalidate the hashes of existing URLs!

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerUrlChunks

std::uint64_t crawlservpp::Module::Crawler::Config::Entries::crawlerUrlChunks {defaultUrlChunks}

Number of crawled URLs to be processed at once without possible interruption.

Referenced by crawlservpp::Module::Crawler::Config::checkOptions(), crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerUrlDebug

bool crawlservpp::Module::Crawler::Config::Entries::crawlerUrlDebug {false}

Specifies whether to perform an additional check for duplicates after URL insertion.

For debugging purposes only.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerUrlMaxLength

std::uint16_t crawlservpp::Module::Crawler::Config::Entries::crawlerUrlMaxLength {defaultUrlMaxLength}

Maximum length of URLs to add.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerUrlStartupCheck

bool crawlservpp::Module::Crawler::Config::Entries::crawlerUrlStartupCheck {true}

Specifies whether to check the URL list before starting to crawl.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerWarningsFile

bool crawlservpp::Module::Crawler::Config::Entries::crawlerWarningsFile {false}

Specifies whether to warn when files are found (as opposed to folders).

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ crawlerXml

bool crawlservpp::Module::Crawler::Config::Entries::crawlerXml {false}

Specifies whether to always save crawled content as cleaned XML.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ customCounters

std::vector<std::string> crawlservpp::Module::Crawler::Config::Entries::customCounters

List of counter variables to be replaced in custom URLs.

Referenced by crawlservpp::Module::Crawler::Config::checkOptions(), crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ customCountersAlias

std::vector<std::string> crawlservpp::Module::Crawler::Config::Entries::customCountersAlias

Alias for the counter variable with the same array index.

Referenced by crawlservpp::Module::Crawler::Config::checkOptions(), crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ customCountersAliasAdd

std::vector<std::uint64_t> crawlservpp::Module::Crawler::Config::Entries::customCountersAliasAdd

Value to add to the counter alias with the same array index.

Referenced by crawlservpp::Module::Crawler::Config::checkOptions(), crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ customCountersEnd

std::vector<std::int64_t> crawlservpp::Module::Crawler::Config::Entries::customCountersEnd

End value for the counter with the same array index.

Referenced by crawlservpp::Module::Crawler::Config::checkOptions(), crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ customCountersGlobal

bool crawlservpp::Module::Crawler::Config::Entries::customCountersGlobal {true}

Specifies whether to use every counter for all custom URLs.

Otherwise, a counter will only be used for URLs with the same array index.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ customCountersStart

std::vector<std::int64_t> crawlservpp::Module::Crawler::Config::Entries::customCountersStart

Start value for the counter with the same array index.

Referenced by crawlservpp::Module::Crawler::Config::checkOptions(), crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ customCountersStep

std::vector<std::int64_t> crawlservpp::Module::Crawler::Config::Entries::customCountersStep

Step value for the counter with the same array index.

Referenced by crawlservpp::Module::Crawler::Config::checkOptions(), crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ customReCrawl

bool crawlservpp::Module::Crawler::Config::Entries::customReCrawl {true}

Specifies whether to always re-crawl custom URLs.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ customRobots

bool crawlservpp::Module::Crawler::Config::Entries::customRobots {false}

Specifies whether to add the sitemaps specified in robots.txt as custom URLs.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ customTokenHeaders

std::vector<std::string> crawlservpp::Module::Crawler::Config::Entries::customTokenHeaders

Custom HTTP headers to be used for ALL tokens.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ customTokens

std::vector<std::string> crawlservpp::Module::Crawler::Config::Entries::customTokens

List of token variables to be replaced in custom URLs.

Referenced by crawlservpp::Module::Crawler::Config::checkOptions(), crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ customTokensCookies

std::vector<std::string> crawlservpp::Module::Crawler::Config::Entries::customTokensCookies

Custom HTTP Cookie header for the token with the same array index.

Referenced by crawlservpp::Module::Crawler::Config::checkOptions(), crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ customTokensKeep

std::vector<std::uint32_t> crawlservpp::Module::Crawler::Config::Entries::customTokensKeep

Time until the token with the same array index becomes invalid, in seconds.

Referenced by crawlservpp::Module::Crawler::Config::checkOptions(), crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ customTokensQuery

std::vector<std::uint64_t> crawlservpp::Module::Crawler::Config::Entries::customTokensQuery

Query to extract the token with the same array index.

Referenced by crawlservpp::Module::Crawler::Config::checkOptions(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ customTokensRequired

std::vector<bool> crawlservpp::Module::Crawler::Config::Entries::customTokensRequired

Determines whether an error occurs if the token with the same array index is empty.

Referenced by crawlservpp::Module::Crawler::Config::checkOptions(), crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ customTokensSource

std::vector<std::string> crawlservpp::Module::Crawler::Config::Entries::customTokensSource

Source URL for the token with the same array index (absolute link without protocol).

Referenced by crawlservpp::Module::Crawler::Config::checkOptions(), crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ customTokensUsePost

std::vector<bool> crawlservpp::Module::Crawler::Config::Entries::customTokensUsePost

Use HTTP POST instead of GET for the token with the same array index.

Referenced by crawlservpp::Module::Crawler::Config::checkOptions(), crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ customUrls

std::vector<std::string> crawlservpp::Module::Crawler::Config::Entries::customUrls

Custom URLs for crawling (should all start with / except for cross-domain websites).

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ customUsePost

bool crawlservpp::Module::Crawler::Config::Entries::customUsePost {false}

Specifies whether to use HTTP POST instead of HTTP GET to retrieve custom URLs.

Has no effect after dynamic redirects.

See also: redirectUsePost

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ expectedErrorIfLarger

bool crawlservpp::Module::Crawler::Config::Entries::expectedErrorIfLarger {false}

Specifies whether to throw an exception when the expected number of URLs is exceeded.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ expectedErrorIfSmaller

bool crawlservpp::Module::Crawler::Config::Entries::expectedErrorIfSmaller {false}

Specifies whether to throw an exception when the number of URLs falls short of the expected number.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ expectedQuery

std::uint64_t crawlservpp::Module::Crawler::Config::Entries::expectedQuery {}

Query to be performed on content to retrieve the expected number of URLs.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ redirectCookies

std::string crawlservpp::Module::Crawler::Config::Entries::redirectCookies

Custom HTTP Cookie header on dynamic redirect.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ redirectHeaders

std::vector<std::string> crawlservpp::Module::Crawler::Config::Entries::redirectHeaders

Custom HTTP headers on dynamic redirect.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ redirectQueryContent

std::uint64_t crawlservpp::Module::Crawler::Config::Entries::redirectQueryContent {}

Query on content that specifies whether to redirect to a different URL.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ redirectQueryUrl

std::uint64_t crawlservpp::Module::Crawler::Config::Entries::redirectQueryUrl {}

Query on URL that specifies whether to redirect to a different URL.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ redirectTo

std::string crawlservpp::Module::Crawler::Config::Entries::redirectTo

Sub-URL (for cross-domain websites: URL without protocol) to redirect to.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ redirectUsePost

bool crawlservpp::Module::Crawler::Config::Entries::redirectUsePost {false}

Specifies whether to use HTTP POST instead of HTTP GET to retrieve a URL after redirect.

Referenced by crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ redirectVarNames

std::vector<std::string> crawlservpp::Module::Crawler::Config::Entries::redirectVarNames

Variable names to be replaced on redirect.

Names will be replaced in the values of Config::Entries::redirectTo, Config::Entries::redirectCookies, and Config::Entries::redirectHeaders.

Referenced by crawlservpp::Module::Crawler::Config::checkOptions(), crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ redirectVarQueries

std::vector<std::uint64_t> crawlservpp::Module::Crawler::Config::Entries::redirectVarQueries

Query on variable source to retrieve the value of the variable with the same index.

Referenced by crawlservpp::Module::Crawler::Config::checkOptions(), and crawlservpp::Module::Crawler::Config::parseOption().

◆ redirectVarSources

std::vector<std::uint8_t> crawlservpp::Module::Crawler::Config::Entries::redirectVarSources

Source type of the variable with the same index.

See also: redirectSourceUrl, redirectContentUrl

Referenced by crawlservpp::Module::Crawler::Config::checkOptions(), crawlservpp::Module::Crawler::Thread::onReset(), and crawlservpp::Module::Crawler::Config::parseOption().

The documentation for this struct was generated from the following file:

Module/Crawler/Config.hpp

Crawler Configuration

Custom URLs

Expected Number of Results

Dynamic Redirect

Detailed Description

Member Data Documentation

◆ crawlerArchives

◆ crawlerArchivesNames

◆ crawlerArchivesOnly

◆ crawlerArchivesUrlsMemento

◆ crawlerArchivesUrlsSkip

◆ crawlerArchivesUrlsTimemap

◆ crawlerLock

◆ crawlerLogging

◆ crawlerMaxBatchSize

◆ crawlerParamsAdd

◆ crawlerParamsBlackList

◆ crawlerParamsWhiteList

◆ crawlerQueriesBlackListContent

◆ crawlerQueriesBlackListTypes

◆ crawlerQueriesBlackListUrls

◆ crawlerQueriesLinks

◆ crawlerQueriesLinksBlackListContent

◆ crawlerQueriesLinksBlackListTypes

◆ crawlerQueriesLinksBlackListUrls

◆ crawlerQueriesLinksWhiteListContent

◆ crawlerQueriesLinksWhiteListTypes

◆ crawlerQueriesLinksWhiteListUrls

◆ crawlerQueriesWhiteListContent

◆ crawlerQueriesWhiteListTypes

◆ crawlerQueriesWhiteListUrls

◆ crawlerReCrawl

◆ crawlerReCrawlAlways

◆ crawlerReCrawlStart

◆ crawlerRemoveXmlInstructions

◆ crawlerRepairCData

◆ crawlerRepairComments

◆ crawlerRestartAfter

◆ crawlerReTries

◆ crawlerRetryArchive

◆ crawlerRetryEmpty

◆ crawlerRetryHttp

◆ crawlerSleepError

◆ crawlerSleepHttp

◆ crawlerSleepIdle

◆ crawlerSleepMySql

◆ crawlerStart

◆ crawlerStartIgnore

◆ crawlerTidyErrors

◆ crawlerTidyWarnings

◆ crawlerTiming

◆ crawlerUrlCaseSensitive

◆ crawlerUrlChunks

◆ crawlerUrlDebug

◆ crawlerUrlMaxLength

◆ crawlerUrlStartupCheck

◆ crawlerWarningsFile

◆ crawlerXml

◆ customCounters

◆ customCountersAlias

◆ customCountersAliasAdd

◆ customCountersEnd

◆ customCountersGlobal

◆ customCountersStart

◆ customCountersStep

◆ customReCrawl

◆ customRobots

◆ customTokenHeaders

◆ customTokens

◆ customTokensCookies

◆ customTokensKeep

◆ customTokensQuery

◆ customTokensRequired

◆ customTokensSource

◆ customTokensUsePost

◆ customUrls

◆ customUsePost

◆ expectedErrorIfLarger

◆ expectedErrorIfSmaller

◆ expectedQuery