Namespace for crawler classes. More...

Classes
class	Config
	Configuration for crawlers. More...

class	Database
	Class providing database functionality for crawler threads by implementing Wrapper::Database. More...

class	Thread
	Crawler thread. More...

Constants
constexpr std::uint8_t	crawlerLoggingSilent {0}
	Logging is disabled. More...

constexpr std::uint8_t	crawlerLoggingDefault {1}
	Default logging is enabled. More...

constexpr std::uint8_t	crawlerLoggingExtended {2}
	Extended logging is enabled. More...

constexpr std::uint8_t	crawlerLoggingVerbose {3}
	Verbose logging is enabled. More...

constexpr std::uint8_t	redirectSourceUrl {0}
	Performing a query on the URL of a crawled web page to determine whether to redirect. More...

constexpr std::uint8_t	redirectSourceContent {1}
	Performing a query on the content of a crawled web page to determine whether to redirect. More...

constexpr std::uint32_t	defaultCrawlerLockS {300}
	Default time to lock URLs that are being processed, in seconds. More...

constexpr std::uint16_t	defaultMaxBatchSize {500}
	Default number of URLs to be processed in one MySQL query. More...

constexpr std::int32_t	defaultRestartAfter {-1}
	Default time, in seconds, after which to re-crawl custom URLs once crawling has been completed (-1 = deactivated). More...

constexpr std::int64_t	defaultReTries {720}
	Default number of re-tries on connection errors. More...

constexpr std::array	defaultRetryHttp {429, 502, 503, 504, 521, 522, 524}
	HTTP errors that will be handled like connection errors by default. More...

constexpr std::uint64_t	defaultSleepErrorMs {10000}
	Default sleeping time on connection errors, in milliseconds. More...

constexpr std::uint64_t	defaultSleepHttpMs {0}
	Default time to wait between HTTP requests, in milliseconds. More...

constexpr std::uint64_t	defaultSleepIdleMs {5000}
	Default time to wait before checking for new URLs when all URLs have been crawled, in milliseconds. More...

constexpr std::uint64_t	defaultSleepMySqlS {60}
	Default time to wait before the first try to re-connect to the MySQL server, in seconds. More...

constexpr std::uint64_t	defaultUrlChunks {5000}
	Default number of crawled URLs to be processed at once without possible interruption. More...

constexpr std::uint16_t	defaultUrlMaxLength {2000}
	Default maximum length of URLs to add. More...

constexpr auto	maxContentSize {1073741824}
	Maximum size of database content (= 1 GiB). More...

constexpr auto	maxContentSizeString {"1 GiB"sv}
	Maximum size of database content as string. More...

constexpr auto	robotsMinLineLength {9}
	The minimum length of a robots.txt line containing a useful sitemap. More...

constexpr auto	robotsFirstLetters {7}
	The number of first letters of a robots.txt line containing a sitemap. More...

constexpr auto	robotsSitemapBegin {"sitemap:"sv}
	The beginning of a robots.txt line containing a sitemap. More...

constexpr auto	robotsRelativeUrl {"/robots.txt"sv}
	The relative URL of robots.txt. More...

constexpr auto	updateCustomUrlCountEvery {100}
	The number of custom URLs after which the thread status will be updated. More...

constexpr auto	httpResponseCodeMin {400}
	Minimum HTTP error code. More...

constexpr auto	httpResponseCodeMax {599}
	Maximum HTTP error code. More...

constexpr auto	httpResponseCodeIgnore {200}
	HTTP response code to be ignored when checking for errors. More...

constexpr auto	wwwString {"www."sv}
	The "www." at the beginning of a domain. More...

constexpr auto	httpsString {"https://"sv}
	The beginning of a URL containing the HTTPS protocol. More...

constexpr auto	httpsIgnoreString {"https://www."sv}
	The beginning of an HTTPS URL to be ignored. More...

constexpr auto	httpString {"http://"sv}
	The beginning of a URL containing the HTTP protocol. More...

constexpr auto	httpIgnoreString {"http://www."sv}
	The beginning of an HTTP URL to be ignored. More...

constexpr auto	archiveMementoContentType {"application/link-format"sv}
	The content type of a memento. More...

constexpr auto	archiveRefString {"found capture at "sv}
	The reference string in a memento referencing another memento. More...

constexpr auto	archiveRefTimeStampLength {14}
	The length of a memento time stamp. More...

constexpr auto	archiveRenewUrlLockEveryMs {1000}
	Number of milliseconds before renewing the URL lock while crawling archives. More...

Constants for MySQL Queries
constexpr auto	nAtOnce10 {10}
	Process ten values at once. More...

constexpr auto	nAtOnce100 {100}
	Process one hundred values at once. More...

constexpr auto	sqlArg1 {1}
	First argument in a SQL query. More...

constexpr auto	sqlArg2 {2}
	Second argument in a SQL query. More...

constexpr auto	sqlArg3 {3}
	Third argument in a SQL query. More...

constexpr auto	sqlArg4 {4}
	Fourth argument in a SQL query. More...

constexpr auto	sqlArg5 {5}
	Fifth argument in a SQL query. More...

constexpr auto	crawlingTableAlias {"a"sv}
	Alias, used in SQL queries, for the crawling table. More...

constexpr auto	urlListTableAlias {"b"sv}
	Alias, used in SQL queries, for the URL list table. More...

constexpr auto	numArgsAddUrl {5}
	Number of arguments for adding one URL. More...

Detailed Description

Namespace for crawler classes.

Variable Documentation

◆ archiveMementoContentType

constexpr auto crawlservpp::Module::Crawler::archiveMementoContentType {"application/link-format"sv}

inline

The content type of a memento.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

◆ archiveRefString

constexpr auto crawlservpp::Module::Crawler::archiveRefString {"found capture at "sv}

inline

The reference string in a memento referencing another memento.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

◆ archiveRefTimeStampLength

constexpr auto crawlservpp::Module::Crawler::archiveRefTimeStampLength {14}

inline

The length of a memento time stamp.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

◆ archiveRenewUrlLockEveryMs

constexpr auto crawlservpp::Module::Crawler::archiveRenewUrlLockEveryMs {1000}

inline

Number of milliseconds before renewing the URL lock while crawling archives.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

◆ crawlerLoggingDefault

constexpr std::uint8_t crawlservpp::Module::Crawler::crawlerLoggingDefault {1}

inline

Default logging is enabled.

Referenced by crawlservpp::Module::Crawler::Thread::onClear(), and crawlservpp::Module::Crawler::Thread::onReset().

◆ crawlerLoggingExtended

constexpr std::uint8_t crawlservpp::Module::Crawler::crawlerLoggingExtended {2}

inline

Extended logging is enabled.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

◆ crawlerLoggingSilent

constexpr std::uint8_t crawlservpp::Module::Crawler::crawlerLoggingSilent {0}

inline

Logging is disabled.

◆ crawlerLoggingVerbose

constexpr std::uint8_t crawlservpp::Module::Crawler::crawlerLoggingVerbose {3}

inline

Verbose logging is enabled.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

◆ crawlingTableAlias

constexpr auto crawlservpp::Module::Crawler::crawlingTableAlias {"a"sv}

inline

Alias, used in SQL queries, for the crawling table.

Referenced by crawlservpp::Module::Crawler::Database::prepare().

◆ defaultCrawlerLockS

constexpr std::uint32_t crawlservpp::Module::Crawler::defaultCrawlerLockS {300}

inline

Default time to lock URLs that are being processed, in seconds.

◆ defaultMaxBatchSize

constexpr std::uint16_t crawlservpp::Module::Crawler::defaultMaxBatchSize {500}

inline

Default number of URLs to be processed in one MySQL query.

◆ defaultRestartAfter

constexpr std::int32_t crawlservpp::Module::Crawler::defaultRestartAfter {-1}

inline

Default time, in seconds, after which to re-crawl custom URLs once crawling has been completed (-1 = deactivated).

◆ defaultReTries

constexpr std::int64_t crawlservpp::Module::Crawler::defaultReTries {720}

inline

Default number of re-tries on connection errors.

◆ defaultRetryHttp

constexpr std::array crawlservpp::Module::Crawler::defaultRetryHttp {429, 502, 503, 504, 521, 522, 524}

inline

HTTP errors that will be handled like connection errors by default.

◆ defaultSleepErrorMs

constexpr std::uint64_t crawlservpp::Module::Crawler::defaultSleepErrorMs {10000}

inline

Default sleeping time on connection errors, in milliseconds.

◆ defaultSleepHttpMs

constexpr std::uint64_t crawlservpp::Module::Crawler::defaultSleepHttpMs {0}

inline

Default time to wait between HTTP requests, in milliseconds.

◆ defaultSleepIdleMs

constexpr std::uint64_t crawlservpp::Module::Crawler::defaultSleepIdleMs {5000}

inline

Default time to wait before checking for new URLs when all URLs have been crawled, in milliseconds.

◆ defaultSleepMySqlS

constexpr std::uint64_t crawlservpp::Module::Crawler::defaultSleepMySqlS {60}

inline

Default time to wait before the first try to re-connect to the MySQL server, in seconds.

◆ defaultUrlChunks

constexpr std::uint64_t crawlservpp::Module::Crawler::defaultUrlChunks {5000}

inline

Default number of crawled URLs to be processed at once without possible interruption.

Referenced by crawlservpp::Module::Crawler::Config::checkOptions().

◆ defaultUrlMaxLength

constexpr std::uint16_t crawlservpp::Module::Crawler::defaultUrlMaxLength {2000}

inline

Default maximum length of URLs to add.

◆ httpIgnoreString

constexpr auto crawlservpp::Module::Crawler::httpIgnoreString {"http://www."sv}

inline

The beginning of an HTTP URL to be ignored.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

◆ httpResponseCodeIgnore

constexpr auto crawlservpp::Module::Crawler::httpResponseCodeIgnore {200}

inline

HTTP response code to be ignored when checking for errors.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

◆ httpResponseCodeMax

constexpr auto crawlservpp::Module::Crawler::httpResponseCodeMax {599}

inline

Maximum HTTP error code.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

◆ httpResponseCodeMin

constexpr auto crawlservpp::Module::Crawler::httpResponseCodeMin {400}

inline

Minimum HTTP error code.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

◆ httpsIgnoreString

constexpr auto crawlservpp::Module::Crawler::httpsIgnoreString {"https://www."sv}

inline

The beginning of an HTTPS URL to be ignored.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

◆ httpsString

constexpr auto crawlservpp::Module::Crawler::httpsString {"https://"sv}

inline

The beginning of a URL containing the HTTPS protocol.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

◆ httpString

constexpr auto crawlservpp::Module::Crawler::httpString {"http://"sv}

inline

The beginning of a URL containing the HTTP protocol.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

◆ maxContentSize

constexpr auto crawlservpp::Module::Crawler::maxContentSize {1073741824}

inline

Maximum size of database content (= 1 GiB).

Referenced by crawlservpp::Module::Crawler::Database::saveArchivedContent(), and crawlservpp::Module::Crawler::Database::saveContent().

◆ maxContentSizeString

constexpr auto crawlservpp::Module::Crawler::maxContentSizeString {"1 GiB"sv}

inline

Maximum size of database content as string.

Referenced by crawlservpp::Module::Crawler::Database::saveArchivedContent(), and crawlservpp::Module::Crawler::Database::saveContent().

◆ nAtOnce10

constexpr auto crawlservpp::Module::Crawler::nAtOnce10 {10}

inline

Process ten values at once.

Referenced by crawlservpp::Module::Crawler::Database::addUrlsIfNotExist(), and crawlservpp::Module::Crawler::Database::prepare().

◆ nAtOnce100

constexpr auto crawlservpp::Module::Crawler::nAtOnce100 {100}

inline

Process one hundred values at once.

Referenced by crawlservpp::Module::Crawler::Database::addUrlsIfNotExist(), and crawlservpp::Module::Crawler::Database::prepare().

◆ numArgsAddUrl

constexpr auto crawlservpp::Module::Crawler::numArgsAddUrl {5}

inline

Number of arguments for adding one URL.

Referenced by crawlservpp::Module::Crawler::Database::addUrlsIfNotExist().

◆ redirectSourceContent

constexpr std::uint8_t crawlservpp::Module::Crawler::redirectSourceContent {1}

inline

Performing a query on the content of a crawled web page to determine whether to redirect.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

◆ redirectSourceUrl

constexpr std::uint8_t crawlservpp::Module::Crawler::redirectSourceUrl {0}

inline

Performing a query on the URL of a crawled web page to determine whether to redirect.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

◆ robotsFirstLetters

constexpr auto crawlservpp::Module::Crawler::robotsFirstLetters {7}

inline

The number of first letters of a robots.txt line containing a sitemap.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

◆ robotsMinLineLength

constexpr auto crawlservpp::Module::Crawler::robotsMinLineLength {9}

inline

The minimum length of a robots.txt line containing a useful sitemap.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

◆ robotsRelativeUrl

constexpr auto crawlservpp::Module::Crawler::robotsRelativeUrl {"/robots.txt"sv}

inline

The relative URL of robots.txt.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

◆ robotsSitemapBegin

constexpr auto crawlservpp::Module::Crawler::robotsSitemapBegin {"sitemap:"sv}

inline

The beginning of a robots.txt line containing a sitemap.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

◆ sqlArg1

constexpr auto crawlservpp::Module::Crawler::sqlArg1 {1}

inline

First argument in a SQL query.

◆ sqlArg2

constexpr auto crawlservpp::Module::Crawler::sqlArg2 {2}

inline

Second argument in a SQL query.

Referenced by crawlservpp::Module::Crawler::Database::addUrlIfNotExists(), crawlservpp::Module::Crawler::Database::addUrlsIfNotExist(), crawlservpp::Module::Crawler::Database::getUrlId(), crawlservpp::Module::Crawler::Database::isArchivedContentExists(), crawlservpp::Module::Crawler::Database::lockUrlIfOk(), crawlservpp::Module::Crawler::Database::saveArchivedContent(), crawlservpp::Module::Crawler::Database::saveContent(), crawlservpp::Module::Crawler::Database::setUrlFinishedIfOk(), and crawlservpp::Module::Crawler::Database::unLockUrlIfOk().

◆ sqlArg3

constexpr auto crawlservpp::Module::Crawler::sqlArg3 {3}

inline

Third argument in a SQL query.

Referenced by crawlservpp::Module::Crawler::Database::addUrlIfNotExists(), crawlservpp::Module::Crawler::Database::addUrlsIfNotExist(), crawlservpp::Module::Crawler::Database::isArchivedContentExists(), crawlservpp::Module::Crawler::Database::lockUrlIfOk(), crawlservpp::Module::Crawler::Database::saveArchivedContent(), and crawlservpp::Module::Crawler::Database::saveContent().

◆ sqlArg4

constexpr auto crawlservpp::Module::Crawler::sqlArg4 {4}

inline

Fourth argument in a SQL query.

Referenced by crawlservpp::Module::Crawler::Database::addUrlIfNotExists(), crawlservpp::Module::Crawler::Database::addUrlsIfNotExist(), crawlservpp::Module::Crawler::Database::lockUrlIfOk(), crawlservpp::Module::Crawler::Database::saveArchivedContent(), and crawlservpp::Module::Crawler::Database::saveContent().

◆ sqlArg5

constexpr auto crawlservpp::Module::Crawler::sqlArg5 {5}

inline

Fifth argument in a SQL query.

Referenced by crawlservpp::Module::Crawler::Database::addUrlIfNotExists(), crawlservpp::Module::Crawler::Database::addUrlsIfNotExist(), and crawlservpp::Module::Crawler::Database::saveArchivedContent().

◆ updateCustomUrlCountEvery

constexpr auto crawlservpp::Module::Crawler::updateCustomUrlCountEvery {100}

inline

The number of custom URLs after which the thread status will be updated.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

◆ urlListTableAlias

constexpr auto crawlservpp::Module::Crawler::urlListTableAlias {"b"sv}

inline

Alias, used in SQL queries, for the URL list table.

Referenced by crawlservpp::Module::Crawler::Database::isArchivedContentExists(), and crawlservpp::Module::Crawler::Database::prepare().

◆ wwwString

constexpr auto crawlservpp::Module::Crawler::wwwString {"www."sv}

inline

The "www." at the beginning of a domain.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

Classes

Constants

Constants for MySQL Queries

Detailed Description

Variable Documentation

◆ archiveMementoContentType

◆ archiveRefString

◆ archiveRefTimeStampLength

◆ archiveRenewUrlLockEveryMs

◆ crawlerLoggingDefault

◆ crawlerLoggingExtended

◆ crawlerLoggingSilent

◆ crawlerLoggingVerbose

◆ crawlingTableAlias

◆ defaultCrawlerLockS

◆ defaultMaxBatchSize

◆ defaultRestartAfter

◆ defaultReTries

◆ defaultRetryHttp

◆ defaultSleepErrorMs

◆ defaultSleepHttpMs

◆ defaultSleepIdleMs

◆ defaultSleepMySqlS

◆ defaultUrlChunks

◆ defaultUrlMaxLength

◆ httpIgnoreString

◆ httpResponseCodeIgnore

◆ httpResponseCodeMax

◆ httpResponseCodeMin

◆ httpsIgnoreString

◆ httpsString

◆ httpString

◆ maxContentSize

◆ maxContentSizeString

◆ nAtOnce10

◆ nAtOnce100

◆ numArgsAddUrl

◆ redirectSourceContent

◆ redirectSourceUrl

◆ robotsFirstLetters

◆ robotsMinLineLength

◆ robotsRelativeUrl

◆ robotsSitemapBegin

◆ sqlArg1

◆ sqlArg2

◆ sqlArg3

◆ sqlArg4

◆ sqlArg5

◆ updateCustomUrlCountEvery

◆ urlListTableAlias

◆ wwwString