crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
crawlservpp::Module::Crawler Namespace Reference

Namespace for crawler classes. More...

Classes

class  Config
 Configuration for crawlers. More...
 
class  Database
 Class providing database functionality for crawler threads by implementing Wrapper::Database. More...
 
class  Thread
 Crawler thread. More...
 

Constants

constexpr std::uint8_t crawlerLoggingSilent {0}
 Logging is disabled. More...
 
constexpr std::uint8_t crawlerLoggingDefault {1}
 Default logging is enabled. More...
 
constexpr std::uint8_t crawlerLoggingExtended {2}
 Extended logging is enabled. More...
 
constexpr std::uint8_t crawlerLoggingVerbose {3}
 Verbose logging is enabled. More...
 
constexpr std::uint8_t redirectSourceUrl {0}
 Performing a query on the URL of a crawled web page to determine whether to redirect. More...
 
constexpr std::uint8_t redirectSourceContent {1}
 Performing a query on the content of a crawled web page to determine whether to redirect. More...
 
constexpr std::uint32_t defaultCrawlerLockS {300}
 Default time to lock URLs that are being processed, in seconds. More...
 
constexpr std::uint16_t defaultMaxBatchSize {500}
 Default number of URLs to be processed in one MySQL query. More...
 
constexpr std::int32_t defaultRestartAfter {-1}
 Default time (in s) after which to re-crawl custom URLs once crawling has been completed (-1=deactivated). More...
 
constexpr std::int64_t defaultReTries {720}
 Default number of re-tries on connection errors. More...
 
constexpr std::array defaultRetryHttp {429, 502, 503, 504, 521, 522, 524}
 HTTP errors that will be handled like connection errors by default. More...
 
constexpr std::uint64_t defaultSleepErrorMs {10000}
 Default sleeping time on connection errors, in milliseconds. More...
 
constexpr std::uint64_t defaultSleepHttpMs {0}
 Default time to wait between HTTP requests, in milliseconds. More...
 
constexpr std::uint64_t defaultSleepIdleMs {5000}
 Default time to wait before checking for new URLs once all URLs have been crawled, in milliseconds. More...
 
constexpr std::uint64_t defaultSleepMySqlS {60}
 Default time to wait before the first try to re-connect to the MySQL server, in seconds. More...
 
constexpr std::uint64_t defaultUrlChunks {5000}
 Default number of crawled URLs to be processed at once without possible interruption. More...
 
constexpr std::uint16_t defaultUrlMaxLength {2000}
 Default maximum length of URLs to add. More...
 
constexpr auto maxContentSize {1073741824}
 Maximum size of database content (= 1 GiB). More...
 
constexpr auto maxContentSizeString {"1 GiB"sv}
 Maximum size of database content as string. More...
 
constexpr auto robotsMinLineLength {9}
 The minimum length of a robots.txt line containing a useful sitemap. More...
 
constexpr auto robotsFirstLetters {7}
 The number of letters at the beginning of a robots.txt line containing a sitemap. More...
 
constexpr auto robotsSitemapBegin {"sitemap:"sv}
 The beginning of a robots.txt line containing a sitemap. More...
 
constexpr auto robotsRelativeUrl {"/robots.txt"sv}
 The relative URL of robots.txt. More...
 
constexpr auto updateCustomUrlCountEvery {100}
 The number of custom URLs after which the thread status will be updated. More...
 
constexpr auto httpResponseCodeMin {400}
 Minimum HTTP error code. More...
 
constexpr auto httpResponseCodeMax {599}
 Maximum HTTP error code. More...
 
constexpr auto httpResponseCodeIgnore {200}
 HTTP response code to be ignored when checking for errors. More...
 
constexpr auto wwwString {"www."sv}
 The "www." in the beginning of a domain. More...
 
constexpr auto httpsString {"https://"sv}
 The beginning of a URL containing the HTTPS protocol. More...
 
constexpr auto httpsIgnoreString {"https://www."sv}
 The beginning of an HTTPS URL to be ignored. More...
 
constexpr auto httpString {"http://"sv}
 The beginning of a URL containing the HTTP protocol. More...
 
constexpr auto httpIgnoreString {"http://www."sv}
 The beginning of an HTTP URL to be ignored. More...
 
constexpr auto archiveMementoContentType {"application/link-format"sv}
 The content type of a memento. More...
 
constexpr auto archiveRefString {"found capture at "sv}
 The reference string in a memento referencing another memento. More...
 
constexpr auto archiveRefTimeStampLength {14}
 The length of a memento time stamp. More...
 
constexpr auto archiveRenewUrlLockEveryMs {1000}
 Number of milliseconds before renewing URL lock while crawling archives. More...
 

Constants for MySQL Queries

constexpr auto nAtOnce10 {10}
 Process ten values at once. More...
 
constexpr auto nAtOnce100 {100}
 Process one hundred values at once. More...
 
constexpr auto sqlArg1 {1}
 First argument in a SQL query. More...
 
constexpr auto sqlArg2 {2}
 Second argument in a SQL query. More...
 
constexpr auto sqlArg3 {3}
 Third argument in a SQL query. More...
 
constexpr auto sqlArg4 {4}
 Fourth argument in a SQL query. More...
 
constexpr auto sqlArg5 {5}
 Fifth argument in a SQL query. More...
 
constexpr auto crawlingTableAlias {"a"sv}
 Alias, used in SQL queries, for the crawling table. More...
 
constexpr auto urlListTableAlias {"b"sv}
 Alias, used in SQL queries, for the URL list table. More...
 
constexpr auto numArgsAddUrl {5}
 Number of arguments for adding one URL. More...
 

Detailed Description

Namespace for crawler classes.

Variable Documentation

◆ archiveMementoContentType

constexpr auto crawlservpp::Module::Crawler::archiveMementoContentType {"application/link-format"sv}
inline

The content type of a memento.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

◆ archiveRefString

constexpr auto crawlservpp::Module::Crawler::archiveRefString {"found capture at "sv}
inline

The reference string in a memento referencing another memento.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

◆ archiveRefTimeStampLength

constexpr auto crawlservpp::Module::Crawler::archiveRefTimeStampLength {14}
inline

The length of a memento time stamp.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

◆ archiveRenewUrlLockEveryMs

constexpr auto crawlservpp::Module::Crawler::archiveRenewUrlLockEveryMs {1000}
inline

Number of milliseconds before renewing URL lock while crawling archives.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

◆ crawlerLoggingDefault

constexpr std::uint8_t crawlservpp::Module::Crawler::crawlerLoggingDefault {1}
inline

Default logging is enabled.

◆ crawlerLoggingExtended

constexpr std::uint8_t crawlservpp::Module::Crawler::crawlerLoggingExtended {2}
inline

Extended logging is enabled.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

◆ crawlerLoggingSilent

constexpr std::uint8_t crawlservpp::Module::Crawler::crawlerLoggingSilent {0}
inline

Logging is disabled.

◆ crawlerLoggingVerbose

constexpr std::uint8_t crawlservpp::Module::Crawler::crawlerLoggingVerbose {3}
inline

Verbose logging is enabled.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

◆ crawlingTableAlias

constexpr auto crawlservpp::Module::Crawler::crawlingTableAlias {"a"sv}
inline

Alias, used in SQL queries, for the crawling table.

Referenced by crawlservpp::Module::Crawler::Database::prepare().

◆ defaultCrawlerLockS

constexpr std::uint32_t crawlservpp::Module::Crawler::defaultCrawlerLockS {300}
inline

Default time to lock URLs that are being processed, in seconds.

◆ defaultMaxBatchSize

constexpr std::uint16_t crawlservpp::Module::Crawler::defaultMaxBatchSize {500}
inline

Default number of URLs to be processed in one MySQL query.

◆ defaultRestartAfter

constexpr std::int32_t crawlservpp::Module::Crawler::defaultRestartAfter {-1}
inline

Default time (in s) after which to re-crawl custom URLs once crawling has been completed (-1=deactivated).

◆ defaultReTries

constexpr std::int64_t crawlservpp::Module::Crawler::defaultReTries {720}
inline

Default number of re-tries on connection errors.

◆ defaultRetryHttp

constexpr std::array crawlservpp::Module::Crawler::defaultRetryHttp {429, 502, 503, 504, 521, 522, 524}
inline

HTTP errors that will be handled like connection errors by default.

◆ defaultSleepErrorMs

constexpr std::uint64_t crawlservpp::Module::Crawler::defaultSleepErrorMs {10000}
inline

Default sleeping time on connection errors, in milliseconds.

◆ defaultSleepHttpMs

constexpr std::uint64_t crawlservpp::Module::Crawler::defaultSleepHttpMs {0}
inline

Default time to wait between HTTP requests, in milliseconds.

◆ defaultSleepIdleMs

constexpr std::uint64_t crawlservpp::Module::Crawler::defaultSleepIdleMs {5000}
inline

Default time to wait before checking for new URLs once all URLs have been crawled, in milliseconds.

◆ defaultSleepMySqlS

constexpr std::uint64_t crawlservpp::Module::Crawler::defaultSleepMySqlS {60}
inline

Default time to wait before the first try to re-connect to the MySQL server, in seconds.

◆ defaultUrlChunks

constexpr std::uint64_t crawlservpp::Module::Crawler::defaultUrlChunks {5000}
inline

Default number of crawled URLs to be processed at once without possible interruption.

Referenced by crawlservpp::Module::Crawler::Config::checkOptions().

◆ defaultUrlMaxLength

constexpr std::uint16_t crawlservpp::Module::Crawler::defaultUrlMaxLength {2000}
inline

Default maximum length of URLs to add.

◆ httpIgnoreString

constexpr auto crawlservpp::Module::Crawler::httpIgnoreString {"http://www."sv}
inline

The beginning of an HTTP URL to be ignored.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

◆ httpResponseCodeIgnore

constexpr auto crawlservpp::Module::Crawler::httpResponseCodeIgnore {200}
inline

HTTP response code to be ignored when checking for errors.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

◆ httpResponseCodeMax

constexpr auto crawlservpp::Module::Crawler::httpResponseCodeMax {599}
inline

Maximum HTTP error code.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

◆ httpResponseCodeMin

constexpr auto crawlservpp::Module::Crawler::httpResponseCodeMin {400}
inline

Minimum HTTP error code.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

◆ httpsIgnoreString

constexpr auto crawlservpp::Module::Crawler::httpsIgnoreString {"https://www."sv}
inline

The beginning of an HTTPS URL to be ignored.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

◆ httpsString

constexpr auto crawlservpp::Module::Crawler::httpsString {"https://"sv}
inline

The beginning of a URL containing the HTTPS protocol.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

◆ httpString

constexpr auto crawlservpp::Module::Crawler::httpString {"http://"sv}
inline

The beginning of a URL containing the HTTP protocol.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

◆ maxContentSize

constexpr auto crawlservpp::Module::Crawler::maxContentSize {1073741824}
inline

Maximum size of database content (= 1 GiB).

◆ maxContentSizeString

constexpr auto crawlservpp::Module::Crawler::maxContentSizeString {"1 GiB"sv}
inline

Maximum size of database content as string.

◆ nAtOnce10

constexpr auto crawlservpp::Module::Crawler::nAtOnce10 {10}
inline

Process ten values at once.

◆ nAtOnce100

constexpr auto crawlservpp::Module::Crawler::nAtOnce100 {100}
inline

Process one hundred values at once.

◆ numArgsAddUrl

constexpr auto crawlservpp::Module::Crawler::numArgsAddUrl {5}
inline

Number of arguments for adding one URL.

Referenced by crawlservpp::Module::Crawler::Database::addUrlsIfNotExist().

◆ redirectSourceContent

constexpr std::uint8_t crawlservpp::Module::Crawler::redirectSourceContent {1}
inline

Performing a query on the content of a crawled web page to determine whether to redirect.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

◆ redirectSourceUrl

constexpr std::uint8_t crawlservpp::Module::Crawler::redirectSourceUrl {0}
inline

Performing a query on the URL of a crawled web page to determine whether to redirect.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

◆ robotsFirstLetters

constexpr auto crawlservpp::Module::Crawler::robotsFirstLetters {7}
inline

The number of letters at the beginning of a robots.txt line containing a sitemap.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

◆ robotsMinLineLength

constexpr auto crawlservpp::Module::Crawler::robotsMinLineLength {9}
inline

The minimum length of a robots.txt line containing a useful sitemap.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

◆ robotsRelativeUrl

constexpr auto crawlservpp::Module::Crawler::robotsRelativeUrl {"/robots.txt"sv}
inline

The relative URL of robots.txt.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

◆ robotsSitemapBegin

constexpr auto crawlservpp::Module::Crawler::robotsSitemapBegin {"sitemap:"sv}
inline

The beginning of a robots.txt line containing a sitemap.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

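◆ sqlArg1

constexpr auto crawlservpp::Module::Crawler::sqlArg1 {1}
inline

First argument in a SQL query.

◆ sqlArg2

constexpr auto crawlservpp::Module::Crawler::sqlArg2 {2}
inline

Second argument in a SQL query.

◆ sqlArg3

constexpr auto crawlservpp::Module::Crawler::sqlArg3 {3}
inline

Third argument in a SQL query.

◆ sqlArg4

constexpr auto crawlservpp::Module::Crawler::sqlArg4 {4}
inline

Fourth argument in a SQL query.

◆ sqlArg5

constexpr auto crawlservpp::Module::Crawler::sqlArg5 {5}
inline

Fifth argument in a SQL query.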

◆ updateCustomUrlCountEvery

constexpr auto crawlservpp::Module::Crawler::updateCustomUrlCountEvery {100}
inline

The number of custom URLs after which the thread status will be updated.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().

◆ urlListTableAlias

constexpr auto crawlservpp::Module::Crawler::urlListTableAlias {"b"sv}
inline

Alias, used in SQL queries, for the URL list table.

◆ wwwString

constexpr auto crawlservpp::Module::Crawler::wwwString {"www."sv}
inline

The "www." in the beginning of a domain.

Referenced by crawlservpp::Module::Crawler::Thread::onReset().