crawlserv++
[under development]
Application for crawling and analyzing textual content of websites.
#include "Config.hpp"
#include "Database.hpp"
#include "../Thread.hpp"
#include "../../Helper/CommaLocale.hpp"
#include "../../Helper/Container.hpp"
#include "../../Helper/DateTime.hpp"
#include "../../Helper/DotLocale.hpp"
#include "../../Helper/Strings.hpp"
#include "../../Helper/Utf8.hpp"
#include "../../Main/Exception.hpp"
#include "../../Network/Curl.hpp"
#include "../../Network/TorControl.hpp"
#include "../../Parsing/URI.hpp"
#include "../../Query/Container.hpp"
#include "../../Struct/CrawlStatsTick.hpp"
#include "../../Struct/CrawlTimersTick.hpp"
#include "../../Struct/CrawlTimersContent.hpp"
#include "../../Struct/NetworkSettings.hpp"
#include "../../Struct/QueryProperties.hpp"
#include "../../Struct/QueryStruct.hpp"
#include "../../Struct/ThreadOptions.hpp"
#include "../../Struct/ThreadStatus.hpp"
#include "../../Timer/Simple.hpp"
#include "../../Wrapper/DatabaseLock.hpp"
#include "../../Wrapper/DatabaseTryLock.hpp"

#include <curl/curl.h>

#include <algorithm>
#include <cctype>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <iomanip>
#include <ios>
#include <memory>
#include <queue>
#include <sstream>
#include <stdexcept>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

Go to the source code of this file.
Classes | |
| class | crawlservpp::Module::Crawler::Thread |
| Crawler thread. More... | |
| class | crawlservpp::Module::Crawler::Thread::Exception |
| Class for crawler exceptions. More... | |
Namespaces | |
| crawlservpp::Module::Crawler | |
| Namespace for crawler classes. | |
Constants | |
| constexpr auto | crawlservpp::Module::Crawler::robotsMinLineLength {9} |
| The minimum length of a robots.txt line containing a useful sitemap. More... | |
| constexpr auto | crawlservpp::Module::Crawler::robotsFirstLetters {7} |
| The first letters of a robots.txt line containing a sitemap. More... | |
| constexpr auto | crawlservpp::Module::Crawler::robotsSitemapBegin {"sitemap:"sv} |
| The beginning of a robots.txt line containing a sitemap. More... | |
| constexpr auto | crawlservpp::Module::Crawler::robotsRelativeUrl {"/robots.txt"sv} |
| The relative URL of robots.txt. More... | |
| constexpr auto | crawlservpp::Module::Crawler::updateCustomUrlCountEvery {100} |
| The number of custom URLs after which the thread status will be updated. More... | |
| constexpr auto | crawlservpp::Module::Crawler::httpResponseCodeMin {400} |
| Minimum HTTP error code. More... | |
| constexpr auto | crawlservpp::Module::Crawler::httpResponseCodeMax {599} |
| Maximum HTTP error code. More... | |
| constexpr auto | crawlservpp::Module::Crawler::httpResponseCodeIgnore {200} |
| HTTP response code to be ignored when checking for errors. More... | |
| constexpr auto | crawlservpp::Module::Crawler::wwwString {"www."sv} |
| The "www." in the beginning of a domain. More... | |
| constexpr auto | crawlservpp::Module::Crawler::httpsString {"https://"sv} |
| The beginning of a URL containing the HTTPS protocol. More... | |
| constexpr auto | crawlservpp::Module::Crawler::httpsIgnoreString {"https://www."sv} |
| The beginning of an HTTPS URL to be ignored. More... | |
| constexpr auto | crawlservpp::Module::Crawler::httpString {"http://"sv} |
| The beginning of a URL containing the HTTP protocol. More... | |
| constexpr auto | crawlservpp::Module::Crawler::httpIgnoreString {"http://www."sv} |
| The beginning of an HTTP URL to be ignored. More... | |
| constexpr auto | crawlservpp::Module::Crawler::archiveMementoContentType {"application/link-format"sv} |
| The content type of a memento. More... | |
| constexpr auto | crawlservpp::Module::Crawler::archiveRefString {"found capture at "sv} |
| The reference string in a memento referencing another memento. More... | |
| constexpr auto | crawlservpp::Module::Crawler::archiveRefTimeStampLength {14} |
| The length of a memento time stamp. More... | |
| constexpr auto | crawlservpp::Module::Crawler::archiveRenewUrlLockEveryMs {1000} |
| Number of milliseconds before renewing URL lock while crawling archives. More... | |