crawlserv++
[under development]
Application for crawling and analyzing textual content of websites.
|
#include "Config.hpp"
#include "Database.hpp"
#include "../Thread.hpp"
#include "../../Helper/CommaLocale.hpp"
#include "../../Helper/Container.hpp"
#include "../../Helper/DateTime.hpp"
#include "../../Helper/DotLocale.hpp"
#include "../../Helper/Strings.hpp"
#include "../../Helper/Utf8.hpp"
#include "../../Main/Exception.hpp"
#include "../../Network/Curl.hpp"
#include "../../Network/TorControl.hpp"
#include "../../Parsing/URI.hpp"
#include "../../Query/Container.hpp"
#include "../../Struct/CrawlStatsTick.hpp"
#include "../../Struct/CrawlTimersTick.hpp"
#include "../../Struct/CrawlTimersContent.hpp"
#include "../../Struct/NetworkSettings.hpp"
#include "../../Struct/QueryProperties.hpp"
#include "../../Struct/QueryStruct.hpp"
#include "../../Struct/ThreadOptions.hpp"
#include "../../Struct/ThreadStatus.hpp"
#include "../../Timer/Simple.hpp"
#include "../../Wrapper/DatabaseLock.hpp"
#include "../../Wrapper/DatabaseTryLock.hpp"
#include <curl/curl.h>
#include <algorithm>
#include <cctype>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <iomanip>
#include <ios>
#include <memory>
#include <queue>
#include <sstream>
#include <stdexcept>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
Go to the source code of this file.
Classes | |
class | crawlservpp::Module::Crawler::Thread |
Crawler thread. More... | |
class | crawlservpp::Module::Crawler::Thread::Exception |
Class for crawler exceptions. More... | |
Namespaces | |
crawlservpp::Module::Crawler | |
Namespace for crawler classes. | |
Constants | |
constexpr auto | crawlservpp::Module::Crawler::robotsMinLineLength {9} |
The minimum length of a robots.txt line containing a useful sitemap. More... | |
constexpr auto | crawlservpp::Module::Crawler::robotsFirstLetters {7} |
The number of first letters of a robots.txt line containing a sitemap. More... | |
constexpr auto | crawlservpp::Module::Crawler::robotsSitemapBegin {"sitemap:"sv} |
The beginning of a robots.txt line containing a sitemap. More... | |
constexpr auto | crawlservpp::Module::Crawler::robotsRelativeUrl {"/robots.txt"sv} |
The relative URL of robots.txt. More... | |
constexpr auto | crawlservpp::Module::Crawler::updateCustomUrlCountEvery {100} |
The number of custom URLs after which the thread status will be updated. More... | |
constexpr auto | crawlservpp::Module::Crawler::httpResponseCodeMin {400} |
Minimum HTTP error code. More... | |
constexpr auto | crawlservpp::Module::Crawler::httpResponseCodeMax {599} |
Maximum HTTP error code. More... | |
constexpr auto | crawlservpp::Module::Crawler::httpResponseCodeIgnore {200} |
HTTP response code to be ignored when checking for errors. More... | |
constexpr auto | crawlservpp::Module::Crawler::wwwString {"www."sv} |
The "www." at the beginning of a domain. More... | |
constexpr auto | crawlservpp::Module::Crawler::httpsString {"https://"sv} |
The beginning of a URL containing the HTTPS protocol. More... | |
constexpr auto | crawlservpp::Module::Crawler::httpsIgnoreString {"https://www."sv} |
The beginning of an HTTPS URL to be ignored. More... | |
constexpr auto | crawlservpp::Module::Crawler::httpString {"http://"sv} |
The beginning of a URL containing the HTTP protocol. More... | |
constexpr auto | crawlservpp::Module::Crawler::httpIgnoreString {"http://www."sv} |
The beginning of an HTTP URL to be ignored. More... | |
constexpr auto | crawlservpp::Module::Crawler::archiveMementoContentType {"application/link-format"sv} |
The content type of a memento. More... | |
constexpr auto | crawlservpp::Module::Crawler::archiveRefString {"found capture at "sv} |
The reference string in a memento referencing another memento. More... | |
constexpr auto | crawlservpp::Module::Crawler::archiveRefTimeStampLength {14} |
The length of a memento time stamp. More... | |
constexpr auto | crawlservpp::Module::Crawler::archiveRenewUrlLockEveryMs {1000} |
Number of milliseconds before renewing the URL lock while crawling archives. More... | |