31 #ifndef MODULE_CRAWLER_THREAD_HPP_ 32 #define MODULE_CRAWLER_THREAD_HPP_ 37 #include "../Thread.hpp" 39 #include "../../Helper/CommaLocale.hpp" 40 #include "../../Helper/Container.hpp" 41 #include "../../Helper/DateTime.hpp" 42 #include "../../Helper/DotLocale.hpp" 43 #include "../../Helper/Strings.hpp" 44 #include "../../Helper/Utf8.hpp" 45 #include "../../Main/Exception.hpp" 46 #include "../../Network/Curl.hpp" 47 #include "../../Network/TorControl.hpp" 48 #include "../../Parsing/URI.hpp" 49 #include "../../Query/Container.hpp" 50 #include "../../Struct/CrawlStatsTick.hpp" 51 #include "../../Struct/CrawlTimersTick.hpp" 52 #include "../../Struct/CrawlTimersContent.hpp" 53 #include "../../Struct/NetworkSettings.hpp" 54 #include "../../Struct/QueryProperties.hpp" 55 #include "../../Struct/QueryStruct.hpp" 56 #include "../../Struct/ThreadOptions.hpp" 57 #include "../../Struct/ThreadStatus.hpp" 58 #include "../../Timer/Simple.hpp" 59 #include "../../Wrapper/DatabaseLock.hpp" 60 #include "../../Wrapper/DatabaseTryLock.hpp" 62 #include <curl/curl.h> 76 #include <string_view> 86 using std::string_view_literals::operator
""sv;
// Pair of a 64-bit unsigned integer and a string. Used below as the element
// type of customPages and as the URL argument of the crawling* functions
// (presumably: the URL's ID and the URL text — confirm against Thread.cpp).
173 using IdString = std::pair<std::uint64_t, std::string>;
// Pair of a steady-clock time point and a string. Used below as the element
// type of customTokens.
174 using TimeString = std::pair<std::chrono::steady_clock::time_point, std::string>;
182 std::string_view cookieDirectory,
190 std::string_view cookieDirectory,
// Data members of the crawler thread class. The enclosing class header is
// outside this excerpt and the listing omits lines between some members.
237 std::string timeStamp;
// NOTE(review): non-owning view stored as a member. The character buffer it
// refers to must outlive this object — confirm at the construction sites
// (the constructors take a std::string_view cookieDirectory).
241 const std::string_view cookieDir;
// Table names held as strings (presumably filled by setUpTableNames() — confirm).
244 std::string urlListTable;
245 std::string crawlingTable;
// Starts as false.
249 bool noSubDomain{
false};
// Exclusively owned Network::Curl instance; null until something assigns it
// (presumably setUpNetworkingArchives() — confirm).
251 std::unique_ptr<Network::Curl> networkingArchives;
// Query lists, one vector per category: black/white lists for content,
// content types, and URLs; the same set again with a "Links" prefix; plus
// tokens and redirect variables.
// NOTE(review): presumably populated by initQueries() and emptied by
// deleteQueries() — confirm in the implementation.
258 std::vector<QueryStruct> queriesBlackListContent;
259 std::vector<QueryStruct> queriesBlackListTypes;
260 std::vector<QueryStruct> queriesBlackListUrls;
261 std::vector<QueryStruct> queriesLinks;
262 std::vector<QueryStruct> queriesLinksBlackListContent;
263 std::vector<QueryStruct> queriesLinksBlackListTypes;
264 std::vector<QueryStruct> queriesLinksBlackListUrls;
265 std::vector<QueryStruct> queriesLinksWhiteListContent;
266 std::vector<QueryStruct> queriesLinksWhiteListTypes;
267 std::vector<QueryStruct> queriesLinksWhiteListUrls;
268 std::vector<QueryStruct> queriesWhiteListContent;
269 std::vector<QueryStruct> queriesWhiteListTypes;
270 std::vector<QueryStruct> queriesWhiteListUrls;
271 std::vector<QueryStruct> queriesTokens;
274 std::vector<QueryStruct> queriesRedirectVars;
// Custom pages as (integer, string) pairs and custom tokens as
// (time point, string) pairs; see the IdString / TimeString aliases.
279 std::vector<IdString> customPages;
280 std::vector<TimeString> customTokens;
// The integer members below are value-initialised to zero by their empty
// braces; the bool flags start as false.
283 std::uint64_t penultimateId{};
285 std::string lockTime;
287 std::size_t manualCounter{};
288 bool startCrawled{
false};
289 bool manualOff{
false};
290 std::string crawledContent;
291 std::uint64_t retryCounter{};
292 bool archiveRetry{
false};
// NOTE(review): presumably the memento cache that crawlingClearMementoCache()
// clears — confirm.
293 std::vector<std::string> mCache;
296 std::uint64_t tickCounter{};
// Steady-clock time points, value-initialised (i.e. set to the clock's epoch).
297 std::chrono::steady_clock::time_point startTime{};
298 std::chrono::steady_clock::time_point pauseTime{};
299 std::chrono::steady_clock::time_point idleTime{};
303 std::chrono::steady_clock::time_point httpTime{};
// Same type as std::chrono::steady_clock::time_point, only spelled out in full.
306 std::chrono::time_point<std::chrono::steady_clock> idleStart{};
307 std::uint64_t restore{};
// Set-up steps of the crawler thread, declared one per concern.
// NOTE(review): presumably called in sequence from onInit() — confirm in Thread.cpp.
// warningsTo is a non-const reference, so the callee is able to add to the
// caller's queue.
310 void setUpConfig(std::queue<std::string>& warningsTo);
313 void setUpContainer();
314 void setUpDatabase();
315 void setUpTableNames();
316 void setUpSqlStatements();
319 void setUpUriParser();
320 void setUpNetworking();
322 void setUpCustomUrls();
324 void setUpNetworkingArchives();
// Takes the queue by non-const reference, so it is able to modify it
// (presumably it drains the queue while logging — confirm).
327 void logWarnings(std::queue<std::string>& warnings);
// Initialisation helpers without parameters or return values; whatever they
// do, they can only act on member state or external resources.
328 void initCustomUrls();
329 void initRobotsTxt();
330 void initDoGlobalCounting(
331 std::vector<std::string>& urlList,
332 const std::string& variable,
333 const std::string& alias,
337 std::int64_t aliasAdd
339 std::vector<std::string> initDoLocalCounting(
340 const std::string& url,
341 const std::string& variable,
342 const std::string& alias,
346 std::int64_t aliasAdd
348 void initTokenCache();
351 void initQueries()
override;
352 void deleteQueries()
override;
353 void addOptionalQuery(std::uint64_t queryId,
QueryStruct& propertiesTo);
355 const std::vector<std::uint64_t>& queryIds,
356 std::vector<QueryStruct>& propertiesTo
359 std::string_view type,
360 const std::vector<std::string>& names,
361 const std::vector<std::uint64_t>& queryIds,
362 std::vector<QueryStruct>& propertiesTo
// URL selection. Every function that yields a URL writes it through the
// IdString& urlTo out-parameter; the entry point and the "Manual" variants
// additionally write a flag through bool& usePostTo.
// NOTE(review): the bool results presumably signal whether a URL was
// selected or a lock was obtained — confirm in Thread.cpp.
367 bool crawlingUrlSelection(IdString& urlTo,
bool& usePostTo);
368 void crawlingUrlSelectionManual(IdString& urlTo,
bool& usePostTo);
369 bool crawlingUrlSelectionAuto(IdString& urlTo);
370 void crawlingUrlSelectionManualRetry(IdString& urlTo,
bool& usePostTo);
371 void crawlingUrlSelectionManualNext(IdString& urlTo,
bool& usePostTo);
372 void crawlingUrlSelectionManualNextCustom(IdString& urlTo,
bool& usePostTo);
373 void crawlingUrlSelectionManualStartPage(IdString& urlTo);
// The two *Lock() functions and crawlingUrlSelectionAutoStart() take no
// arguments, so they can only operate on member state.
374 bool crawlingUrlSelectionManualLock();
375 void crawlingUrlSelectionAutoStart();
376 bool crawlingUrlSelectionAutoRetry(IdString& urlTo);
377 bool crawlingUrlSelectionAutoLoop(IdString& urlTo);
378 bool crawlingUrlSelectionAutoLock();
// Takes the URL and the timers by non-const reference, so both may be
// modified by the call; usePost is passed by value.
379 void crawlingUrl(IdString& url,
bool usePost,
CrawlTimersTick& timers);
// Returns a new IdString by value; the argument is const and stays untouched.
381 IdString crawlingReplaceTokens(
const IdString& url);
// Returns a string for the given index and name.
// NOTE(review): index presumably refers to a position in the token
// configuration/queries — confirm.
382 std::string crawlingGetTokenValue(std::size_t index,
const std::string& name);
// Operates on the URL string in place (non-const reference parameter).
383 void crawlingUrlParams(std::string& url);
384 bool crawlingContent(
386 const std::string& customCookies,
387 const std::vector<std::string>& customHeaders,
390 std::string& timerStrTo
392 void crawlingDynamicRedirectUrl(
394 std::string& customCookies,
395 std::vector<std::string>& customHeaders,
398 void crawlingDynamicRedirectUrlVars(
const std::string& oldUrl, std::string& strInOut);
399 bool crawlingDynamicRedirectContent(std::string& url, std::string& content);
400 void crawlingDynamicRedirectContentVars(
401 const std::string& oldUrl,
402 std::string& strInOut
404 bool crawlingCheckUrl(
const std::string& url,
const std::string& from);
405 bool crawlingCheckUrlForLinkExtraction(
const std::string& url);
406 bool crawlingCheckCurlCode(CURLcode curlCode,
const std::string& url);
407 bool crawlingCheckResponseCode(
const std::string& url, std::uint32_t responseCode);
408 bool crawlingCheckContentType(
const std::string& url,
const std::string& contentType);
409 bool crawlingCheckContentTypeForLinkExtraction(
410 const std::string& url,
411 const std::string& contentType
413 bool crawlingCheckContent(
const std::string& url);
414 bool crawlingCheckContentForLinkExtraction(
const std::string& url);
415 void crawlingSaveContent(
417 std::uint32_t response,
418 const std::string& type,
419 const std::string& content
421 std::vector<std::string> crawlingExtractUrls(
422 const std::string& url,
423 const std::string& type
425 void crawlingParseAndAddUrls(
426 const std::string& url,
427 std::vector<std::string>& urls,
428 std::size_t& newUrlsTo,
431 bool crawlingArchives(
const IdString& url,
CrawlStatsTick& statsTo,
bool crawlingFailed);
432 void crawlingArchivesDone(
437 const std::string& timerString
439 bool crawlingArchive(
440 std::size_t archiveIndex,
447 bool crawlingArchiveMementoPage(
448 std::size_t archiveIndex,
450 std::string& archivedUrl,
457 bool crawlingArchiveMemento(
458 std::size_t& counter,
460 std::size_t archiveIndex,
462 std::queue<Memento>& mementos,
463 std::string& content,
464 const std::string& statusMessage,
471 bool crawlingArchiveMementoEntry(
472 std::size_t archiveIndex,
475 std::string& timeStamp,
476 std::string& content,
480 bool crawlingArchiveMementoReference(
481 std::size_t archiveIndex,
484 std::string& timeStamp,
487 void crawlingArchiveMementoFetch(
490 std::string& content,
494 void crawlingSuccess(
const IdString& url);
495 void crawlingSkip(
const IdString& url,
bool unlockUrl);
496 void crawlingRetry(
const IdString& url,
bool archiveOnly);
497 void crawlingReset(std::string_view error, std::string_view url);
498 void crawlingResetArchive(
499 std::string_view error,
500 std::string_view url,
501 std::string_view archive
503 void crawlingResetTor();
504 void crawlingUnsetCustom(
bool unsetCookies,
bool unsetHeaders);
505 void crawlingClearMementoCache();
508 static std::string parseMementos(
509 std::string mementoContent,
510 std::queue<std::string>& warningsTo,
511 std::queue<Memento>& mementosTo
Class for TOR control exceptions.
Definition: TorControl.hpp:129
Query properties containing its name, text, type, and result type(s).
Definition: QueryProperties.hpp:39
constexpr auto httpsIgnoreString
The beginning of an HTTPS URL to be ignored.
Definition: Thread.hpp:122
const NetworkSettings networkOptions
Network settings for the crawler thread.
Definition: Thread.hpp:212
Class for query container exceptions.
Definition: Container.hpp:148
void onClear() override
Clears the crawler.
Definition: Thread.cpp:219
Query container.
Definition: Container.hpp:76
Network settings containing the default proxy as well as host, port, and password of the TOR control ...
Definition: NetworkSettings.hpp:49
constexpr auto httpResponseCodeMax
Maximum HTTP error code.
Definition: Thread.hpp:110
Thread status containing its ID, status message, pause state, and progress.
Definition: ThreadStatus.hpp:54
void onReset() override
Resets the crawler.
Definition: Thread.cpp:265
void onPause() override
Pauses the crawler.
Definition: Thread.cpp:197
constexpr auto robotsMinLineLength
The minimum length of a robots.txt line containing a useful sitemap.
Definition: Thread.hpp:92
#define MAIN_EXCEPTION_CLASS()
Macro used to easily define classes for general exceptions.
Definition: Exception.hpp:50
constexpr auto updateCustomUrlCountEvery
The number of custom URLs after which the thread status will be updated.
Definition: Thread.hpp:104
Namespace for crawler classes.
Definition: Config.hpp:44
Thread options containing the name of the module run, as well as the IDs of the website, URL list, and configuration used.
Definition: ThreadOptions.hpp:40
Abstract class providing module-independent thread functionality.
Definition: Thread.hpp:93
constexpr auto httpResponseCodeMin
Minimum HTTP error code.
Definition: Thread.hpp:107
constexpr auto archiveRefTimeStampLength
The length of a memento time stamp.
Definition: Thread.hpp:137
Class handling database access for the command-and-control and its threads.
Definition: Database.hpp:366
constexpr auto httpIgnoreString
The beginning of an HTTP URL to be ignored.
Definition: Thread.hpp:128
void onTick() override
Performs a crawler tick.
Definition: Thread.cpp:166
Class for URI exceptions.
Definition: URI.hpp:130
constexpr auto wwwString
The "www." at the beginning of a domain.
Definition: Thread.hpp:116
Timers for crawling tick.
Definition: CrawlTimersTick.hpp:38
Database database
Database connection for the crawler thread.
Definition: Thread.hpp:205
Class for UTF-8 exceptions.
Definition: Utf8.hpp:122
constexpr auto httpsString
The beginning of a URL containing the HTTPS protocol.
Definition: Thread.hpp:119
constexpr auto robotsRelativeUrl
The relative URL of robots.txt.
Definition: Thread.hpp:101
void onUnpause() override
Unpauses the crawler.
Definition: Thread.cpp:206
constexpr auto httpString
The beginning of a URL containing the HTTP protocol.
Definition: Thread.hpp:125
constexpr auto archiveMementoContentType
The content type of a memento.
Definition: Thread.hpp:131
A simple timer.
Definition: Simple.hpp:53
Class for libcurl exceptions.
Definition: Curl.hpp:260
Network::Curl networking
Networking for the crawler thread.
Definition: Thread.hpp:215
Controls a TOR service via a TOR control server/port, if available.
Definition: TorControl.hpp:81
Template class for safe in-scope database locks.
Definition: DatabaseTryLock.hpp:51
Statistics for crawling tick.
Definition: CrawlStatsTick.hpp:38
Class providing database functionality for crawler threads by implementing Wrapper::Database.
Definition: Database.hpp:119
constexpr auto httpResponseCodeIgnore
HTTP response code to be ignored when checking for errors.
Definition: Thread.hpp:113
Provides an interface to the libcurl library for sending and receiving data over the network...
Definition: Curl.hpp:168
Structure to identify a query including its type and result type(s).
Definition: QueryStruct.hpp:40
Network::TorControl torControl
TOR control for the crawler thread.
Definition: Thread.hpp:218
Crawler thread.
Definition: Thread.hpp:149
constexpr auto robotsFirstLetters
The first letters of a robots.txt line containing a sitemap.
Definition: Thread.hpp:95
void onInit() override
Initializes the crawler.
Definition: Thread.cpp:115
Thread(Main::Database &dbBase, std::string_view cookieDirectory, const ThreadOptions &threadOptions, const NetworkSettings &networkSettings, const ThreadStatus &threadStatus)
Constructor initializing a previously interrupted crawler thread.
Definition: Thread.cpp:54
Configuration for crawlers.
Definition: Config.hpp:111
constexpr auto archiveRefString
The reference string in a memento referencing another memento.
Definition: Thread.hpp:134
Timers for crawling content.
Definition: CrawlTimersContent.hpp:38
constexpr auto robotsSitemapBegin
The beginning of a robots.txt line containing a sitemap.
Definition: Thread.hpp:98
Class for date/time exceptions.
Definition: DateTime.hpp:330
Parser for RFC 3986 URIs that can also analyze their relationships with each other.
Definition: URI.hpp:75
Template class for safe in-scope database locks.
Definition: DatabaseLock.hpp:54
constexpr auto archiveRenewUrlLockEveryMs
Number of milliseconds before renewing URL lock while crawling archives.
Definition: Thread.hpp:140
void end()
Waits for the thread until shutdown is completed.
Definition: Thread.cpp:390