crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
Thread.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2023 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * Thread.hpp
24  *
25  * Implementation of the Thread interface for crawler threads.
26  *
27  * Created on: Oct 11, 2018
28  * Author: ans
29  */
30 
31 #ifndef MODULE_CRAWLER_THREAD_HPP_
32 #define MODULE_CRAWLER_THREAD_HPP_
33 
34 #include "Config.hpp"
35 #include "Database.hpp"
36 
37 #include "../Thread.hpp"
38 
39 #include "../../Helper/CommaLocale.hpp"
40 #include "../../Helper/Container.hpp"
41 #include "../../Helper/DateTime.hpp"
42 #include "../../Helper/DotLocale.hpp"
43 #include "../../Helper/Strings.hpp"
44 #include "../../Helper/Utf8.hpp"
45 #include "../../Main/Exception.hpp"
46 #include "../../Network/Curl.hpp"
47 #include "../../Network/TorControl.hpp"
48 #include "../../Parsing/URI.hpp"
49 #include "../../Query/Container.hpp"
50 #include "../../Struct/CrawlStatsTick.hpp"
51 #include "../../Struct/CrawlTimersTick.hpp"
52 #include "../../Struct/CrawlTimersContent.hpp"
53 #include "../../Struct/NetworkSettings.hpp"
54 #include "../../Struct/QueryProperties.hpp"
55 #include "../../Struct/QueryStruct.hpp"
56 #include "../../Struct/ThreadOptions.hpp"
57 #include "../../Struct/ThreadStatus.hpp"
58 #include "../../Timer/Simple.hpp"
59 #include "../../Wrapper/DatabaseLock.hpp"
60 #include "../../Wrapper/DatabaseTryLock.hpp"
61 
62 #include <curl/curl.h>
63 
64 #include <algorithm> // std::count, std::find_if, std::min, std::remove_if, std::transform
65 #include <cctype> // std::tolower
66 #include <chrono> // std::chrono
67 #include <cstddef> // std::size_t
68 #include <cstdint> // std::int64_t, std::uint32_t, std::uint64_t
69 #include <iomanip> // std::setprecision
70 #include <ios> // std::fixed
71 #include <memory> // std::make_unique, std::unique_ptr
72 #include <queue> // std::queue
73 #include <sstream> // std::istringstream, std::ostringstream
74 #include <stdexcept> // std::logic_error
75 #include <string> // std::getline, std::stoul, std::string, std::to_string
76 #include <string_view> // std::string_view, std::string_view_literals
77 #include <utility> // std::pair
78 #include <vector> // std::vector
79 
81 
82  /*
83  * CONSTANTS
84  */
85 
86  using std::string_view_literals::operator""sv; // brings the ""sv literal suffix into scope for the std::string_view constants below
87 
90 
92  inline constexpr auto robotsMinLineLength{9}; // the minimum length of a robots.txt line containing a useful sitemap
93 
95  inline constexpr auto robotsFirstLetters{7}; // the first letters of a robots.txt line containing a sitemap (presumably the number of letters checked - confirm in Thread.cpp)
96 
98  inline constexpr auto robotsSitemapBegin{"sitemap:"sv}; // the beginning of a robots.txt line containing a sitemap
99 
101  inline constexpr auto robotsRelativeUrl{"/robots.txt"sv}; // the relative URL of robots.txt
102 
104  inline constexpr auto updateCustomUrlCountEvery{100}; // the number of custom URLs after which the thread status will be updated
105 
107  inline constexpr auto httpResponseCodeMin{400}; // minimum HTTP error code
108 
110  inline constexpr auto httpResponseCodeMax{599}; // maximum HTTP error code
111 
113  inline constexpr auto httpResponseCodeIgnore{200}; // HTTP response code to be ignored when checking for errors
114 
116  inline constexpr auto wwwString{"www."sv}; // the "www." in the beginning of a domain
117 
119  inline constexpr auto httpsString{"https://"sv}; // the beginning of a URL containing the HTTPS protocol
120 
122  inline constexpr auto httpsIgnoreString{"https://www."sv}; // the beginning of a HTTPS URL to be ignored
123 
125  inline constexpr auto httpString{"http://"sv}; // the beginning of a URL containing the HTTP protocol
126 
128  inline constexpr auto httpIgnoreString{"http://www."sv}; // the beginning of a HTTP URL to be ignored
129 
131  inline constexpr auto archiveMementoContentType{"application/link-format"sv}; // the content type of a memento
132 
134  inline constexpr auto archiveRefString{"found capture at "sv}; // the reference string in a memento referencing another memento
135 
137  inline constexpr auto archiveRefTimeStampLength{14}; // the length of a memento time stamp
138 
140  inline constexpr auto archiveRenewUrlLockEveryMs{1000}; // number of milliseconds before renewing URL lock while crawling archives
141 
143 
144  /*
145  * DECLARATION
146  */
147 
// Crawler thread. The class is final and derives publicly from Module::Thread and
//  Query::Container, and privately from Config.
// NOTE(review): this listing carries embedded line numbers; gaps in the numbering
//  (e.g. 150 -> 153 or 200 -> 203) mark lines that are not part of the listing.
//  Consult the original header for the complete declaration.
149  class Thread final : public Module::Thread, public Query::Container, private Config {
150  // for convenience
153 
156 
158 
160 
169 
172 
173  using IdString = std::pair<std::uint64_t, std::string>; // pair of a 64-bit unsigned ID and a string
174  using TimeString = std::pair<std::chrono::steady_clock::time_point, std::string>; // pair of a steady-clock time point and a string
175 
176  public:
179 
180  Thread( // constructor initializing a previously interrupted crawler thread (takes an additional ThreadStatus)
181  Main::Database& dbBase,
182  std::string_view cookieDirectory,
183  const ThreadOptions& threadOptions,
184  const NetworkSettings& networkSettings,
185  const ThreadStatus& threadStatus
186  );
187 
188  Thread( // overload without a ThreadStatus - presumably creates a new crawler thread; confirm in Thread.cpp
189  Main::Database& dbBase,
190  std::string_view cookieDirectory,
191  const ThreadOptions& threadOptions,
192  const NetworkSettings& networkSettings
193  );
194 
196 
199 
200  protected:
// NOTE(review): the generated documentation refers to members defined in the following
//  gap of the listing: database (line 205), networkOptions (line 212),
//  networking (line 215) and torControl (line 218).
203 
206 
210 
213 
216 
219 
223 
224  void onInit() override; // initializes the crawler
225  void onTick() override; // performs a crawler tick
226  void onPause() override; // pauses the crawler
227  void onUnpause() override; // unpauses the crawler
228  void onClear() override; // clears the crawler
229  void onReset() override; // resets the crawler
230 
232 
233  private:
234  // structure for mementos (archived versions of websites)
235  struct Memento {
236  std::string url; // URL string of the memento
237  std::string timeStamp; // time stamp string of the memento (format not visible here)
238  };
239 
240  // constant for cookie directory
// NOTE(review): std::string_view does not own its characters. If this member is
//  initialized from the constructor's cookieDirectory argument (presumably - confirm
//  in Thread.cpp), the referenced string must outlive this object.
241  const std::string_view cookieDir;
242 
243  // table names for locking them
244  std::string urlListTable;
245  std::string crawlingTable;
246 
247  // domain, URI parser and (optional) separate networking for archives
248  std::string domain;
249  bool noSubDomain{false};
250  Parsing::URI uriParser;
251  std::unique_ptr<Network::Curl> networkingArchives; // owning pointer; null unless separate networking for archives is set up
252 
253  // queries
254  /*
255  * make sure to initialize AND delete them!
256  * -> initQueries(), deleteQueries()
257  */
258  std::vector<QueryStruct> queriesBlackListContent;
259  std::vector<QueryStruct> queriesBlackListTypes;
260  std::vector<QueryStruct> queriesBlackListUrls;
261  std::vector<QueryStruct> queriesLinks;
262  std::vector<QueryStruct> queriesLinksBlackListContent;
263  std::vector<QueryStruct> queriesLinksBlackListTypes;
264  std::vector<QueryStruct> queriesLinksBlackListUrls;
265  std::vector<QueryStruct> queriesLinksWhiteListContent;
266  std::vector<QueryStruct> queriesLinksWhiteListTypes;
267  std::vector<QueryStruct> queriesLinksWhiteListUrls;
268  std::vector<QueryStruct> queriesWhiteListContent;
269  std::vector<QueryStruct> queriesWhiteListTypes;
270  std::vector<QueryStruct> queriesWhiteListUrls;
271  std::vector<QueryStruct> queriesTokens;
272  QueryStruct queryRedirectContent;
273  QueryStruct queryRedirectUrl;
274  std::vector<QueryStruct> queriesRedirectVars;
275  QueryStruct queryExpected;
276 
277  // custom URLs
278  IdString startPage;
279  std::vector<IdString> customPages;
280  std::vector<TimeString> customTokens;
281 
282  // crawling state
283  std::uint64_t penultimateId{}; // penultimate ID (last ID saved in parent class)
284  IdString nextUrl; // next URL (currently crawled URL in automatic mode)
285  std::string lockTime; // last locking time for currently crawled URL
286  IdString manualUrl; // custom URL to be retried
287  std::size_t manualCounter{}; // number of crawled custom URLs
288  bool startCrawled{false}; // start page has been successfully crawled
289  bool manualOff{false}; // manual mode has been turned off (after first URL is crawled)
290  std::string crawledContent; // crawled content
291  std::uint64_t retryCounter{}; // number of retries so far
292  bool archiveRetry{false}; // only archive needs to be retried
293  std::vector<std::string> mCache;// cache of processed mementos (to be skipped on retry)
294 
295  // timing
296  std::uint64_t tickCounter{};
297  std::chrono::steady_clock::time_point startTime{};
298  std::chrono::steady_clock::time_point pauseTime{};
299  std::chrono::steady_clock::time_point idleTime{};
300  /*
301  * time of last HTTP request – only used when HTTP sleep is enabled
302  */
303  std::chrono::steady_clock::time_point httpTime{};
304 
305  // restart timer and URL to restore after re-crawling the manual URLs
306  std::chrono::time_point<std::chrono::steady_clock> idleStart{};
307  std::uint64_t restore{};
308 
309  // initializing functions
310  void setUpConfig(std::queue<std::string>& warningsTo); // receives a queue of strings by non-const reference (presumably filled with warnings - confirm)
311  void checkQuery();
312  void setUpLogging();
313  void setUpContainer();
314  void setUpDatabase();
315  void setUpTableNames();
316  void setUpSqlStatements();
317  void checkUrlList();
318  void setUpDomain();
319  void setUpUriParser();
320  void setUpNetworking();
321  void setUpTor();
322  void setUpCustomUrls();
323  void setUpQueries();
324  void setUpNetworkingArchives();
325  void setUpTimers();
326  void ready();
327  void logWarnings(std::queue<std::string>& warnings); // takes the queue by non-const reference (presumably drains it while logging - confirm)
328  void initCustomUrls();
329  void initRobotsTxt();
330  void initDoGlobalCounting( // operates on a whole URL list passed by non-const reference; counter range given as signed 64-bit start/end/step
331  std::vector<std::string>& urlList,
332  const std::string& variable,
333  const std::string& alias,
334  std::int64_t start,
335  std::int64_t end,
336  std::int64_t step,
337  std::int64_t aliasAdd
338  );
339  std::vector<std::string> initDoLocalCounting( // same parameters for a single URL; returns a vector of strings by value
340  const std::string& url,
341  const std::string& variable,
342  const std::string& alias,
343  std::int64_t start,
344  std::int64_t end,
345  std::int64_t step,
346  std::int64_t aliasAdd
347  );
348  void initTokenCache();
349 
350  // query functions
351  void initQueries() override; // overrides a base-class virtual; see the note on the query members about initializing them
352  void deleteQueries() override; // overrides a base-class virtual; see the note on the query members about deleting them
353  void addOptionalQuery(std::uint64_t queryId, QueryStruct& propertiesTo); // result is passed back through the QueryStruct reference
354  void addQueries( // takes several query IDs and a vector of QueryStruct by non-const reference
355  const std::vector<std::uint64_t>& queryIds,
356  std::vector<QueryStruct>& propertiesTo
357  );
358  void addQueriesTo( // like addQueries, with an additional type and a list of names (semantics not visible here)
359  std::string_view type,
360  const std::vector<std::string>& names,
361  const std::vector<std::uint64_t>& queryIds,
362  std::vector<QueryStruct>& propertiesTo
363  );
364 
365  // crawling functions
366  void crawlingJump();
367  bool crawlingUrlSelection(IdString& urlTo, bool& usePostTo); // URL and POST flag are passed back via references; meaning of the bool result is defined in Thread.cpp
368  void crawlingUrlSelectionManual(IdString& urlTo, bool& usePostTo);
369  bool crawlingUrlSelectionAuto(IdString& urlTo);
370  void crawlingUrlSelectionManualRetry(IdString& urlTo, bool& usePostTo);
371  void crawlingUrlSelectionManualNext(IdString& urlTo, bool& usePostTo);
372  void crawlingUrlSelectionManualNextCustom(IdString& urlTo, bool& usePostTo);
373  void crawlingUrlSelectionManualStartPage(IdString& urlTo);
374  bool crawlingUrlSelectionManualLock();
375  void crawlingUrlSelectionAutoStart();
376  bool crawlingUrlSelectionAutoRetry(IdString& urlTo);
377  bool crawlingUrlSelectionAutoLoop(IdString& urlTo);
378  bool crawlingUrlSelectionAutoLock();
379  void crawlingUrl(IdString& url, bool usePost, CrawlTimersTick& timers); // URL and tick timers are taken by non-const reference
380  void crawlingWait();
381  IdString crawlingReplaceTokens(const IdString& url); // returns a new IdString by value; the argument is not modified
382  std::string crawlingGetTokenValue(std::size_t index, const std::string& name);
383  void crawlingUrlParams(std::string& url); // modifies the URL string in place (non-const reference)
384  bool crawlingContent(
385  IdString& url,
386  const std::string& customCookies,
387  const std::vector<std::string>& customHeaders,
388  bool usePost,
389  CrawlStatsTick& statsTo,
390  std::string& timerStrTo
391  );
392  void crawlingDynamicRedirectUrl( // all four arguments are non-const references and may be changed by the call
393  std::string& url,
394  std::string& customCookies,
395  std::vector<std::string>& customHeaders,
396  bool& usePost
397  );
398  void crawlingDynamicRedirectUrlVars(const std::string& oldUrl, std::string& strInOut);
399  bool crawlingDynamicRedirectContent(std::string& url, std::string& content);
400  void crawlingDynamicRedirectContentVars(
401  const std::string& oldUrl,
402  std::string& strInOut
403  );
404  bool crawlingCheckUrl(const std::string& url, const std::string& from);
405  bool crawlingCheckUrlForLinkExtraction(const std::string& url);
406  bool crawlingCheckCurlCode(CURLcode curlCode, const std::string& url); // CURLcode is the libcurl result type from <curl/curl.h>
407  bool crawlingCheckResponseCode(const std::string& url, std::uint32_t responseCode);
408  bool crawlingCheckContentType(const std::string& url, const std::string& contentType);
409  bool crawlingCheckContentTypeForLinkExtraction(
410  const std::string& url,
411  const std::string& contentType
412  );
413  bool crawlingCheckContent(const std::string& url);
414  bool crawlingCheckContentForLinkExtraction(const std::string& url);
415  void crawlingSaveContent(
416  const IdString& url,
417  std::uint32_t response,
418  const std::string& type,
419  const std::string& content
420  );
421  std::vector<std::string> crawlingExtractUrls( // returns a vector of strings by value
422  const std::string& url,
423  const std::string& type
424  );
425  void crawlingParseAndAddUrls( // URL vector and new-URL counter are non-const references
426  const std::string& url,
427  std::vector<std::string>& urls,
428  std::size_t& newUrlsTo,
429  bool archived
430  );
431  bool crawlingArchives(const IdString& url, CrawlStatsTick& statsTo, bool crawlingFailed);
432  void crawlingArchivesDone(
433  const IdString& url,
434  CrawlTimersTick& timers,
435  const CrawlStatsTick& stats,
436  bool crawlingFailed,
437  const std::string& timerString
438  );
439  bool crawlingArchive( // handles one archive selected by index; success flag and timer are non-const references
440  std::size_t archiveIndex,
441  const IdString& url,
442  bool crawlingFailed,
443  CrawlStatsTick& statsTo,
444  bool& successTo,
445  Timer::Simple& timer
446  );
447  bool crawlingArchiveMementoPage(
448  std::size_t archiveIndex,
449  const IdString& url,
450  std::string& archivedUrl,
451  bool crawlingFailed,
452  CrawlStatsTick& statsTo,
453  bool& successTo,
454  bool& fatalTo,
455  Timer::Simple& timer
456  );
457  bool crawlingArchiveMemento( // works on a queue of Memento entries; counter and the three flags are non-const references
458  std::size_t& counter,
459  std::size_t total,
460  std::size_t archiveIndex,
461  const IdString& url,
462  std::queue<Memento>& mementos,
463  std::string& content,
464  const std::string& statusMessage,
465  CrawlStatsTick& statsTo,
466  bool& successTo,
467  bool& skipTo,
468  bool& fatalTo,
469  Timer::Simple& timer
470  );
471  bool crawlingArchiveMementoEntry(
472  std::size_t archiveIndex,
473  const IdString& url,
474  Memento& memento,
475  std::string& timeStamp,
476  std::string& content,
477  CrawlStatsTick& statsTo,
478  bool& fatalTo
479  );
480  bool crawlingArchiveMementoReference(
481  std::size_t archiveIndex,
482  const IdString& url,
483  Memento& memento,
484  std::string& timeStamp,
485  std::string& content
486  );
487  void crawlingArchiveMementoFetch(
488  const IdString& url,
489  Memento& memento,
490  std::string& content,
491  CrawlStatsTick& statsTo
492  );
493 
494  void crawlingSuccess(const IdString& url);
495  void crawlingSkip(const IdString& url, bool unlockUrl);
496  void crawlingRetry(const IdString& url, bool archiveOnly);
497  void crawlingReset(std::string_view error, std::string_view url);
498  void crawlingResetArchive(
499  std::string_view error,
500  std::string_view url,
501  std::string_view archive
502  );
503  void crawlingResetTor();
504  void crawlingUnsetCustom(bool unsetCookies, bool unsetHeaders);
505  void crawlingClearMementoCache();
506 
507  // internal static helper function
508  static std::string parseMementos( // static: needs no Thread instance; content is taken by value, both queues by non-const reference
509  std::string mementoContent,
510  std::queue<std::string>& warningsTo,
511  std::queue<Memento>& mementosTo
512  );
513 
514  // shadow functions not to be used by the thread
// (declared private here; presumably they hide the same-named functions
//  of Module::Thread - confirm against the base class)
515  void pause();
516  void start();
517  void unpause();
518  void stop();
519  void interrupt();
520  };
521 
522 } /* namespace crawlservpp::Module::Crawler */
523 
524 #endif /* MODULE_CRAWLER_THREAD_HPP_ */
Class for TOR control exceptions.
Definition: TorControl.hpp:129
Query properties containing its name, text, type, and result type(s).
Definition: QueryProperties.hpp:39
constexpr auto httpsIgnoreString
The beginning of a HTTPS URL to be ignored.
Definition: Thread.hpp:122
const NetworkSettings networkOptions
Network settings for the crawler thread.
Definition: Thread.hpp:212
Class for query container exceptions.
Definition: Container.hpp:148
void onClear() override
Clears the crawler.
Definition: Thread.cpp:219
Query container.
Definition: Container.hpp:76
Network settings containing the default proxy as well as host, port, and password of the TOR control server.
Definition: NetworkSettings.hpp:49
constexpr auto httpResponseCodeMax
Maximum HTTP error code.
Definition: Thread.hpp:110
Thread status containing its ID, status message, pause state, and progress.
Definition: ThreadStatus.hpp:54
void onReset() override
Resets the crawler.
Definition: Thread.cpp:265
void onPause() override
Pauses the crawler.
Definition: Thread.cpp:197
constexpr auto robotsMinLineLength
The minimum length of a robots.txt line containing a useful sitemap.
Definition: Thread.hpp:92
#define MAIN_EXCEPTION_CLASS()
Macro used to easily define classes for general exceptions.
Definition: Exception.hpp:50
constexpr auto updateCustomUrlCountEvery
The number of custom URLs after which the thread status will be updated.
Definition: Thread.hpp:104
Namespace for crawler classes.
Definition: Config.hpp:44
Thread options containing the name of the module run, as well as the IDs of the website, URL list, and configuration used.
Definition: ThreadOptions.hpp:40
Abstract class providing module-independent thread functionality.
Definition: Thread.hpp:93
constexpr auto httpResponseCodeMin
Minimum HTTP error code.
Definition: Thread.hpp:107
constexpr auto archiveRefTimeStampLength
The length of a memento time stamp.
Definition: Thread.hpp:137
Class handling database access for the command-and-control and its threads.
Definition: Database.hpp:366
constexpr auto httpIgnoreString
The beginning of a HTTP URL to be ignored.
Definition: Thread.hpp:128
void onTick() override
Performs a crawler tick.
Definition: Thread.cpp:166
Class for URI exceptions.
Definition: URI.hpp:130
constexpr auto wwwString
The "www." in the beginning of a domain.
Definition: Thread.hpp:116
Timers for crawling tick.
Definition: CrawlTimersTick.hpp:38
Database database
Database connection for the crawler thread.
Definition: Thread.hpp:205
Class for UTF-8 exceptions.
Definition: Utf8.hpp:122
constexpr auto httpsString
The beginning of a URL containing the HTTPS protocol.
Definition: Thread.hpp:119
constexpr auto robotsRelativeUrl
The relative URL of robots.txt.
Definition: Thread.hpp:101
void onUnpause() override
Unpauses the crawler.
Definition: Thread.cpp:206
constexpr auto httpString
The beginning of a URL containing the HTTP protocol.
Definition: Thread.hpp:125
constexpr auto archiveMementoContentType
The content type of a memento.
Definition: Thread.hpp:131
A simple timer.
Definition: Simple.hpp:53
Class for libcurl exceptions.
Definition: Curl.hpp:260
Network::Curl networking
Networking for the crawler thread.
Definition: Thread.hpp:215
Controls a TOR service via a TOR control server/port, if available.
Definition: TorControl.hpp:81
Template class for safe in-scope database locks.
Definition: DatabaseTryLock.hpp:51
Statistics for crawling tick.
Definition: CrawlStatsTick.hpp:38
Class providing database functionality for crawler threads by implementing Wrapper::Database.
Definition: Database.hpp:119
constexpr auto httpResponseCodeIgnore
HTTP response code to be ignored when checking for errors.
Definition: Thread.hpp:113
Provides an interface to the libcurl library for sending and receiving data over the network...
Definition: Curl.hpp:168
Structure to identify a query including its type and result type(s).
Definition: QueryStruct.hpp:40
Network::TorControl torControl
TOR control for the crawler thread.
Definition: Thread.hpp:218
Crawler thread.
Definition: Thread.hpp:149
constexpr auto robotsFirstLetters
The first letters of a robots.txt line containing a sitemap.
Definition: Thread.hpp:95
void onInit() override
Initializes the crawler.
Definition: Thread.cpp:115
Thread(Main::Database &dbBase, std::string_view cookieDirectory, const ThreadOptions &threadOptions, const NetworkSettings &networkSettings, const ThreadStatus &threadStatus)
Constructor initializing a previously interrupted crawler thread.
Definition: Thread.cpp:54
Configuration for crawlers.
Definition: Config.hpp:111
constexpr auto archiveRefString
The reference string in a memento referencing another memento.
Definition: Thread.hpp:134
Timers for crawling content.
Definition: CrawlTimersContent.hpp:38
constexpr auto robotsSitemapBegin
The beginning of a robots.txt line containing a sitemap.
Definition: Thread.hpp:98
Class for date/time exceptions.
Definition: DateTime.hpp:330
Parser for RFC 3986 URIs that can also analyze their relationships with each other.
Definition: URI.hpp:75
Template class for safe in-scope database locks.
Definition: DatabaseLock.hpp:54
constexpr auto archiveRenewUrlLockEveryMs
Number of milliseconds before renewing URL lock while crawling archives.
Definition: Thread.hpp:140
void end()
Waits for the thread until shutdown is completed.
Definition: Thread.cpp:390