crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
Thread.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2022 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * Thread.hpp
24  *
25  * Implementation of the Thread interface for extractor threads.
26  *
27  * Created on: May 9, 2019
28  * Author: ans
29  */
30 
31 #ifndef MODULE_EXTRACTOR_THREAD_HPP_
32 #define MODULE_EXTRACTOR_THREAD_HPP_
33 
34 #include "Config.hpp"
35 #include "Database.hpp"
36 
37 #include "../Thread.hpp"
38 
39 #include "../../Helper/CommaLocale.hpp"
40 #include "../../Helper/DateTime.hpp"
41 #include "../../Helper/DotLocale.hpp"
42 #include "../../Helper/Strings.hpp"
43 #include "../../Helper/Utf8.hpp"
44 #include "../../Main/Exception.hpp"
45 #include "../../Network/Curl.hpp"
46 #include "../../Network/TorControl.hpp"
47 #include "../../Query/Container.hpp"
48 #include "../../Struct/DataEntry.hpp"
49 #include "../../Struct/NetworkSettings.hpp"
50 #include "../../Struct/QueryProperties.hpp"
51 #include "../../Struct/QueryStruct.hpp"
52 #include "../../Struct/StatusSetter.hpp"
53 #include "../../Struct/ThreadOptions.hpp"
54 #include "../../Struct/ThreadStatus.hpp"
55 #include "../../Timer/Simple.hpp"
56 
57 #include "../../_extern/jsoncons/include/jsoncons/json.hpp"
58 #include "../../_extern/jsoncons/include/jsoncons_ext/jsonpath/json_query.hpp"
59 #include "../../_extern/rapidjson/include/rapidjson/document.h"
60 
61 #include <algorithm> // std::all_of, std::any_of, std::count_if, std::find, std::none_of
62 #include <chrono> // std::chrono
63 #include <cstddef> // std::size_t
64 #include <cstdint> // std::int64_t, std::uint32_t, std::uint64_t
65 #include <exception> // std::exception
66 #include <iomanip> // std::setprecision
67 #include <ios> // std::fixed
68 #include <queue> // std::queue
69 #include <set> // std::set
70 #include <sstream> // std::ostringstream
71 #include <stdexcept> // std::logic_error
72 #include <string> // std::stoll, std::stoul, std::string, std::to_string
73 #include <string_view> // std::string_view
74 #include <utility> // std::pair
75 #include <vector> // std::vector
76 
78 
79  /*
80  * CONSTANTS
81  */
82 
85 
87  inline constexpr auto httpResponseCodeMin{400}; // minimum HTTP error code
88 
90  inline constexpr auto httpResponseCodeMax{599}; // maximum HTTP error code
91 
93  inline constexpr auto httpResponseCodeIgnore{200}; // HTTP response code to be ignored when checking for errors
94 
96 
97  /*
98  * DECLARATION
99  */
100 
102  class Thread final : public Module::Thread, public Query::Container, private Config {
103  // Extractor thread: final class deriving publicly from Module::Thread and Query::Container, and privately from Config.
104  // for convenience
108 
111 
113 
121 
123  // shorthand pair types used by the declarations below
124  using IdString = std::pair<std::uint64_t, std::string>; // pair of a 64-bit unsigned integer and a string (the URL queues below hold URLs together with their IDs)
125  using StringString = std::pair<std::string, std::string>; // pair of two strings
126 
127  public:
130  // Constructor initializing a previously interrupted extractor thread (takes the thread status in addition to the options).
131  Thread(
132  Main::Database& dbBase,
133  std::string_view cookieDirectory,
134  const ThreadOptions& threadOptions,
135  const NetworkSettings& networkSettings,
136  const ThreadStatus& threadStatus
137  );
138  // NOTE(review): overload without a thread status; presumably used for threads that were not previously interrupted -- confirm in Thread.cpp.
139  Thread(
140  Main::Database& dbBase,
141  std::string_view cookieDirectory,
142  const ThreadOptions& threadOptions,
143  const NetworkSettings& networkSettings
144  );
145 
147 
150 
151  protected:
154 
157 
161 
164 
167 
171  // Queue of URLs in the cache to still be processed, and their IDs.
173  std::queue<IdString> urls;
174 
176  // The time until which the URLs in the cache are locked, as string.
179  std::string cacheLockTime;
180  // Extracted data that has not yet been written to the database.
182  std::queue<DataEntry> results;
183  // Linked data that has not yet been written to the database.
185  std::queue<DataEntry> linked;
186 
188  // ID cache.
193  std::set<std::string> ids;
194  // Queue of URLs in the cache that have been finished.
196  std::queue<IdString> finished;
197 
201  // thread event handlers, each overriding a virtual function of a base class
202  void onInit() override; // initializes the extractor
203  void onTick() override; // performs an extractor tick
204  void onPause() override; // pauses the extractor
205  void onUnpause() override; // unpauses the extractor
206  void onClear() override; // clears the extractor
207  void onReset() override; // resets the extractor
208 
210 
211  private:
212  // table names for locking them
213  std::string extractingTable;
214  std::string targetTable;
215  std::string linkedTable;
216 
217  // queries
218  /*
219  * make sure to initialize AND delete them!
220  * -> initQueries(), deleteQueries()
221  */
222  std::vector<QueryStruct> queriesVariables;
223  std::vector<QueryStruct> queriesVariablesSkip;
224  std::vector<QueryStruct> queriesTokens;
225  std::vector<QueryStruct> queriesErrorFail;
226  std::vector<QueryStruct> queriesErrorRetry;
227  std::vector<QueryStruct> queriesDatasets;
228  std::vector<QueryStruct> queriesId;
229  std::vector<QueryStruct> queriesDateTime;
230  std::vector<QueryStruct> queriesFields;
231  std::vector<QueryStruct> queriesRecursive;
232  std::vector<QueryStruct> queriesLinkedDatasets;
233  std::vector<QueryStruct> queriesLinkedId;
234  std::vector<QueryStruct> queriesLinkedFields;
235  QueryStruct queryPagingIsNextFrom;
236  QueryStruct queryPagingNextFrom;
237  QueryStruct queryPagingNumberFrom;
238  QueryStruct queryExpected;
239  QueryStruct queryExtractingSkip;
240 
241  // timing
242  std::uint64_t tickCounter{};
243  std::chrono::steady_clock::time_point startTime{std::chrono::steady_clock::time_point::min()};
244  std::chrono::steady_clock::time_point pauseTime{std::chrono::steady_clock::time_point::min()};
245  std::chrono::steady_clock::time_point idleTime{std::chrono::steady_clock::time_point::min()};
246 
247  // state
248  bool idle{false}; // waiting for new URLs to be crawled
249  std::uint64_t lastUrl{}; // last extracted URL
250  std::string lockTime; // last locking time for currently extracted URL
251 
252  // properties used for progress calculation
253  std::uint64_t idFirst{}; // ID of the first URL fetched
254  std::uint64_t idDist{}; // distance between the IDs of first and last URL fetched
255  float posFirstF{}; // position of the first URL fetched as float
256  std::uint64_t posDist{}; // distance between the positions of first and last URL fetched
257  std::uint64_t total{}; // number of total URLs in URL list
258 
259  // initializing functions
260  void setUpConfig(std::queue<std::string>& warningsTo);
261  void checkQueries();
262  void setUpLogging();
263  void setUpContainer();
264  void setUpDatabase();
265  void setUpSources();
266  void setUpTableNames();
267  void setUpTarget();
268  void setUpSqlStatements();
269  void setUpNetworking();
270  void setUpTor();
271  void setUpQueries();
272  void checkExtractingTable();
273  void setUpTimers();
274  void ready();
275  void logWarnings(std::queue<std::string>& warnings);
276  void logWarningsUrl(std::queue<std::string>& warnings);
277  void logWarningsSource(std::queue<std::string>& warnings, std::string_view source);
278 
279  // query functions
280  void initQueries() override;
281  void deleteQueries() override;
282  void addOptionalQuery(std::uint64_t queryId, QueryStruct& propertiesTo);
283  void addQueries(
284  const std::vector<std::uint64_t>& queryIds,
285  std::vector<QueryStruct>& propertiesTo
286  );
287  void addQueriesTo(
288  const std::vector<std::uint64_t>& queryIds,
289  std::vector<QueryStruct>& propertiesTo
290  );
291  void addQueriesTo(
292  std::string_view type,
293  const std::vector<std::string>& names,
294  const std::vector<std::uint64_t>& queryIds,
295  std::vector<QueryStruct>& propertiesTo
296  );
297 
298  // extracting functions
299  void extractingUrlSelection();
300  void extractingFetchUrls();
301  void extractingCheckUrls();
302  std::size_t extractingNext();
303  void extractingGetVariableValues(std::vector<StringString>& variables);
304  bool extractingIsSkip(const std::vector<StringString>& variables);
305  void extractingGetTokenValues(std::vector<StringString>& variables);
306  void extractingGetPageTokenValues(
307  const std::string& page,
308  std::vector<StringString>& tokens,
309  const std::vector<StringString>& variables
310  );
311  std::string extractingGetTokenValue(
312  const std::string& name,
313  const std::string& source,
314  const std::string& setCookies,
315  const std::vector<std::string>& setHeaders,
316  bool usePost,
317  const QueryStruct& query
318  );
319  void extractingPageContent(
320  const std::string& url,
321  const std::string& setCookies,
322  const std::vector<std::string>& setHeaders,
323  std::string& resultTo
324  );
325  void extractingGetValueFromContent(const QueryStruct& query, std::string& resultTo);
326  void extractingGetValueFromUrl(const QueryStruct& query, std::string& resultTo);
327  bool extractingPageIsSkip(std::queue<std::string>& queryWarningsTo);
328  bool extractingPageIsRetry(std::queue<std::string>& queryWarningsTo);
329  std::size_t extractingPage(std::uint64_t contentId, const std::string& url);
330  std::size_t extractingLinked(std::uint64_t contentId, const std::string& url);
331  bool extractingCheckCurlCode(CURLcode curlCode, const std::string& url);
332  bool extractingCheckResponseCode(const std::string& url, std::uint32_t responseCode);
333  void extractingUrlFinished(bool success);
334  void extractingSaveLinked();
335  void extractingSaveResults(bool warped);
336  void extractingReset(std::string_view error, std::string_view source);
337  void extractingResetTor();
338  void extractingUnset(
339  const std::string& unsetCookies,
340  const std::vector<std::string>& unsetHeaders
341  );
342  void extractingFieldWarning(
343  std::string_view error,
344  std::string_view name,
345  std::string_view url,
346  bool isLinked
347  );
348 
349  // shadow functions not to be used by the thread
350  void pause();
351  void start();
352  void unpause();
353  void stop();
354  void interrupt();
355  };
356 
357 } /* namespace crawlservpp::Module::Extractor */
358 
359 #endif /* MODULE_EXTRACTOR_THREAD_HPP_ */
Class for TOR control exceptions.
Definition: TorControl.hpp:129
Query properties containing its name, text, type, and result type(s).
Definition: QueryProperties.hpp:39
A data entry containing either parsed or extracted data.
Definition: DataEntry.hpp:45
std::queue< IdString > finished
Queue of URLs in the cache that have been finished.
Definition: Thread.hpp:196
std::queue< DataEntry > linked
Linked data that has not yet been written to the database.
Definition: Thread.hpp:185
Class for query container exceptions.
Definition: Container.hpp:148
Query container.
Definition: Container.hpp:76
Network settings containing the default proxy as well as host, port, and password of the TOR control ...
Definition: NetworkSettings.hpp:49
void onInit() override
Initializes the extractor.
Definition: Thread.cpp:109
Thread status containing its ID, status message, pause state, and progress.
Definition: ThreadStatus.hpp:54
void onClear() override
Clears the extractor.
Definition: Thread.cpp:367
#define MAIN_EXCEPTION_CLASS()
Macro used to easily define classes for general exceptions.
Definition: Exception.hpp:50
Network::Curl networking
Networking for the extractor thread.
Definition: Thread.hpp:163
Thread options containing the name of the module run, as well as the IDs of the website, URL list, and configuration used.
Definition: ThreadOptions.hpp:40
constexpr auto httpResponseCodeIgnore
HTTP response code to be ignored when checking for errors.
Definition: Thread.hpp:93
Abstract class providing module-independent thread functionality.
Definition: Thread.hpp:93
Class handling database access for the command-and-control and its threads.
Definition: Database.hpp:366
constexpr auto httpResponseCodeMax
Maximum HTTP error code.
Definition: Thread.hpp:90
Class for UTF-8 exceptions.
Definition: Utf8.hpp:122
Structure containing all the data needed to keep the status of a thread updated.
Definition: StatusSetter.hpp:57
std::queue< IdString > urls
Queue of URLs in the cache to still be processed, and their IDs.
Definition: Thread.hpp:173
Class for libcurl exceptions.
Definition: Curl.hpp:260
Class for date/time locale exceptions.
Definition: DateTime.hpp:337
std::string cacheLockTime
The time until which the URLs in the cache are locked, as string.
Definition: Thread.hpp:179
Controls a TOR service via a TOR control server/port, if available.
Definition: TorControl.hpp:81
void onReset() override
Resets the extractor.
Definition: Thread.cpp:429
Provides an interface to the libcurl library for sending and receiving data over the network...
Definition: Curl.hpp:168
void onPause() override
Pauses the extractor.
Definition: Thread.cpp:342
Configuration for extractors.
Definition: Config.hpp:137
Structure to identify a query including its type and result type(s).
Definition: QueryStruct.hpp:40
Network::TorControl torControl
TOR control for the extractor thread.
Definition: Thread.hpp:166
void onUnpause() override
Unpauses the extractor.
Definition: Thread.cpp:354
constexpr auto httpResponseCodeMin
Minimum HTTP error code.
Definition: Thread.hpp:87
Namespace for extractor classes.
Definition: Config.hpp:44
void onTick() override
Performs an extractor tick.
Definition: Thread.cpp:152
Thread(Main::Database &dbBase, std::string_view cookieDirectory, const ThreadOptions &threadOptions, const NetworkSettings &networkSettings, const ThreadStatus &threadStatus)
Constructor initializing a previously interrupted extractor thread.
Definition: Thread.cpp:54
std::set< std::string > ids
ID cache.
Definition: Thread.hpp:193
Class for date/time exceptions.
Definition: DateTime.hpp:330
Template class for safe in-scope database locks.
Definition: DatabaseLock.hpp:54
Extractor thread.
Definition: Thread.hpp:102
Database database
Database connection for the extractor thread.
Definition: Thread.hpp:156
std::queue< DataEntry > results
Extracted data that has not yet been written to the database.
Definition: Thread.hpp:182
Class providing database functionality for extractor threads by implementing Wrapper::Database.
Definition: Database.hpp:162