31 #ifndef MODULE_EXTRACTOR_THREAD_HPP_ 32 #define MODULE_EXTRACTOR_THREAD_HPP_ 37 #include "../Thread.hpp" 39 #include "../../Helper/CommaLocale.hpp" 40 #include "../../Helper/DateTime.hpp" 41 #include "../../Helper/DotLocale.hpp" 42 #include "../../Helper/Strings.hpp" 43 #include "../../Helper/Utf8.hpp" 44 #include "../../Main/Exception.hpp" 45 #include "../../Network/Curl.hpp" 46 #include "../../Network/TorControl.hpp" 47 #include "../../Query/Container.hpp" 48 #include "../../Struct/DataEntry.hpp" 49 #include "../../Struct/NetworkSettings.hpp" 50 #include "../../Struct/QueryProperties.hpp" 51 #include "../../Struct/QueryStruct.hpp" 52 #include "../../Struct/StatusSetter.hpp" 53 #include "../../Struct/ThreadOptions.hpp" 54 #include "../../Struct/ThreadStatus.hpp" 55 #include "../../Timer/Simple.hpp" 57 #include "../../_extern/jsoncons/include/jsoncons/json.hpp" 58 #include "../../_extern/jsoncons/include/jsoncons_ext/jsonpath/json_query.hpp" 59 #include "../../_extern/rapidjson/include/rapidjson/document.h" 73 #include <string_view> 124 using IdString = std::pair<std::uint64_t, std::string>;
// Alias for a pair of two strings. It is the element type of the 'variables' and
// 'tokens' vectors taken by the extracting...() member functions declared below.
// NOTE(review): presumably (name, value) pairs — confirm against the implementation.
125 using StringString = std::pair<std::string, std::string>;
133 std::string_view cookieDirectory,
141 std::string_view cookieDirectory,
// Set of unique strings named 'ids'.
// NOTE(review): the enclosing declaration is outside this excerpt, so which IDs
// are collected here cannot be confirmed from this listing.
193 std::set<std::string>
ids;
// Three string members that, judging by their names, hold database table names.
// NOTE(review): where they are assigned is not visible in this excerpt —
// presumably in setUpTableNames(); confirm against the implementation.
213 std::string extractingTable;
214 std::string targetTable;
215 std::string linkedTable;
// Query lists: one vector of QueryStruct per purpose. QueryStruct identifies a
// query including its type and result type(s) (see QueryStruct.hpp).
// NOTE(review): the purpose of each list is inferred from its name only; the
// code that fills and uses them is not part of this excerpt.

// Queries related to variables, tokens and error handling (by name).
222 std::vector<QueryStruct> queriesVariables;
223 std::vector<QueryStruct> queriesVariablesSkip;
224 std::vector<QueryStruct> queriesTokens;
225 std::vector<QueryStruct> queriesErrorFail;
226 std::vector<QueryStruct> queriesErrorRetry;
// Queries related to datasets and their IDs, date/times and fields (by name).
227 std::vector<QueryStruct> queriesDatasets;
228 std::vector<QueryStruct> queriesId;
229 std::vector<QueryStruct> queriesDateTime;
230 std::vector<QueryStruct> queriesFields;
231 std::vector<QueryStruct> queriesRecursive;
// Queries related to linked datasets (by name).
232 std::vector<QueryStruct> queriesLinkedDatasets;
233 std::vector<QueryStruct> queriesLinkedId;
234 std::vector<QueryStruct> queriesLinkedFields;
// Timing state. The counter is value-initialised to zero; each time point is
// initialised to steady_clock::time_point::min().
// NOTE(review): min() presumably serves as a "not set yet" sentinel — confirm.
242 std::uint64_t tickCounter{};
243 std::chrono::steady_clock::time_point startTime{std::chrono::steady_clock::time_point::min()};
244 std::chrono::steady_clock::time_point pauseTime{std::chrono::steady_clock::time_point::min()};
245 std::chrono::steady_clock::time_point idleTime{std::chrono::steady_clock::time_point::min()};
// lastUrl starts at zero, lockTime as an empty string.
// NOTE(review): presumably the ID of the last processed URL and the expiry of
// the current URL lock — confirm against the implementation.
249 std::uint64_t lastUrl{};
250 std::string lockTime;
// Unsigned 64-bit counters, all value-initialised to zero.
// NOTE(review): their exact meaning (ID/position distances, total) is not
// visible in this excerpt.
253 std::uint64_t idFirst{};
254 std::uint64_t idDist{};
256 std::uint64_t posDist{};
257 std::uint64_t total{};
// Set-up helpers. All return void; only setUpConfig() takes an argument,
// a queue of strings passed by non-const reference.
// NOTE(review): presumably warnings are appended to 'warningsTo' — confirm.
260 void setUpConfig(std::queue<std::string>& warningsTo);
263 void setUpContainer();
264 void setUpDatabase();
266 void setUpTableNames();
268 void setUpSqlStatements();
269 void setUpNetworking();
272 void checkExtractingTable();
// Logging helpers. Each takes the queue of warnings by non-const reference;
// logWarningsSource() additionally takes a 'source' string view.
// NOTE(review): presumably the queue is emptied while logging — confirm.
275 void logWarnings(std::queue<std::string>& warnings);
276 void logWarningsUrl(std::queue<std::string>& warnings);
277 void logWarningsSource(std::queue<std::string>& warnings, std::string_view source);
// initQueries() and deleteQueries() override virtual functions of a base class.
// NOTE(review): the base class is not visible here — presumably the query
// container; confirm.
280 void initQueries()
override;
281 void deleteQueries()
override;
// Takes a query ID and a QueryStruct by non-const reference.
// NOTE(review): presumably leaves 'propertiesTo' untouched when no query is set
// ("optional") — confirm against the implementation.
282 void addOptionalQuery(std::uint64_t queryId,
QueryStruct& propertiesTo);
284 const std::vector<std::uint64_t>& queryIds,
285 std::vector<QueryStruct>& propertiesTo
288 const std::vector<std::uint64_t>& queryIds,
289 std::vector<QueryStruct>& propertiesTo
292 std::string_view type,
293 const std::vector<std::string>& names,
294 const std::vector<std::uint64_t>& queryIds,
295 std::vector<QueryStruct>& propertiesTo
// Extraction helpers (declarations only; behaviour is defined elsewhere).
// NOTE(review): the descriptions below follow from the signatures and names
// only — confirm against the implementation.
299 void extractingUrlSelection();
300 void extractingFetchUrls();
301 void extractingCheckUrls();
// Returns a std::size_t (presumably the number of extracted datasets).
302 std::size_t extractingNext();
// Takes the vector of string pairs by non-const reference (presumably filled).
303 void extractingGetVariableValues(std::vector<StringString>& variables);
// Takes the variables by const reference and returns a bool.
304 bool extractingIsSkip(
const std::vector<StringString>& variables);
// Takes the vector of string pairs by non-const reference (presumably modified).
305 void extractingGetTokenValues(std::vector<StringString>& variables);
306 void extractingGetPageTokenValues(
307 const std::string& page,
308 std::vector<StringString>& tokens,
309 const std::vector<StringString>& variables
311 std::string extractingGetTokenValue(
312 const std::string& name,
313 const std::string& source,
314 const std::string& setCookies,
315 const std::vector<std::string>& setHeaders,
319 void extractingPageContent(
320 const std::string& url,
321 const std::string& setCookies,
322 const std::vector<std::string>& setHeaders,
323 std::string& resultTo
// Further extraction helpers (declarations only; behaviour is defined elsewhere).
// NOTE(review): the descriptions below follow from the signatures and names
// only — confirm against the implementation.

// Both take a query by const reference and a string by non-const reference
// (presumably the query result is written to 'resultTo').
325 void extractingGetValueFromContent(
const QueryStruct& query, std::string& resultTo);
326 void extractingGetValueFromUrl(
const QueryStruct& query, std::string& resultTo);
// Both return a bool and take a queue of strings by non-const reference
// (presumably query warnings are appended to it).
327 bool extractingPageIsSkip(std::queue<std::string>& queryWarningsTo);
328 bool extractingPageIsRetry(std::queue<std::string>& queryWarningsTo);
// Both take a content ID and a URL and return a std::size_t.
329 std::size_t extractingPage(std::uint64_t contentId,
const std::string& url);
330 std::size_t extractingLinked(std::uint64_t contentId,
const std::string& url);
// Checks taking a libcurl result code or a numeric response code together with
// the URL; both return a bool.
331 bool extractingCheckCurlCode(CURLcode curlCode,
const std::string& url);
332 bool extractingCheckResponseCode(
const std::string& url, std::uint32_t responseCode);
333 void extractingUrlFinished(
bool success);
334 void extractingSaveLinked();
335 void extractingSaveResults(
bool warped);
// Reset helpers; extractingReset() takes an error text and a source as string views.
336 void extractingReset(std::string_view error, std::string_view source);
337 void extractingResetTor();
338 void extractingUnset(
339 const std::string& unsetCookies,
340 const std::vector<std::string>& unsetHeaders
342 void extractingFieldWarning(
343 std::string_view error,
344 std::string_view name,
345 std::string_view url,
Class for TOR control exceptions.
Definition: TorControl.hpp:129
Query properties containing its name, text, type, and result type(s).
Definition: QueryProperties.hpp:39
A data entry containing either parsed or extracted data.
Definition: DataEntry.hpp:45
Class for query container exceptions.
Definition: Container.hpp:148
Query container.
Definition: Container.hpp:76
Network settings containing the default proxy as well as host, port, and password of the TOR control server.
Definition: NetworkSettings.hpp:49
Thread status containing its ID, status message, pause state, and progress.
Definition: ThreadStatus.hpp:54
#define MAIN_EXCEPTION_CLASS()
Macro used to easily define classes for general exceptions.
Definition: Exception.hpp:50
Thread options containing the name of the module run, as well as the IDs of the website, URL list, and configuration used.
Definition: ThreadOptions.hpp:40
Abstract class providing module-independent thread functionality.
Definition: Thread.hpp:93
Class handling database access for the command-and-control and its threads.
Definition: Database.hpp:366
Class for UTF-8 exceptions.
Definition: Utf8.hpp:122
Structure containing all the data needed to keep the status of a thread updated.
Definition: StatusSetter.hpp:57
Class for libcurl exceptions.
Definition: Curl.hpp:260
Class for date/time locale exceptions.
Definition: DateTime.hpp:337
Controls a TOR service via a TOR control server/port, if available.
Definition: TorControl.hpp:81
Provides an interface to the libcurl library for sending and receiving data over the network.
Definition: Curl.hpp:168
Structure to identify a query including its type and result type(s).
Definition: QueryStruct.hpp:40
Class for date/time exceptions.
Definition: DateTime.hpp:330
Template class for safe in-scope database locks.
Definition: DatabaseLock.hpp:54