crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
Thread.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2022 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * Thread.hpp
24  *
25  * Implementation of the Thread interface for parser threads.
26  *
27  * Created on: Oct 11, 2018
28  * Author: ans
29  */
30 
31 #ifndef MODULE_PARSER_THREAD_HPP_
32 #define MODULE_PARSER_THREAD_HPP_
33 
34 #include "Config.hpp"
35 #include "Database.hpp"
36 
37 #include "../Thread.hpp"
38 
39 #include "../../Helper/CommaLocale.hpp"
40 #include "../../Helper/DateTime.hpp"
41 #include "../../Helper/DotLocale.hpp"
42 #include "../../Helper/Json.hpp"
43 #include "../../Helper/Strings.hpp"
44 #include "../../Main/Exception.hpp"
45 #include "../../Query/Container.hpp"
46 #include "../../Struct/DataEntry.hpp"
47 #include "../../Struct/QueryProperties.hpp"
48 #include "../../Struct/QueryStruct.hpp"
49 #include "../../Struct/StatusSetter.hpp"
50 #include "../../Struct/ThreadOptions.hpp"
51 #include "../../Struct/ThreadStatus.hpp"
52 #include "../../Timer/Simple.hpp"
53 
54 #include <algorithm> // std::find, std::find_if
55 #include <chrono> // std::chrono
56 #include <cstddef> // std::size_t
57 #include <cstdint> // std::uint8_t, std::uint64_t
58 #include <iomanip> // std::setprecision
59 #include <ios> // std::fixed
60 #include <queue> // std::queue
61 #include <sstream> // std::ostringstream
62 #include <stdexcept> // std::logic_error, std::runtime_error
63 #include <string> // std::string, std::to_string
64 #include <string_view> // std::string_view
65 #include <utility> // std::pair
66 #include <vector> // std::vector
67 
69 
70  /*
71  * CONSTANTS
72  */
73 
76 
78  inline constexpr std::uint8_t updateContentCounterEvery{25}; // number of processed contents after which the thread status will be updated
79 
81 
82  /*
83  * DECLARATION
84  */
85 
87  class Thread final : public Module::Thread, public Query::Container, private Config { // parser thread
88  // for convenience
92 
99 
101 
103 
104  using IdString = std::pair<std::uint64_t, std::string>; // pair of a 64-bit ID and a string; the URL queues below hold URLs together with their IDs
105 
106  public:
109 
110  Thread( // constructor initializing a previously interrupted parser thread
111  Main::Database& dbBase,
112  const ThreadOptions& threadOptions,
113  const ThreadStatus& threadStatus
114  );
115 
116  Thread( // overload taking no previous thread status -- presumably starts a new parser thread; confirm in Thread.cpp
117  Main::Database& dbBase,
118  const ThreadOptions& threadOptions
119  );
120 
122 
125 
126  protected:
129 
132 
136 
138  std::queue<IdString> urls; // queue of URLs in the cache to still be processed, and their IDs
139 
141 
144  std::string cacheLockTime; // the time until which the URLs in the cache are locked, as string
145 
147  std::queue<DataEntry> results; // parsed data that has not yet been written to the database
148 
150  std::queue<IdString> finished; // queue of URLs in the cache that have been finished
151 
155 
156  void onInit() override; // initializes the parser
157  void onTick() override; // performs a parser tick
158  void onPause() override; // pauses the parser
159  void onUnpause() override; // unpauses the parser
160  void onClear() override; // clears the parser
161  void onReset() override; // resets the parser
162 
164 
165  private:
166  // table names for locking them
167  std::string parsingTable;
168  std::string targetTable;
169 
170  // queries
171  /*
172  * make sure to initialize AND delete them!
173  * -> initQueries(), deleteQueries()
174  */
175  std::vector<QueryStruct> queriesSkip;
176  std::vector<QueryStruct> queriesContentIgnore;
177  std::vector<QueryStruct> queriesId;
178  std::vector<QueryStruct> queriesDateTime;
179  std::vector<QueryStruct> queriesFields;
180 
181  // timing
182  std::uint64_t tickCounter{};
183  std::chrono::steady_clock::time_point startTime{std::chrono::steady_clock::time_point::min()};
184  std::chrono::steady_clock::time_point pauseTime{std::chrono::steady_clock::time_point::min()};
185  std::chrono::steady_clock::time_point idleTime{std::chrono::steady_clock::time_point::min()};
186 
187  // parsing state
188  bool idle{false}; // waiting for new URLs to be crawled
189  bool idFromUrlOnly{false}; // ID is exclusively parsed from URL
190  std::uint64_t lastUrl{}; // last URL
191  std::string lockTime; // last locking time for currently parsed URL
192 
193  // properties used for progress calculation
194  std::uint64_t idFirst{}; // ID of the first URL fetched
195  std::uint64_t idDist{}; // distance between the IDs of first and last URL fetched
196  float posFirstF{}; // position of the first URL fetched as float
197  std::uint64_t posDist{}; // distance between the positions of first and last URL fetched
198  std::uint64_t total{}; // number of total URLs in URL list
199 
200  // initializing functions
201  void setUpConfig(std::queue<std::string>& warningsTo);
202  void setUpLogging();
203  void setUpContainer();
204  void setUpDatabase();
205  void setUpTableNames();
206  void setUpTarget();
207  void setUpSqlStatements();
208  void setUpQueries();
209  void checkParsingTable();
210  void setUpTimers();
211  void ready();
212  void logWarnings(std::queue<std::string>& warnings);
213 
214  // query functions
215  void initQueries() override;
216  void deleteQueries() override;
217  void addQueries(
218  const std::vector<std::uint64_t>& queryIds,
219  std::vector<QueryStruct>& propertiesTo
220  );
221  void addQueriesTo(
222  const std::vector<std::uint64_t>& queryIds,
223  std::vector<QueryStruct>& propertiesTo
224  );
225  void addQueriesTo(
226  std::string_view type,
227  const std::vector<std::string>& names,
228  const std::vector<std::uint64_t>& queryIds,
229  std::vector<QueryStruct>& propertiesTo
230  );
231 
232  // parsing functions
233  void parsingUrlSelection();
234  void parsingFetchUrls();
235  void parsingCheckUrls();
236  std::size_t parsingNext();
237  bool parsingContent(const IdString& content, std::string_view parsedId);
238  void parsingUrlFinished(bool success);
239  void parsingSaveResults(bool warped);
240  void parsingFieldWarning(
241  std::string_view error,
242  std::string_view name,
243  std::string_view url
244  );
245 
246  // shadow functions not to be used by the thread
247  void pause();
248  void start();
249  void unpause();
250  void stop();
251  void interrupt();
252  };
253 
254 } /* namespace crawlservpp::Module::Parser */
255 
256 #endif /* MODULE_PARSER_THREAD_HPP_ */
void onReset() override
Resets the parser.
Definition: Thread.cpp:398
Query properties containing its name, text, type, and result type(s).
Definition: QueryProperties.hpp:39
A data entry containing either parsed or extracted data.
Definition: DataEntry.hpp:45
Parser thread.
Definition: Thread.hpp:87
Class for query container exceptions.
Definition: Container.hpp:148
Query container.
Definition: Container.hpp:76
std::queue< DataEntry > results
Parsed data that has not yet been written to the database.
Definition: Thread.hpp:147
Thread status containing its ID, status message, pause state, and progress.
Definition: ThreadStatus.hpp:54
Namespace for parser classes.
Definition: Config.hpp:43
#define MAIN_EXCEPTION_CLASS()
Macro used to easily define classes for general exceptions.
Definition: Exception.hpp:50
Configuration for parsers.
Definition: Config.hpp:92
Class providing database functionality for parser threads by implementing Wrapper::Database.
Definition: Database.hpp:139
Thread options containing the name of the module run, as well as the IDs of the website, URL list, and configuration used.
Definition: ThreadOptions.hpp:40
Abstract class providing module-independent thread functionality.
Definition: Thread.hpp:93
Class for JSON exceptions.
Definition: Json.hpp:136
void onUnpause() override
Unpauses the parser.
Definition: Thread.cpp:319
Class handling database access for the command-and-control and its threads.
Definition: Database.hpp:366
Thread(Main::Database &dbBase, const ThreadOptions &threadOptions, const ThreadStatus &threadStatus)
Constructor initializing a previously interrupted parser thread.
Definition: Thread.cpp:50
void onInit() override
Initializes the parser.
Definition: Thread.cpp:83
Structure containing all the data needed to keep the status of a thread updated.
Definition: StatusSetter.hpp:57
Class for date/time locale exception.
Definition: DateTime.hpp:337
std::queue< IdString > finished
Queue of URLs in the cache that have been finished.
Definition: Thread.hpp:150
void onPause() override
Pauses the parser.
Definition: Thread.cpp:307
Structure to identify a query including its type and result type(s).
Definition: QueryStruct.hpp:40
std::queue< IdString > urls
Queue of URLs in the cache to still be processed, and their IDs.
Definition: Thread.hpp:138
std::string cacheLockTime
The time until which the URLs in the cache are locked, as string.
Definition: Thread.hpp:144
void onClear() override
Clears the parser.
Definition: Thread.cpp:332
void onTick() override
Performs a parser tick.
Definition: Thread.cpp:120
Class for date/time exceptions.
Definition: DateTime.hpp:330
Template class for safe in-scope database locks.
Definition: DatabaseLock.hpp:54
Database database
Database connection for the parser thread.
Definition: Thread.hpp:131
constexpr std::uint8_t updateContentCounterEvery
The number of processed contents after which the thread status will be updated.
Definition: Thread.hpp:78