crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
Database.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2022 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * Database.hpp
24  *
25  * This class provides database functionality for an extractor thread
26  * by implementing the Wrapper::Database interface.
27  *
28  * Created on: May 9, 2019
29  * Author: ans
30  */
31 
32 #ifndef MODULE_EXTRACTOR_DATABASE_HPP_
33 #define MODULE_EXTRACTOR_DATABASE_HPP_
34 
35 #include "Config.hpp"
36 
37 #include "../../Helper/CommaLocale.hpp"
38 #include "../../Main/Exception.hpp"
39 #include "../../Struct/DataEntry.hpp"
40 #include "../../Struct/StatusSetter.hpp"
41 #include "../../Struct/TableColumn.hpp"
42 #include "../../Struct/TargetTableProperties.hpp"
43 #include "../../Wrapper/Database.hpp"
44 
45 #include "../../Helper/Portability/mysqlcppconn.h"
46 
47 #include <cppconn/exception.h>
48 #include <cppconn/prepared_statement.h>
49 #include <cppconn/resultset.h>
50 #include <cppconn/statement.h>
51 #include <mysql_connection.h>
52 
53 #include <algorithm> // std::count_if, std::find_if
54 #include <chrono> // std::chrono
55 #include <cstddef> // std::size_t
56 #include <cstdint> // std::uint8_t, std::uint16_t, std::uint32_t, std::uint64_t
57 #include <memory> // std::unique_ptr
58 #include <queue> // std::queue
59 #include <sstream> // std::ostringstream
60 #include <string> // std::string, std::to_string
61 #include <string_view> // std::string_view, std::string_view_literals
62 #include <utility> // std::pair
63 #include <vector> // std::vector
64 
66 
67  /*
68  * CONSTANTS
69  */
70 
    // Brings the ""sv literal suffix into scope; it is used below to define
    //  std::string_view constants (e.g. maxContentSizeString).
71  using std::string_view_literals::operator""sv;
72 
75 
    // Minimum number of columns in the target table.
77  inline constexpr auto minTargetColumns{4};
78 
    // Minimum number of columns in the linked target table.
80  inline constexpr auto minLinkedColumns{2};
81 
    // Maximum size of database content (1073741824 bytes = 1 GiB).
83  inline constexpr auto maxContentSize{1073741824};
84 
    // Maximum size of database content as a human-readable string;
    //  must be kept in sync with maxContentSize.
86  inline constexpr auto maxContentSizeString{"1 GiB"sv};
87 
91 
    // Process one value at once.
93  inline constexpr auto oneAtOnce{1};
94 
    // Process ten values at once.
96  inline constexpr auto nAtOnce10{10};
97 
    // Process one hundred values at once.
99  inline constexpr auto nAtOnce100{100};
100 
     // First argument in a SQL query.
102  inline constexpr auto sqlArg1{1};
103 
     // Second argument in a SQL query.
105  inline constexpr auto sqlArg2{2};
106 
     // Third argument in a SQL query.
108  inline constexpr auto sqlArg3{3};
109 
     // Fourth argument in a SQL query.
111  inline constexpr auto sqlArg4{4};
112 
     // Fifth argument in a SQL query.
114  inline constexpr auto sqlArg5{5};
115 
     // Alias, used in SQL queries, for the extracting table.
     // NOTE(review): same letter as parsedDataTableAlias; presumably the two
     //  are never used in the same query -- confirm in Database.cpp.
117  inline constexpr auto extractingTableAlias{"a"sv};
118 
     // Alias, used in SQL queries, for the target table.
     // NOTE(review): same letter as crawledDataTableAlias -- see note above.
120  inline constexpr auto targetTableAlias{"b"sv};
121 
     // Alias, used in SQL queries, for the linked target table.
     // NOTE(review): same letter as urlListTableAlias -- see note above.
123  inline constexpr auto linkedTableAlias{"c"sv};
124 
     // Alias, used in SQL queries, for the parsed data table.
126  inline constexpr auto parsedDataTableAlias{"a"sv};
127 
     // Alias, used in SQL queries, for the crawled data table.
129  inline constexpr auto crawledDataTableAlias{"b"sv};
130 
     // Alias, used in SQL queries, for the URL list table.
132  inline constexpr auto urlListTableAlias{"c"sv};
133 
     // Number of arguments to lock one URL.
135  inline constexpr auto numArgsLockUrl{3};
136 
     // Number of arguments to add or update one data entry
     //  (without custom columns).
138  inline constexpr auto numArgsAddUpdateData{4};
139 
     // Number of additional arguments when data is linked.
141  inline constexpr auto numArgsLinked{2};
142 
     // Number of additional arguments when overwriting existing data.
144  inline constexpr auto numArgsOverwriteData{3};
145 
     // Number of arguments to add or update one linked data entry.
147  inline constexpr auto numArgsAddUpdateLinkedData{2};
148 
     // Number of additional arguments when overwriting existing linked data.
150  inline constexpr auto numArgsOverwriteLinkedData{2};
151 
     // Number of arguments to set a URL to finished.
153  inline constexpr auto numArgsFinishUrl{2};
154 
156 
157  /*
158  * DECLARATION
159  */
160 
     // Class providing database functionality for extractor threads
     //  by implementing the Wrapper::Database interface.
     //
     // The class is final; it is constructed from a reference to the
     //  thread's Module::Database connection.
162  class Database final : public Wrapper::Database {
163  // for convenience
168 
     // pair of a 64-bit ID and a string; used for queued URLs
     //  (presumably URL ID and URL -- confirm against Database.cpp)
169  using IdString = std::pair<std::uint64_t, std::string>;
     // pair of two strings; used for data sources (table and column)
170  using StringString = std::pair<std::string, std::string>;
     // owning pointer to a SQL result set
171  using SqlResultSetPtr = std::unique_ptr<sql::ResultSet>;
172 
173  public:
176 
     // Constructor setting the database connection for the thread.
177  explicit Database(Module::Database& dbThread);
178 
182 
     /*
      * SETTERS
      */

     // Sets the maximum cache size for URLs.
183  void setCacheSize(std::uint64_t setCacheSize);
     // Sets the maximum number of URLs and results to be processed at once.
184  void setMaxBatchSize(std::uint16_t setMaxBatchSize);
     // Sets whether to re-extract data from already processed URLs.
185  void setReExtract(bool isReExtract);
     // Sets whether to extract data from custom URLs.
186  void setExtractCustom(bool isExtractCustom);
     // Sets whether raw crawled data is used as source for the data to be extracted.
187  void setRawContentIsSource(bool isRawContentIsSource);
     // Sets the tables and columns of the parsed data sources.
     // NOTE(review): takes the queue by non-const reference; presumably its
     //  content is moved or swapped out -- confirm in Database.cpp.
188  void setSources(std::queue<StringString>& tablesAndColumns);
     // Sets the name of the target table.
189  void setTargetTable(const std::string& table);
     // Sets the columns of the target table.
190  void setTargetFields(const std::vector<std::string>& fields);
     // Sets the name of the linked table.
191  void setLinkedTable(const std::string& table);
     // Sets the name of the linked field.
192  void setLinkedField(const std::string& field);
     // Sets the columns of the linked table.
193  void setLinkedFields(const std::vector<std::string>& fields);
     // Sets whether existing datasets with the same ID will be overwritten.
194  void setOverwrite(bool isOverwrite);
     // Sets whether existing linked datasets with the same ID will be overwritten.
195  void setOverwriteLinked(bool isOverwrite);
196 
200 
     /*
      * TARGET TABLE INITIALIZATION
      */

     // Creates the target table, if it does not exist, or adds target
     //  columns needed by the extractor.
201  void initTargetTables();
202 
206 
     /*
      * PREPARED SQL STATEMENTS
      */

     // Prepares the SQL statements needed by the extractor.
207  void prepare();
208 
212 
     /*
      * URLS
      */

     // Fetches, locks, and adds the next URLs to the cache,
     //  i.e. to the caching queue to be processed.
     // NOTE(review): the returned string presumably is the lock expiration
     //  time of the fetched URLs -- confirm in Database.cpp.
213  [[nodiscard]] std::string fetchUrls(std::uint64_t lastId, std::queue<IdString>& cache, std::uint32_t lockTimeout);
     // Gets the position of a URL in the URL list.
214  [[nodiscard]] std::uint64_t getUrlPosition(std::uint64_t urlId);
     // Gets the number of URLs in the URL list.
215  [[nodiscard]] std::uint64_t getNumberOfUrls();
216 
220 
     /*
      * URL LOCKING
      */

     // Gets the current URL lock expiration time from the database.
221  [[nodiscard]] std::string getLockTime(std::uint32_t lockTimeout);
     // Gets the current lock expiration time for a URL from the database.
222  [[nodiscard]] std::string getUrlLockTime(std::uint64_t urlId);
     // Locks a URL in the database, if it is lockable, or extends its
     //  locking time, if it is still locked.
     // NOTE(review): the returned string presumably is the new lock
     //  expiration time (empty on failure?) -- confirm in Database.cpp.
223  [[nodiscard]] std::string renewUrlLockIfOk(std::uint64_t urlId, const std::string& lockTime, std::uint32_t lockTimeout);
     // Unlocks a URL in the database.
     // NOTE(review): the meaning of the boolean result is not visible
     //  in this header -- see Database.cpp.
224  bool unLockUrlIfOk(std::uint64_t urlId, const std::string& lockTime);
     // Unlocks multiple URLs in the database at once.
     // NOTE(review): both arguments are non-const references and may be
     //  modified by the call -- confirm in Database.cpp.
225  void unLockUrlsIfOk(std::queue<IdString>& urls, std::string& lockTime);
226 
230 
     /*
      * EXTRACTING
      */

     // Checks the extracting table.
     // NOTE(review): the meaning of the returned number is not visible
     //  in this header -- see Database.cpp.
231  std::uint32_t checkExtractingTable();
     // Gets the latest content stored in the database for a specific URL;
     //  the result is written to contentTo.
232  bool getContent(std::uint64_t urlId, IdString& contentTo);
     // Gets parsed data from the given source stored in the database
     //  for a specific URL; the result is written to resultTo.
233  void getLatestParsedData(std::uint64_t urlId, std::size_t sourceIndex, std::string& resultTo);
     // Adds extracted data to the database, or updates data that already exists.
234  void updateOrAddEntries(std::queue<DataEntry>& entries, StatusSetter& statusSetter);
     // Adds linked data to the database, or updates data that already exists.
235  void updateOrAddLinked(std::queue<DataEntry>& entries, StatusSetter& statusSetter);
     // Sets URLs to finished in the database, except those locked by another thread.
236  void setUrlsFinishedIfLockOk(std::queue<IdString>& finished);
     // Updates the target table.
237  void updateTargetTable();
238 
241 
242  private:
243  // options
     // (cacheSize and maxBatchSize default to the constants
     //  defaultCacheSize and defaultMaxBatchSize)
244  std::uint64_t cacheSize{defaultCacheSize};
245  std::uint16_t maxBatchSize{defaultMaxBatchSize};
246  bool reExtract{false};
247  bool extractCustom{false};
248  std::string targetTableName;
249  std::string linkedTableName;
250  std::vector<std::string> targetFieldNames;
251  std::vector<std::string> linkedFieldNames;
252  bool overwrite{true};
253  bool overwriteLinked{true};
     // NOTE(review): presumably set when a linked table is in use -- confirm
254  bool linked{false};
255 
256  // sources
257  bool rawContentIsSource{false};
258  std::queue<StringString> sources;
259 
260  // table names, target table IDs, and linked field
261  std::string urlListTable;
262  std::string extractingTable;
263  std::uint64_t targetTableId{};
264  std::uint64_t linkedTableId{};
265  std::string targetTableFull;
266  std::string linkedTableFull;
267  std::string linkedField;
268  std::uint64_t linkedIndex{};
269 
270  // IDs of prepared SQL statements
     // (all IDs are value-initialized to zero; the single/10/100/Max variants
     //  presumably correspond to the batch sizes oneAtOnce, nAtOnce10,
     //  nAtOnce100, and maxBatchSize -- confirm in Database.cpp)
271  struct _ps {
272  std::size_t fetchUrls{};
273  std::size_t lockUrl{};
274  std::size_t lock10Urls{};
275  std::size_t lock100Urls{};
276  std::size_t lockMaxUrls{};
277  std::size_t getUrlPosition{};
278  std::size_t getNumberOfUrls{};
279  std::size_t getLockTime{};
280  std::size_t getUrlLockTime{};
281  std::size_t renewUrlLockIfOk{};
282  std::size_t unLockUrlIfOk{};
283  std::size_t checkExtractingTable{};
284  std::size_t getContent{};
285  std::size_t updateOrAddEntry{};
286  std::size_t updateOrAddLinked{};
287  std::size_t updateOrAdd10Entries{};
288  std::size_t updateOrAdd10Linked{};
289  std::size_t updateOrAdd100Entries{};
290  std::size_t updateOrAdd100Linked{};
291  std::size_t updateOrAddMaxEntries{};
292  std::size_t updateOrAddMaxLinked{};
293  std::size_t setUrlFinishedIfLockOk{};
294  std::size_t set10UrlsFinishedIfLockOk{};
295  std::size_t set100UrlsFinishedIfLockOk{};
296  std::size_t setMaxUrlsFinishedIfLockOk{};
297  std::size_t updateTargetTable{};
298  } ps;
299 
300  // prepared SQL statements for getting parsed data
     // (a vector of statement IDs; presumably one per parsed data source,
     //  indexed like the sourceIndex of getLatestParsedData -- confirm)
301  std::vector<std::size_t> psGetLatestParsedData;
302 
303  // internal helper functions
     // (the query...() helpers return a string for the given number of
     //  URLs/entries; presumably the SQL query text used to prepare the
     //  corresponding batch statement -- confirm in Database.cpp)
304  bool checkEntrySize(DataEntry& entry);
305  [[nodiscard]] std::string queryLockUrls(std::size_t numberOfUrls);
306  [[nodiscard]] std::string queryUpdateOrAddEntries(std::size_t numberOfEntries);
307  [[nodiscard]] std::string queryUpdateOrAddLinked(std::size_t numberOfEntries);
308  [[nodiscard]] std::string querySetUrlsFinishedIfLockOk(std::size_t numberOfUrls);
309  [[nodiscard]] std::string queryUnlockUrlsIfOk(std::size_t numberOfUrls);
310  };
311 
312 } /* namespace crawlservpp::Module::Extractor */
313 
314 #endif /* MODULE_EXTRACTOR_DATABASE_HPP_ */
void getLatestParsedData(std::uint64_t urlId, std::size_t sourceIndex, std::string &resultTo)
Gets parsed data from the given source stored in the database for a specific URL. ...
Definition: Database.cpp:1604
void setExtractCustom(bool isExtractCustom)
Sets whether to extract data from custom URLs.
Definition: Database.cpp:103
bool getContent(std::uint64_t urlId, IdString &contentTo)
Gets the latest content stored in the database for a specific URL.
Definition: Database.cpp:1528
A data entry containing either parsed or extracted data.
Definition: DataEntry.hpp:45
void updateOrAddEntries(std::queue< DataEntry > &entries, StatusSetter &statusSetter)
Adds extracted data to the database, or updates data that already exists.
Definition: Database.cpp:1675
constexpr auto crawledDataTableAlias
Alias, used in SQL queries, for the crawled data table.
Definition: Database.hpp:129
constexpr auto urlListTableAlias
Alias, used in SQL queries, for the URL list table.
Definition: Database.hpp:132
constexpr auto minLinkedColumns
Minimum number of columns in the linked target table.
Definition: Database.hpp:80
void setOverwriteLinked(bool isOverwrite)
Sets whether existing linked datasets with the same ID will be overwritten.
Definition: Database.cpp:252
constexpr auto sqlArg2
Second argument in a SQL query.
Definition: Database.hpp:105
void unLockUrlsIfOk(std::queue< IdString > &urls, std::string &lockTime)
Unlocks multiple URLs in the database at once.
Definition: Database.cpp:1401
constexpr auto numArgsFinishUrl
Number of arguments to set a URL to finished.
Definition: Database.hpp:153
void setReExtract(bool isReExtract)
Sets whether to re-extract data from already processed URLs.
Definition: Database.cpp:88
std::string getUrlLockTime(std::uint64_t urlId)
Gets the current lock expiration time for a URL from the database.
Definition: Database.cpp:1198
Target table properties containing its type, website, URL list, table names, columns, and compression.
Definition: TargetTableProperties.hpp:44
void updateTargetTable()
Updates the target table.
Definition: Database.cpp:2416
constexpr auto numArgsAddUpdateLinkedData
Number of arguments to add or update one linked data entry.
Definition: Database.hpp:147
constexpr auto numArgsLinked
Number of additional arguments when data is linked.
Definition: Database.hpp:141
Class handling database access for threads.
Definition: Database.hpp:91
void setCacheSize(std::uint64_t setCacheSize)
Sets the maximum cache size for URLs.
Definition: Database.cpp:62
#define MAIN_EXCEPTION_CLASS()
Macro used to easily define classes for general exceptions.
Definition: Exception.hpp:50
std::uint64_t getUrlPosition(std::uint64_t urlId)
Gets the position of a URL in the URL list.
Definition: Database.cpp:1021
constexpr auto numArgsAddUpdateData
Number of arguments to add or update one data entry (without custom columns).
Definition: Database.hpp:138
void setMaxBatchSize(std::uint16_t setMaxBatchSize)
Sets the maximum number of URLs and results to be processed at once.
Definition: Database.cpp:73
constexpr auto parsedDataTableAlias
Alias, used in SQL queries, for the parsed data table.
Definition: Database.hpp:126
void setLinkedFields(const std::vector< std::string > &fields)
Sets the columns of the linked table.
Definition: Database.cpp:222
void initTargetTables()
Creates the target table, if it does not exist, or adds target columns needed by the extractor...
Definition: Database.cpp:286
void setLinkedField(const std::string &field)
Sets the name of the linked field.
Definition: Database.cpp:206
void setTargetFields(const std::vector< std::string > &fields)
Sets the columns of the target table.
Definition: Database.cpp:171
constexpr auto nAtOnce100
Process one hundred values at once.
Definition: Database.hpp:99
constexpr auto sqlArg5
Fifth argument in a SQL query.
Definition: Database.hpp:114
constexpr auto maxContentSize
Maximum size of database content (= 1 GiB).
Definition: Database.hpp:83
constexpr auto numArgsLockUrl
Number of arguments to lock one URL.
Definition: Database.hpp:135
void updateOrAddLinked(std::queue< DataEntry > &entries, StatusSetter &statusSetter)
Adds linked data to the database, or updates data that already exists.
Definition: Database.cpp:2060
constexpr auto numArgsOverwriteData
Number of additional arguments when overwriting existing data.
Definition: Database.hpp:144
std::string renewUrlLockIfOk(std::uint64_t urlId, const std::string &lockTime, std::uint32_t lockTimeout)
Locks a URL in the database, if it is lockable, or extends its locking time, if it is still locked by...
Definition: Database.cpp:1268
Database(Module::Database &dbThread)
Constructor setting the database connection for the thread.
Definition: Database.cpp:45
constexpr auto nAtOnce10
Process ten values at once.
Definition: Database.hpp:96
Structure for table columns containing its name, type, reference, and indexing.
Definition: TableColumn.hpp:39
constexpr auto sqlArg1
First argument in a SQL query.
Definition: Database.hpp:102
constexpr auto targetTableAlias
Alias, used in SQL queries, for the target table.
Definition: Database.hpp:120
bool unLockUrlIfOk(std::uint64_t urlId, const std::string &lockTime)
Unlocks a URL in the database.
Definition: Database.cpp:1340
void prepare()
Prepares the SQL statements needed by the extractor.
Definition: Database.cpp:430
Structure containing all the data needed to keep the status of a thread updated.
Definition: StatusSetter.hpp:57
Wrapper class providing the database functionality of Module::Database to its child classes...
Definition: Database.hpp:72
void setOverwrite(bool isOverwrite)
Sets whether existing datasets with the same ID will be overwritten.
Definition: Database.cpp:237
std::uint64_t getNumberOfUrls()
Gets the number of URLs in the URL list.
Definition: Database.cpp:1083
constexpr auto linkedTableAlias
Alias, used in SQL queries, for the linked target table.
Definition: Database.hpp:123
void setLinkedTable(const std::string &table)
Sets the name of the linked table.
Definition: Database.cpp:187
constexpr std::uint16_t defaultMaxBatchSize
Default number of URLs and results to be processed in one MySQL query.
Definition: Config.hpp:98
constexpr auto oneAtOnce
Process one value at once.
Definition: Database.hpp:93
constexpr std::uint64_t defaultCacheSize
Default cache size.
Definition: Config.hpp:92
void setTargetTable(const std::string &table)
Sets the name of the target table.
Definition: Database.cpp:153
constexpr auto minTargetColumns
Minimum number of columns in the target table.
Definition: Database.hpp:77
constexpr auto sqlArg4
Fourth argument in a SQL query.
Definition: Database.hpp:111
void setRawContentIsSource(bool isRawContentIsSource)
Sets whether raw crawled data is used as source for the data to be extracted.
Definition: Database.cpp:118
std::string fetchUrls(std::uint64_t lastId, std::queue< IdString > &cache, std::uint32_t lockTimeout)
Fetches, locks, and adds the next URLs to the cache, i.e. to the caching queue to be processed...
Definition: Database.cpp:885
constexpr auto maxContentSizeString
Maximum size of database content as string.
Definition: Database.hpp:86
constexpr auto sqlArg3
Third argument in a SQL query.
Definition: Database.hpp:108
Namespace for extractor classes.
Definition: Config.hpp:44
std::string getLockTime(std::uint32_t lockTimeout)
Gets the current URL lock expiration time from the database.
Definition: Database.cpp:1142
constexpr auto numArgsOverwriteLinkedData
Number of additional arguments when overwriting existing linked data.
Definition: Database.hpp:150
constexpr auto extractingTableAlias
Alias, used in SQL queries, for the extracting table.
Definition: Database.hpp:117
std::uint32_t checkExtractingTable()
Checks the extracting table.
Definition: Database.cpp:1471
void setSources(std::queue< StringString > &tablesAndColumns)
Sets the tables and columns of the parsed data sources.
Definition: Database.cpp:137
void setUrlsFinishedIfLockOk(std::queue< IdString > &finished)
Sets URLs to finished in the database, except those locked by another thread.
Definition: Database.cpp:2317
Class providing database functionality for extractor threads by implementing Wrapper::Database.
Definition: Database.hpp:162