crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
Database.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2022 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * Database.hpp
24  *
25  * This class provides database functionality for a parser thread
26  * by implementing the Wrapper::Database interface.
27  *
28  * Created on: Oct 22, 2018
29  * Author: ans
30  */
31 
32 #ifndef MODULE_PARSER_DATABASE_HPP_
33 #define MODULE_PARSER_DATABASE_HPP_
34 
35 #include "../../Helper/Portability/mysqlcppconn.h"
36 #include "../../Main/Exception.hpp"
37 #include "../../Struct/DataEntry.hpp"
38 #include "../../Struct/StatusSetter.hpp"
39 #include "../../Struct/TableColumn.hpp"
40 #include "../../Struct/TargetTableProperties.hpp"
41 #include "../../Wrapper/Database.hpp"
42 
43 #include "Config.hpp"
44 
45 #include <cppconn/exception.h>
46 #include <cppconn/prepared_statement.h>
47 #include <cppconn/resultset.h>
48 #include <cppconn/statement.h>
49 #include <mysql_connection.h>
50 
51 #include <algorithm> // std::count_if
52 #include <chrono> // std::chrono
53 #include <cstddef> // std::size_t
54 #include <cstdint> // std::uint8_t, std::uint16_t, std::uint32_t, std::uint64_t
55 #include <memory> // std::unique_ptr
56 #include <queue> // std::queue
57 #include <sstream> // std::ostringstream
58 #include <string> // std::string, std::to_string
59 #include <string_view> // std::string_view, std::string_view_literals
60 #include <utility> // std::pair
61 #include <vector> // std::vector
62 
64 
65  /*
66  * CONSTANTS
67  */
68 
69  using std::string_view_literals::operator""sv; // brings the ""sv literal suffix into scope for the string constants below
70 
73 
    //! Maximum size of database content (= 1 GiB; 1073741824 == 2^30).
75  inline constexpr auto maxContentSize{1073741824};
76 
    //! Maximum size of database content as string.
78  inline constexpr auto maxContentSizeString{"1 GiB"sv};
79 
83 
    //! Process one value at once.
85  inline constexpr auto oneAtOnce{1};
86 
    //! Process ten values at once.
88  inline constexpr auto nAtOnce10{10};
89 
    //! Process one hundred values at once.
91  inline constexpr auto nAtOnce100{100};
92 
    //! First argument in a SQL query.
94  inline constexpr auto sqlArg1{1};
95 
    //! Second argument in a SQL query.
97  inline constexpr auto sqlArg2{2};
98 
    //! Third argument in a SQL query.
100  inline constexpr auto sqlArg3{3};
101 
    //! Fourth argument in a SQL query.
103  inline constexpr auto sqlArg4{4};
104 
    //! Fifth argument in a SQL query.
106  inline constexpr auto sqlArg5{5};
107 
    //! Sixth argument in a SQL query.
109  inline constexpr auto sqlArg6{6};
110 
    //! Alias, used in SQL queries, for the parsing table.
112  inline constexpr auto parsingTableAlias{"a"sv};
113 
    //! Alias, used in SQL queries, for the target table.
115  inline constexpr auto targetTableAlias{"b"sv};
116 
    //! Minimum number of columns in the target table.
118  inline constexpr auto minTargetColumns{4};
119 
    //! Number of arguments for locking one URL.
121  inline constexpr auto numArgsLockUrl{3};
122 
    //! Minimum number of arguments to add or update a data entry.
    // NOTE(review): the identifier spells "Arsg" (presumably "Args" was intended);
    //  left unchanged here because it is referenced by name elsewhere - rename only
    //  together with all of its users.
124  inline constexpr auto minArsgAddUpdateData{5};
125 
    //! Number of arguments for setting one URL to finished.
127  inline constexpr auto numArgsFinishUrl{2};
128 
    //! The maximum value of a DATETIME in the database.
130  inline constexpr auto maxDateTimeValue{"9999-12-31 23:59:59"sv};
131 
133 
134  /*
135  * DECLARATION
136  */
137 
139  class Database final : public Wrapper::Database {
     // Class providing database functionality for parser threads by
     //  implementing Wrapper::Database. Declared final, i.e. it cannot be
     //  derived from. Only declarations live here; the member functions are
     //  defined in Database.cpp.
140  // for convenience
145 
     // pair of a 64-bit unsigned ID and a string
146  using IdString = std::pair<std::uint64_t, std::string>;
     // owning pointer to a SQL result set
147  using SqlResultSetPtr = std::unique_ptr<sql::ResultSet>;
148 
149  public:
152 
     //! Constructor setting the database connection for the thread.
153  explicit Database(Module::Database& dbThread);
154 
158 
159  // setters
     //! Sets the maximum cache size for URLs.
160  void setCacheSize(std::uint64_t setCacheSize);
     //! Sets the maximum number of URLs to be processed at once.
161  void setMaxBatchSize(std::uint16_t setMaxBatchSize);
     //! Sets whether to re-parse data from already processed URLs.
162  void setReparse(bool isReparse);
     //! Sets whether to parse data from custom URLs.
163  void setParseCustom(bool isParseCustom);
     //! Sets the name of the target table.
164  void setTargetTable(const std::string& table);
     //! Sets the columns of the target table.
165  void setTargetFields(const std::vector<std::string>& fields);
166 
170 
     //! Creates the target table, if it does not exist, or adds target columns needed by the parser.
171  void initTargetTable();
172 
176 
     //! Prepares the SQL statements needed by the parser.
177  void prepare();
178 
182 
     //! Fetches, locks, and adds the next URLs to the cache, i.e. to the caching queue.
     // NOTE(review): the meaning of the returned string is not visible in this
     //  header - presumably a lock time; confirm in Database.cpp.
183  [[nodiscard]] std::string fetchUrls(
184  std::uint64_t lastId,
185  std::queue<IdString>& cache,
186  std::uint32_t lockTimeout
187  );
     //! Gets the position of a URL in the URL list.
188  [[nodiscard]] std::uint64_t getUrlPosition(std::uint64_t urlId);
     //! Gets the number of URLs in the URL list.
189  [[nodiscard]] std::uint64_t getNumberOfUrls();
190 
194 
     //! Gets the current URL lock expiration time from the database.
195  [[nodiscard]] std::string getLockTime(std::uint32_t lockTimeout);
     //! Gets the current lock expiration time for a URL from the database.
196  [[nodiscard]] std::string getUrlLockTime(std::uint64_t urlId);
     //! Locks a URL in the database, if it is lockable, or extends its locking time.
     // NOTE(review): the exact condition for extending an existing lock is
     //  not visible here - presumably "still locked by this thread"; confirm
     //  in Database.cpp.
197  [[nodiscard]] std::string renewUrlLockIfOk(
198  std::uint64_t urlId,
199  const std::string& lockTime,
200  std::uint32_t lockTimeout
201  );
     //! Unlocks a URL in the database.
202  bool unLockUrlIfOk(std::uint64_t urlId, const std::string& lockTime);
     //! Unlocks multiple URLs in the database at once.
     // Both arguments are taken by non-const reference.
     // NOTE(review): presumably both are modified by the call - confirm in Database.cpp.
203  void unLockUrlsIfOk(std::queue<IdString>& urls, std::string& lockTime);
204 
208 
     //! Checks the parsing table.
209  std::uint32_t checkParsingTable();
     //! Gets the number of crawled contents stored for a specific URL from the database.
210  [[nodiscard]] std::uint64_t getNumberOfContents(std::uint64_t urlId);
     //! Gets crawled content stored in the database for a specific URL.
     // contentTo and dateTimeTo are non-const references, i.e. output arguments.
211  bool getLatestContent(
212  std::uint64_t urlId,
213  const std::string& lastDateTime,
214  IdString& contentTo,
215  std::string& dateTimeTo
216  );
     //! Gets all crawled contents stored in the database for a specific URL.
217  [[nodiscard]] std::queue<IdString> getAllContents(std::uint64_t urlId);
     //! Gets the latest content ID from a parsed ID.
218  [[nodiscard]] std::uint64_t getContentIdFromParsedId(
219  const std::string& parsedId
220  );
     //! Adds parsed data to the database, or updates data that already exists.
221  void updateOrAddEntries(
222  std::queue<DataEntry>& entries,
223  StatusSetter& statusSetter
224  );
     //! Sets URLs to finished in the database, except those locked by another thread.
225  void setUrlsFinishedIfLockOk(std::queue<IdString>& finished);
     //! Updates the target table.
226  void updateTargetTable();
227 
229 
232 
233  private:
234  // options
     // cache size; starts at defaultCacheSize (Config.hpp)
235  std::uint64_t cacheSize{defaultCacheSize};
     // maximum batch size; starts at defaultMaxBatchSize (Config.hpp)
236  std::uint16_t maxBatchSize{defaultMaxBatchSize};
     // re-parsing is off by default
237  bool reParse{false};
     // parsing of custom URLs is on by default
238  bool parseCustom{true};
239  std::string targetTableName;
240  std::vector<std::string> targetFieldNames;
241 
242  // table names and target table ID
243  std::string urlListTable;
244  std::string parsingTable;
     // value-initialized, i.e. zero until assigned
245  std::uint64_t targetTableId{};
246  std::string targetTableFull;
247 
248  // IDs of prepared SQL statements
     //  (every ID is value-initialized to zero; presumably assigned in
     //  prepare() - confirm in Database.cpp)
249  struct _ps {
250  std::size_t fetchUrls{};
251  std::size_t lockUrl{};
252  std::size_t lock10Urls{};
253  std::size_t lock100Urls{};
254  std::size_t lockMaxUrls{};
255  std::size_t getUrlPosition{};
256  std::size_t getNumberOfUrls{};
257  std::size_t getLockTime{};
258  std::size_t getUrlLockTime{};
259  std::size_t renewUrlLockIfOk{};
260  std::size_t unLockUrlIfOk{};
261  std::size_t checkParsingTable{};
262  std::size_t getNumberOfContents{};
263  std::size_t getLatestContent{};
264  std::size_t getAllContents{};
265  std::size_t getContentIdFromParsedId{};
266  std::size_t updateOrAddEntry{};
267  std::size_t updateOrAdd10Entries{};
268  std::size_t updateOrAdd100Entries{};
269  std::size_t updateOrAddMaxEntries{};
270  std::size_t setUrlFinishedIfLockOk{};
271  std::size_t set10UrlsFinishedIfLockOk{};
272  std::size_t set100UrlsFinishedIfLockOk{};
273  std::size_t setMaxUrlsFinishedIfLockOk{};
274  std::size_t updateTargetTable{};
275  } ps;
276 
277  // internal helper functions
     // NOTE(review): judging by their names, the query...() helpers build the
     //  SQL query strings for the given number of URLs/entries - confirm in
     //  Database.cpp.
278  bool checkEntrySize(DataEntry& entry);
279  [[nodiscard]] std::string queryLockUrls(std::size_t numberOfUrls);
280  [[nodiscard]] std::string queryUpdateOrAddEntries(std::size_t numberOfEntries);
281  [[nodiscard]] std::string querySetUrlsFinishedIfLockOk(std::size_t numberOfUrls);
282  [[nodiscard]] std::string queryUnlockUrlsIfOk(std::size_t numberOfUrls);
283  };
284 
285 } /* namespace crawlservpp::Module::Parser */
286 
287 #endif /* MODULE_PARSER_DATABASE_HPP_ */
std::string renewUrlLockIfOk(std::uint64_t urlId, const std::string &lockTime, std::uint32_t lockTimeout)
Locks a URL in the database, if it is lockable, or extends its locking time, if it is still locked by...
Definition: Database.cpp:954
constexpr auto parsingTableAlias
Alias, used in SQL queries, for the parsing table.
Definition: Database.hpp:112
constexpr auto oneAtOnce
Process one value at once.
Definition: Database.hpp:85
std::uint64_t getNumberOfUrls()
Gets the number of URLs in the URL list.
Definition: Database.cpp:773
void updateTargetTable()
Updates the target table.
Definition: Database.cpp:1839
A data entry containing either parsed or extracted data.
Definition: DataEntry.hpp:45
void setTargetTable(const std::string &table)
Sets the name of the target table.
Definition: Database.cpp:118
constexpr auto sqlArg2
Second argument in a SQL query.
Definition: Database.hpp:97
void setMaxBatchSize(std::uint16_t setMaxBatchSize)
Sets the maximum number of URLs to be processed at once.
Definition: Database.cpp:72
constexpr auto numArgsFinishUrl
Number of arguments for setting one URL to finished.
Definition: Database.hpp:127
Target table properties containing its type, website, URL list, table names, columns, and compression.
Definition: TargetTableProperties.hpp:44
constexpr auto minArsgAddUpdateData
Minimum number of arguments to add or update a data entry.
Definition: Database.hpp:124
Class handling database access for threads.
Definition: Database.hpp:91
Namespace for parser classes.
Definition: Config.hpp:43
#define MAIN_EXCEPTION_CLASS()
Macro used to easily define classes for general exceptions.
Definition: Exception.hpp:50
Class providing database functionality for parser threads by implementing Wrapper::Database.
Definition: Database.hpp:139
constexpr auto minTargetColumns
Minimum number of columns in the target table.
Definition: Database.hpp:118
std::uint32_t checkParsingTable()
Checks the parsing table.
Definition: Database.cpp:1153
void setParseCustom(bool isParseCustom)
Sets whether to parse data from custom URLs.
Definition: Database.cpp:102
bool getLatestContent(std::uint64_t urlId, const std::string &lastDateTime, IdString &contentTo, std::string &dateTimeTo)
Gets crawled content stored in the database for a specific URL.
Definition: Database.cpp:1281
std::uint64_t getNumberOfContents(std::uint64_t urlId)
Gets the number of crawled contents stored for a specific URL from the database.
Definition: Database.cpp:1203
void setReparse(bool isReparse)
Sets whether to re-parse data from already processed URLs.
Definition: Database.cpp:87
Structure for table columns containing its name, type, reference, and indexing.
Definition: TableColumn.hpp:39
constexpr auto sqlArg6
Sixth argument in a SQL query.
Definition: Database.hpp:109
constexpr auto sqlArg3
Third argument in a SQL query.
Definition: Database.hpp:100
Structure containing all the data needed to keep the status of a thread updated.
Definition: StatusSetter.hpp:57
Wrapper class providing the database functionality of Module::Database to its child classes...
Definition: Database.hpp:72
void initTargetTable()
Creates the target table, if it does not exist, or adds target columns needed by the parser...
Definition: Database.cpp:160
constexpr auto maxDateTimeValue
The maximum value of a DATETIME in the database.
Definition: Database.hpp:130
void setTargetFields(const std::vector< std::string > &fields)
Sets the columns of the target table.
Definition: Database.cpp:136
constexpr auto nAtOnce10
Process ten values at once.
Definition: Database.hpp:88
std::string getUrlLockTime(std::uint64_t urlId)
Gets the current lock expiration time for a URL from the database.
Definition: Database.cpp:888
void unLockUrlsIfOk(std::queue< IdString > &urls, std::string &lockTime)
Unlocks multiple URLs in the database at once.
Definition: Database.cpp:1083
constexpr auto numArgsLockUrl
Number of arguments for locking one URL.
Definition: Database.hpp:121
constexpr auto sqlArg5
Fifth argument in a SQL query.
Definition: Database.hpp:106
void updateOrAddEntries(std::queue< DataEntry > &entries, StatusSetter &statusSetter)
Adds parsed data to the database, or updates data that already exists.
Definition: Database.cpp:1490
constexpr auto maxContentSize
Maximum size of database content (= 1 GiB).
Definition: Database.hpp:75
constexpr auto targetTableAlias
Alias, used in SQL queries, for the target table.
Definition: Database.hpp:115
constexpr auto sqlArg1
First argument in a SQL query.
Definition: Database.hpp:94
std::uint64_t getContentIdFromParsedId(const std::string &parsedId)
Gets the latest content ID from a parsed ID.
Definition: Database.cpp:1428
std::string getLockTime(std::uint32_t lockTimeout)
Gets the current URL lock expiration time from the database.
Definition: Database.cpp:832
constexpr std::uint16_t defaultMaxBatchSize
Default maximum number of URLs to be processed in one MySQL query.
Definition: Config.hpp:77
constexpr std::uint64_t defaultCacheSize
Default cache size.
Definition: Config.hpp:71
std::queue< IdString > getAllContents(std::uint64_t urlId)
Gets all crawled contents stored in the database for a specific URL.
Definition: Database.cpp:1363
std::string fetchUrls(std::uint64_t lastId, std::queue< IdString > &cache, std::uint32_t lockTimeout)
Fetches, locks, and adds the next URLs to the cache, i.e. to the caching queue to be processed...
Definition: Database.cpp:585
constexpr auto sqlArg4
Fourth argument in a SQL query.
Definition: Database.hpp:103
void prepare()
Prepares the SQL statements needed by the parser.
Definition: Database.cpp:225
void setCacheSize(std::uint64_t setCacheSize)
Sets the maximum cache size for URLs.
Definition: Database.cpp:62
bool unLockUrlIfOk(std::uint64_t urlId, const std::string &lockTime)
Unlocks a URL in the database.
Definition: Database.cpp:1022
std::uint64_t getUrlPosition(std::uint64_t urlId)
Gets the position of a URL in the URL list.
Definition: Database.cpp:715
constexpr auto nAtOnce100
Process one hundred values at once.
Definition: Database.hpp:91
void setUrlsFinishedIfLockOk(std::queue< IdString > &finished)
Sets URLs to finished in the database, except those locked by another thread.
Definition: Database.cpp:1744
constexpr auto maxContentSizeString
Maximum size of database content as string.
Definition: Database.hpp:78
Database(Module::Database &dbThread)
Constructor setting the database connection for the thread.
Definition: Database.cpp:45