crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
Database.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2022 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * Database.hpp
24  *
25  * This class provides database functionality for analyzer threads
26  * by implementing the Wrapper::Database interface.
27  *
28  * Created on: Oct 22, 2018
29  * Author: ans
30  */
31 
32 #ifndef MODULE_ANALYZER_DATABASE_HPP_
33 #define MODULE_ANALYZER_DATABASE_HPP_
34 
35 #include "Config.hpp"
36 
37 #include "../../Data/Corpus.hpp"
38 #include "../../Data/Data.hpp"
39 #include "../../Helper/CommaLocale.hpp"
40 #include "../../Helper/Container.hpp"
41 #include "../../Helper/Json.hpp"
42 #include "../../Helper/Memory.hpp"
43 #include "../../Helper/Portability/mysqlcppconn.h"
44 #include "../../Main/Exception.hpp"
45 #include "../../Struct/CorpusProperties.hpp"
46 #include "../../Struct/StatusSetter.hpp"
47 #include "../../Struct/TableColumn.hpp"
48 #include "../../Struct/TargetTableProperties.hpp"
49 #include "../../Struct/TextMap.hpp"
50 #include "../../Timer/Simple.hpp"
51 #include "../../Wrapper/Database.hpp"
52 #include "../../Wrapper/DatabaseLock.hpp"
53 
54 #include "../../_extern/rapidjson/include/rapidjson/document.h"
55 
56 #include <cppconn/exception.h>
57 #include <cppconn/prepared_statement.h>
58 #include <cppconn/resultset.h>
59 #include <cppconn/statement.h>
60 #include <mysql_connection.h>
61 
62 #include <algorithm> // std::all_of, std::sort, std::upper_bound
63 #include <cstddef> // std::size_t
64 #include <cstdint> // std::uint8_t, std::uint16_t, std::uint64_t
65 #include <functional> // std::function
66 #include <queue> // std::queue
67 #include <sstream> // std::ostringstream
68 #include <string> // std::string, std::to_string
69 #include <unordered_map> // std::unordered_map
70 #include <utility> // std::move, std::pair
71 #include <vector> // std::vector
72 
74 
75  /*
76  * CONSTANTS
77  */
78 
81 
83  inline constexpr auto defaultCorpusSlicing{30}; //!< The default percentage of the maximum package size allowed by the MySQL server (presumably the default for setCorpusSlicing() - the original description is truncated in this extract).
84 
86  inline constexpr auto corpusSlicingFactor{1.F / 100}; //!< The factor used for corpus slicing percentage points (1/100).
87 
89  inline constexpr auto maxNumCorpusColumns{3}; //!< The maximum number of columns used when creating a text corpus.
90 
92  inline constexpr auto progressDeletedCorpus{0.05F}; //!< The progress with creating a corpus after the old corpus has been deleted.
93 
95  inline constexpr auto progressReceivedSources{0.35F}; //!< The progress with creating a corpus after the source texts have been received.
96 
98  inline constexpr auto progressMovedData{0.4F}; //!< The progress with creating a corpus after the data has been moved.
99 
101  inline constexpr auto progressCreatedCorpus{0.6F}; //!< The progress with creating a corpus after the server created the corpus.
102 
104  inline constexpr auto progressSlicedCorpus{0.65F}; //!< The progress with creating a corpus after the corpus has been sliced.
105 
107  inline constexpr auto progressAddingCorpus{ //!< The remaining progress, attributed to adding the corpus to the database.
     // NOTE(review): the initializer expression (listing line 108) is missing from this extract.
109  };
110 
112  inline constexpr auto progressReceivedCorpus{0.8F}; //!< The progress with getting an existing corpus after its contents have been received from the database.
113 
115  inline constexpr auto progressGeneratedSavePoint{0.1F}; //!< The progress of saving a savepoint after generating it.
116 
118  inline constexpr auto progressSavingSavePoint{ //!< The remaining progress, attributed to saving a savepoint to the database.
     // NOTE(review): the initializer expression (listing line 119) is missing from this extract.
120  };
121 
125 
127  inline constexpr auto sqlArg1{1}; //!< First argument in a SQL query.
128 
130  inline constexpr auto sqlArg2{2}; //!< Second argument in a SQL query.
131 
133  inline constexpr auto sqlArg3{3}; //!< Third argument in a SQL query.
134 
136  inline constexpr auto sqlArg4{4}; //!< Fourth argument in a SQL query.
137 
139  inline constexpr auto sqlArg5{5}; //!< Fifth argument in a SQL query.
140 
142  inline constexpr auto sqlArg6{6}; //!< Sixth argument in a SQL query.
143 
145  inline constexpr auto sqlArg7{7}; //!< Seventh argument in a SQL query.
146 
148  inline constexpr auto sqlArg8{8}; //!< Eighth argument in a SQL query.
149 
151  inline constexpr auto sqlArg9{9}; //!< Ninth argument in a SQL query.
152 
154  inline constexpr auto sqlArg10{10}; //!< Tenth argument in a SQL query.
155 
157  inline constexpr auto sqlArg11{11}; //!< Eleventh argument in a SQL query.
158 
160  inline constexpr auto sqlArg12{12}; //!< Twelfth argument in a SQL query.
161 
165 
167  inline constexpr auto column1{0}; //!< First column in a table (zero-based index).
168 
170  inline constexpr auto column2{1}; //!< Second column in a table.
171 
173  inline constexpr auto column3{2}; //!< Third column in a table.
174 
176  inline constexpr auto numColumns1{1}; //!< One table column.
177 
179  inline constexpr auto numColumns2{2}; //!< Two table columns.
180 
182 
183  /*
184  * DECLARATION
185  */
186 
188  class Database final : public Wrapper::Database { // Class providing database functionality for analyzer threads by implementing Wrapper::Database.
     // NOTE(review): this is a line-numbered listing with gaps (e.g. listing lines 194-197 and 301-304 are absent);
     //  the accompanying symbol index mentions a member `corpusSlicing` (Database.hpp:304) that is not shown here.
189  // for convenience
191 
192  using DataType = Data::Type; //!< Alias for the data types declared in Data.hpp.
193 
198  using TextMap = Struct::TextMap; //!< A text map is defined as a vector of text map entries.
199 
201 
202  using IsRunningCallback = std::function<bool()>; //!< Callback taking no arguments and returning a bool; see setIsRunningCallback().
203  using SentenceMap = std::vector<std::pair<std::size_t, std::size_t>>; //!< Vector of size pairs (presumably sentence positions and lengths - TODO confirm).
204  using SqlResultSetPtr = std::unique_ptr<sql::ResultSet>; //!< Owning pointer to a SQL result set.
205  using StringString = std::pair<std::string, std::string>; //!< Pair of two strings.
206 
207  public:
210 
211  explicit Database(Module::Database& dbThread); //!< Constructor setting the database connection for the thread.
212 
216 
217  void setTargetTable(const std::string& table); //!< Sets the name of the target table.
218  void setTargetFields(const std::vector<StringString>& fields); //!< Sets the fields of the target table and their types.
219  void setCorpusSlicing(std::uint8_t percentageOfMaxAllowedPackageSize); //!< Sets the size of corpus chunks, in percentage of the maximum package size allowed by the MySQL server.
220  void setIsRunningCallback(const IsRunningCallback& isRunningCallback); //!< Sets the callback function for checking whether the thread is still running.
221 
225 
226  void initTargetTable(bool isCompressed, bool isDelete); //!< Creates the target table, or adds the field columns, if they do not exist already.
227  void updateTargetTable(); //!< Updates the target table.
228  [[nodiscard]] std::string getTargetTableUpdated(); //!< Gets the date/time when the target table has been updated last.
229 
233 
234  std::size_t addAdditionalTable( //!< Creates an additional table, or adds its field columns, if they do not exist already.
235  const std::string& name,
236  const std::vector<StringString>& fields,
237  bool isCompressed,
238  bool isDelete
239  );
240  const std::string& getAdditionalTableName(std::size_t id) const; //!< Gets the full name of an additional table.
241  void updateAdditionalTable(std::size_t id); //!< Updates an additional table.
242 
246 
247  void prepare(); //!< Prepares the SQL statements for the analyzer.
248 
252 
253  [[nodiscard]] bool getCorpus( //!< Gets the text corpus after creating it if it is out-of-date or does not yet exist.
254  const CorpusProperties& corpusProperties,
255  const std::string& filterDateFrom,
256  const std::string& filterDateTo,
257  Data::Corpus& corpusTo,
258  std::size_t& sourcesTo,
259  StatusSetter& statusSetter
260  );
261  [[nodiscard]] std::string getCorporaLastUpdated() const; //!< Gets the last update date/time over the sources of all corpora.
262 
266 
267  [[nodiscard]] std::string getSourceTableName(std::uint16_t type, const std::string& name) const; //!< Public helper function getting the full name of a source table.
268  [[nodiscard]] static std::string getSourceColumnName(std::uint16_t type, const std::string& name); //!< Public helper function getting the full name of a source column.
269  void checkSources( //!< Public helper function checking the given data sources.
     // NOTE(review): `types` holds std::uint8_t here, while checkSource() and getSourceTableName() take std::uint16_t - confirm this is intended.
270  std::vector<std::uint8_t>& types,
271  std::vector<std::string>& tables,
272  std::vector<std::string>& columns
273  );
274 
276 
279 
280  protected:
283 
285  std::string tablePrefix; //!< The prefix used for tables in the MySQL database.
286 
288  std::string targetTableName; //!< The name of the target table to be written to.
289 
291  std::uint64_t targetTableId{}; //!< The ID of the target table to be written to.
292 
294  std::string targetTableFull; //!< The full name of the target table to be written to, including prefixes.
295 
297  std::vector<StringString> targetFields; //!< The names and types of the target fields, i.e. the columns in the target table to be written to.
298 
300 
305 
307  std::unordered_map<std::size_t, std::string> additionalTables; //!< The IDs and full names of additional tables to write data to.
308 
309  private:
310  // IDs of prepared SQL statements
311  struct _ps {
     // each member is value-initialized (zero) until assigned; presumably set by prepare() - TODO confirm in Database.cpp
312  std::size_t getCorpusInfo{};
313  std::size_t checkCorpusSavePoint{};
314  std::size_t getCorpusFirst{};
315  std::size_t getCorpusSavePoint{};
316  std::size_t getCorpusNext{};
317  std::size_t isCorpusChanged{};
318  std::size_t isCorpusChangedParsing{};
319  std::size_t isCorpusChangedExtracting{};
320  std::size_t isCorpusChangedAnalyzing{};
321  std::size_t deleteCorpus{};
322  std::size_t addChunkContinuous{};
323  std::size_t addChunkTokenized{};
324  std::size_t measureChunk{};
325  std::size_t measureCorpus{};
326  std::size_t updateTargetTable{};
327  std::size_t getTargetTableUpdated{};
328  std::size_t updateAdditionalTable{};
329  } ps;
330 
331  // function for checking whether the parent thread is still running
332  IsRunningCallback isRunning;
333 
334  // last update date/time over all corpus sources
335  std::string corporaLastUpdated;
336 
337  // internal helper function
     // (presumably checks a single data source on behalf of checkSources() - TODO confirm in Database.cpp)
338  bool checkSource(
339  std::uint16_t type,
340  const std::string& table,
341  const std::string& column
342  );
343 
344  // internal corpus functions
     // (their behaviour is defined in Database.cpp and is not visible here; note that corpusLoad() and
     //  corpusFindSavePoint() take the corpus properties by non-const reference, the others by const reference)
345  [[nodiscard]] bool corpusIsChanged(
346  const CorpusProperties& properties
347  );
348  void corpusCreate(
349  const CorpusProperties& properties,
350  Data::Corpus& corpusTo,
351  std::size_t& sourcesTo,
352  StatusSetter& statusSetter
353  );
354  void corpusLoad(
355  CorpusProperties& properties,
356  Data::Corpus& corpusTo,
357  std::size_t& sourcesTo,
358  StatusSetter& statusSetter
359  );
360  [[nodiscard]] std::string corpusFindSavePoint(
361  CorpusProperties& properties,
362  const std::string& corpusCreationTime
363  );
364  [[nodiscard]] bool corpusManipulate(
365  const CorpusProperties& properties,
366  Data::Corpus& corpusRef,
367  std::size_t numSources,
368  StatusSetter& statusSetter
369  );
370  void corpusSaveSavePoint(
371  const CorpusProperties& properties,
372  const Data::Corpus& corpus,
373  std::size_t numSources,
374  const std::string& savePoint,
375  StatusSetter& statusSetter
376  );
377  };
378 
379 } /* namespace crawlservpp::Module::Analyzer */
380 
381 #endif /* MODULE_ANALYZER_DATABASE_HPP_ */
constexpr auto sqlArg12
Twelfth argument in a SQL query.
Definition: Database.hpp:160
std::string getSourceTableName(std::uint16_t type, const std::string &name) const
Public helper function getting the full name of a source table.
Definition: Database.cpp:1051
std::uint64_t targetTableId
The ID of the target table to be written to.
Definition: Database.hpp:291
constexpr auto column2
Second column in a table.
Definition: Database.hpp:170
Namespace for analyzer classes.
constexpr auto corpusSlicingFactor
The factor used for corpus slicing percentage points (1/100).
Definition: Database.hpp:86
std::string targetTableFull
The full name of the target table to be written to, including prefixes.
Definition: Database.hpp:294
constexpr auto maxNumCorpusColumns
The maximum number of columns used when creating a text corpus.
Definition: Database.hpp:89
std::string getCorporaLastUpdated() const
Gets the last update date/time over the sources of all corpora.
Definition: Database.cpp:1022
void prepare()
Prepares the SQL statements for the analyzer.
Definition: Database.cpp:554
constexpr auto progressCreatedCorpus
The progress with creating a corpus after the server created the corpus.
Definition: Database.hpp:101
constexpr auto sqlArg2
Second argument in a SQL query.
Definition: Database.hpp:130
constexpr auto progressGeneratedSavePoint
The progress of saving a savepoint after generating it.
Definition: Database.hpp:115
constexpr auto progressMovedData
The progress with creating a corpus after the data has been moved.
Definition: Database.hpp:98
std::string targetTableName
The name of the target table to be written to.
Definition: Database.hpp:288
Database(Module::Database &dbThread)
Constructor setting the database connection for the thread.
Definition: Database.cpp:44
constexpr auto sqlArg6
Sixth argument in a SQL query.
Definition: Database.hpp:142
Target table properties containing its type, website, URL list, table names, columns, and compression.
Definition: TargetTableProperties.hpp:44
void setTargetTable(const std::string &table)
Sets the name of the target table.
Definition: Database.cpp:57
Class handling database access for threads.
Definition: Database.hpp:91
std::unordered_map< std::size_t, std::string > additionalTables
The IDs and full names of additional tables to write data to.
Definition: Database.hpp:307
#define MAIN_EXCEPTION_CLASS()
Macro used to easily define classes for general exceptions.
Definition: Exception.hpp:50
Corpus properties containing the type, table, and column name of its source.
Definition: CorpusProperties.hpp:41
Class for JSON exceptions.
Definition: Json.hpp:136
void initTargetTable(bool isCompressed, bool isDelete)
Creates the target table, or adds the field columns, if they do not exist already.
Definition: Database.cpp:136
constexpr auto sqlArg8
Eighth argument in a SQL query.
Definition: Database.hpp:148
constexpr auto defaultCorpusSlicing
The default percentage of the maximum package size allowed by the MySQL server to be used for the max...
Definition: Database.hpp:83
void setTargetFields(const std::vector< StringString > &fields)
Sets the fields of the target table and their types.
Definition: Database.cpp:82
Class representing a text corpus.
Definition: Corpus.hpp:165
bool getCorpus(const CorpusProperties &corpusProperties, const std::string &filterDateFrom, const std::string &filterDateTo, Data::Corpus &corpusTo, std::size_t &sourcesTo, StatusSetter &statusSetter)
Gets the text corpus after creating it if it is out-of-date or does not yet exist.
Definition: Database.cpp:922
Type
Data types.
Definition: Data.hpp:66
constexpr auto progressReceivedSources
The progress with creating a corpus after the source texts have been received.
Definition: Database.hpp:95
constexpr auto numColumns2
Two table columns.
Definition: Database.hpp:179
Structure for a table column containing its name, type, reference, and indexing.
Definition: TableColumn.hpp:39
constexpr auto sqlArg4
Fourth argument in a SQL query.
Definition: Database.hpp:136
Structure containing all the data needed to keep the status of a thread updated.
Definition: StatusSetter.hpp:57
std::string getTargetTableUpdated()
Gets the date/time when the target table has been updated last.
Definition: Database.cpp:278
Wrapper class providing the database functionality of Module::Database to its child classes...
Definition: Database.hpp:72
std::vector< TextMapEntry > TextMap
A text map is defined as a vector of text map entries.
Definition: TextMap.hpp:280
void checkSources(std::vector< std::uint8_t > &types, std::vector< std::string > &tables, std::vector< std::string > &columns)
Public helper function checking the given data sources.
Definition: Database.cpp:1152
std::string tablePrefix
The prefix used for tables in the MySQL database.
Definition: Database.hpp:285
Class providing database functionality for analyzer threads by implementing Wrapper::Database.
Definition: Database.hpp:188
void setIsRunningCallback(const IsRunningCallback &isRunningCallback)
Sets the callback function for checking whether the thread is still running.
Definition: Database.cpp:104
constexpr auto progressDeletedCorpus
The progress with creating a corpus after the old corpus has been deleted.
Definition: Database.hpp:92
constexpr auto sqlArg1
First argument in a SQL query.
Definition: Database.hpp:127
constexpr auto progressAddingCorpus
The remaining progress, attributed to adding the corpus to the database.
Definition: Database.hpp:107
std::vector< StringString > targetFields
The names and types of the target fields, i.e. the columns in the target table to be written to...
Definition: Database.hpp:297
static std::string getSourceColumnName(std::uint16_t type, const std::string &name)
Public helper function getting the full name of a source column.
Definition: Database.cpp:1093
constexpr auto numColumns1
One table column.
Definition: Database.hpp:176
constexpr auto sqlArg9
Ninth argument in a SQL query.
Definition: Database.hpp:151
constexpr auto sqlArg3
Third argument in a SQL query.
Definition: Database.hpp:133
constexpr auto progressReceivedCorpus
The progress with getting an existing corpus after its contents have been received from the database...
Definition: Database.hpp:112
const std::string & getAdditionalTableName(std::size_t id) const
Gets the full name of an additional table.
Definition: Database.cpp:465
constexpr auto column1
First column in a table.
Definition: Database.hpp:167
void updateTargetTable()
Updates the target table.
Definition: Database.cpp:243
constexpr auto column3
Third column in a table.
Definition: Database.hpp:173
void updateAdditionalTable(std::size_t id)
Updates an additional table.
Definition: Database.cpp:500
constexpr auto progressSlicedCorpus
The progress with creating a corpus after the corpus has been sliced.
Definition: Database.hpp:104
constexpr auto sqlArg5
Fifth argument in a SQL query.
Definition: Database.hpp:139
Template class for safe in-scope database locks.
Definition: DatabaseLock.hpp:54
void setCorpusSlicing(std::uint8_t percentageOfMaxAllowedPackageSize)
Sets the size of corpus chunks, in percentage of the maximum package size allowed by the MySQL server...
Definition: Database.cpp:94
constexpr auto sqlArg10
Tenth argument in a SQL query.
Definition: Database.hpp:154
constexpr auto sqlArg7
Seventh argument in a SQL query.
Definition: Database.hpp:145
std::uint8_t corpusSlicing
The maximum size of the text corpus chunks, in percentage of the maximum package size allowed by the ...
Definition: Database.hpp:304
constexpr auto progressSavingSavePoint
The remaining progress, attributed to saving a savepoint to the database.
Definition: Database.hpp:118
std::size_t addAdditionalTable(const std::string &name, const std::vector< StringString > &fields, bool isCompressed, bool isDelete)
Creates an additional table, or adds its field columns, if they do not exist already.
Definition: Database.cpp:348
constexpr auto sqlArg11
Eleventh argument in a SQL query.
Definition: Database.hpp:157