crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
Database.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2022 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * Database.hpp
24  *
25  * This class provides database functionality for a crawler thread
26  * by implementing the Wrapper::Database interface.
27  *
28  * Created on: Oct 22, 2018
29  * Author: ans
30  */
31 
32 #ifndef MODULE_CRAWLER_DATABASE_HPP_
33 #define MODULE_CRAWLER_DATABASE_HPP_
34 
35 #include "Config.hpp"
36 
37 #include "../../Helper/CommaLocale.hpp"
38 #include "../../Helper/Utf8.hpp"
39 #include "../../Main/Exception.hpp"
40 #include "../../Module/Database.hpp"
41 #include "../../Wrapper/Database.hpp"
42 
43 #include "../../Helper/Portability/mysqlcppconn.h"
44 
45 #include <cppconn/exception.h>
46 #include <cppconn/prepared_statement.h>
47 #include <cppconn/resultset.h>
48 #include <cppconn/statement.h>
49 #include <mysql_connection.h>
50 
51 #include <chrono> // std::chrono
52 #include <cstddef> // std::size_t
53 #include <cstdint> // std::uint8_t, std::uint16_t, std::uint32_t, std::uint64_t
54 #include <memory> // std::unique_ptr
55 #include <queue> // std::queue
56 #include <sstream> // std::ostringstream
57 #include <string> // std::string, std::to_string
58 #include <string_view> // std::string_view, std::string_view_literals
59 #include <utility> // std::pair
60 
62 
63  /*
64  * CONSTANTS
65  */
66 
67  using std::string_view_literals::operator""sv; // enables the ""sv suffix used by the string constants below
68 
71  // limits for content stored in the database
73  inline constexpr auto maxContentSize{1073741824}; // maximum size of database content in bytes (2^30 = 1 GiB)
74 
76  inline constexpr auto maxContentSizeString{"1 GiB"sv}; // maximum size of database content as string; keep in sync with maxContentSize
77 
81  // batch sizes, SQL argument positions and table aliases
83  inline constexpr auto nAtOnce10{10}; // process ten values at once
84 
86  inline constexpr auto nAtOnce100{100}; // process one hundred values at once
87 
89  inline constexpr auto sqlArg1{1}; // first argument in a SQL query
90 
92  inline constexpr auto sqlArg2{2}; // second argument in a SQL query
93 
95  inline constexpr auto sqlArg3{3}; // third argument in a SQL query
96 
98  inline constexpr auto sqlArg4{4}; // fourth argument in a SQL query
99 
101  inline constexpr auto sqlArg5{5}; // fifth argument in a SQL query
102 
104  inline constexpr auto crawlingTableAlias{"a"sv}; // alias, used in SQL queries, for the crawling table
105 
107  inline constexpr auto urlListTableAlias{"b"sv}; // alias, used in SQL queries, for the URL list table
108 
110  inline constexpr auto numArgsAddUrl{5}; // number of SQL arguments needed for adding one URL
111 
113 
114  /*
115  * DECLARATION
116  */
117 
119  class Database final : public Wrapper::Database { // database functionality for a crawler thread, implementing the Wrapper::Database interface
120  // for convenience: shorthand type aliases used by the declarations below
121  using IdString = std::pair<std::uint64_t, std::string>; // pair of an ID and a string (returned by getNextUrl)
122  using SqlResultSetPtr = std::unique_ptr<sql::ResultSet>; // owning pointer to a SQL result set
123 
124  public:
127  // construction
128  explicit Database(Module::Database& dbThread); // sets the database connection for the thread
129 
133  // setters for the crawler-specific options
134  void setMaxBatchSize(std::uint16_t setMaxBatchSize); // sets the maximum number of URLs to be processed at once
135  void setRecrawl(bool isRecrawl); // sets whether all URLs will be recrawled
136  void setUrlCaseSensitive(bool isUrlCaseSensitive); // sets whether the current URL list is case-sensitive
137  void setUrlDebug(bool isUrlDebug); // sets whether to enable URL debugging
138  void setUrlStartupCheck(bool isUrlStartupCheck); // sets whether to check URLs on startup
139 
143  // preparation of SQL statements
144  void prepare(); // prepares the SQL statements for the crawler
145 
149  // URLs
150  [[nodiscard]] std::uint64_t getUrlId(const std::string& url); // gets the ID of a URL from the database
151  [[nodiscard]] IdString getNextUrl(std::uint64_t currentUrlId); // gets the ID of the next URL to crawl from the database (returned together with a string, presumably the URL itself -- confirm in Database.cpp)
152  bool addUrlIfNotExists(const std::string& urlString, bool manual); // adds a URL to the database, if it does not exist already
153  std::size_t addUrlsIfNotExist(std::queue<std::string>& urls, bool manual); // adds URLs to the database, if they do not exist already; takes the queue by non-const reference (NOTE(review): presumably consumes it and returns the number of URLs added -- confirm)
154  [[nodiscard]] std::uint64_t getUrlPosition(std::uint64_t urlId); // gets the position of a URL in the current URL list
155  [[nodiscard]] std::uint64_t getNumberOfUrls(); // gets the number of URLs in the current URL list
156 
160  // URL checks
161  void urlDuplicationCheck(); // checks the current URL list for duplicates
162  void urlHashCheck(); // checks the hash values in the current URL list
163  void urlEmptyCheck(); // checks for empty URLs in the current URL list
164  void urlUtf8Check(); // checks for URLs containing invalid UTF-8 characters in the current URL list
165 
169  // URL locking
170  [[nodiscard]] std::string getUrlLockTime(std::uint64_t urlId); // gets the time until which a URL has been locked
171  [[nodiscard]] bool isUrlCrawled(std::uint64_t urlId); // gets whether a URL has been crawled
172  [[nodiscard]] std::string lockUrlIfOk(std::uint64_t urlId, const std::string& lockTime, std::uint32_t lockTimeout); // locks a URL if it is lockable or still locked by the current thread (NOTE(review): the returned string is presumably the new lock time -- confirm)
173  void unLockUrlIfOk(std::uint64_t urlId, const std::string& lockTime); // unlocks a URL in the database
174  void setUrlFinishedIfOk(std::uint64_t urlId, const std::string& lockTime); // sets the URL to crawled in the database, if it is still locked by the thread
175 
179  // crawled content
180  void saveContent( // saves crawled content to the database
181  std::uint64_t urlId,
182  std::uint32_t response,
183  const std::string& type,
184  const std::string& content
185  );
186  void saveArchivedContent( // saves archived content to the database
187  std::uint64_t urlId,
188  const std::string& timeStamp,
189  std::uint32_t response,
190  const std::string& type,
191  const std::string& content);
192  [[nodiscard]] bool isArchivedContentExists(std::uint64_t urlId, const std::string& timeStamp); // checks whether archived content for a URL with a specific timestamp already exists in the database
193 
195 
198 
199  private:
200  // options (NOTE(review): presumably written by the correspondingly named setters above -- confirm in Database.cpp)
201  bool recrawl{false}; // defaults to false
202  bool urlCaseSensitive{true}; // defaults to true
203  bool urlDebug{false}; // defaults to false
204  bool urlStartupCheck{true}; // defaults to true
205  std::uint16_t maxBatchSize{defaultMaxBatchSize}; // defaults to defaultMaxBatchSize, which is declared outside this header
206 
207  // table names (default-constructed, i.e. empty, until assigned elsewhere)
208  std::string urlListTable;
209  std::string crawlingTable;
210 
211  // IDs of prepared SQL statements, one per query; each is value-initialized to zero
212  struct _ps {
213  std::size_t getUrlId{};
214  std::size_t getNextUrl{};
215  std::size_t addUrlIfNotExists{};
216  std::size_t add10UrlsIfNotExist{};
217  std::size_t add100UrlsIfNotExist{};
218  std::size_t addMaxUrlsIfNotExist{};
219  std::size_t getUrlPosition{};
220  std::size_t getNumberOfUrls{};
221  std::size_t getUrlLockTime{};
222  std::size_t isUrlCrawled{};
223  std::size_t addUrlLockIfOk{};
224  std::size_t renewUrlLockIfOk{};
225  std::size_t unLockUrlIfOk{};
226  std::size_t setUrlFinishedIfOk{};
227  std::size_t saveContent{};
228  std::size_t saveArchivedContent{};
229  std::size_t isArchivedContentExists{};
230  std::size_t urlDuplicationCheck{};
231  std::size_t urlHashCheck{};
232  std::size_t urlHashCorrect{};
233  std::size_t urlEmptyCheck{};
234  std::size_t getUrls{};
235  std::size_t removeDuplicates{};
236  } ps;
237 
238  // internal helper functions (defined outside this header; the descriptions below are inferred from the signatures only)
239  std::string queryAddUrlsIfNotExist(std::size_t numberOfUrls, const std::string& hashQuery); // NOTE(review): presumably builds the SQL query text for adding numberOfUrls URLs at once -- confirm
240  [[nodiscard]] std::queue<std::string> getUrls(); // returns a queue of strings, presumably the URLs of the current URL list -- confirm
241  std::uint32_t removeDuplicates(const std::string& url); // NOTE(review): presumably returns the number of duplicates removed for the given URL -- confirm
242  };
243 
244 } /* namespace crawlservpp::Module::Crawler */
245 
246 #endif /* MODULE_CRAWLER_DATABASE_HPP_ */
std::uint64_t getNumberOfUrls()
Gets the number of URLs in the current URL list.
Definition: Database.cpp:987
void saveContent(std::uint64_t urlId, std::uint32_t response, const std::string &type, const std::string &content)
Saves crawled content to the database.
Definition: Database.cpp:1634
void unLockUrlIfOk(std::uint64_t urlId, const std::string &lockTime)
Unlocks a URL in the database.
Definition: Database.cpp:1501
constexpr auto maxContentSizeString
Maximum size of database content as string.
Definition: Database.hpp:76
void urlEmptyCheck()
Checks for empty URLs in the current URL list.
Definition: Database.cpp:1175
constexpr auto sqlArg2
Second argument in a SQL query.
Definition: Database.hpp:92
constexpr auto numArgsAddUrl
Number of arguments for adding one URL.
Definition: Database.hpp:110
void urlDuplicationCheck()
Checks the current URL list for duplicates.
Definition: Database.cpp:1045
Class handling database access for threads.
Definition: Database.hpp:91
void setUrlCaseSensitive(bool isUrlCaseSensitive)
Sets whether the current URL list is case-sensitive.
Definition: Database.cpp:87
#define MAIN_EXCEPTION_CLASS()
Macro used to easily define classes for general exceptions.
Definition: Exception.hpp:50
Namespace for crawler classes.
Definition: Config.hpp:44
void saveArchivedContent(std::uint64_t urlId, const std::string &timeStamp, std::uint32_t response, const std::string &type, const std::string &content)
Saves archived content to the database.
Definition: Database.cpp:1737
std::string getUrlLockTime(std::uint64_t urlId)
Gets the time until which a URL has been locked.
Definition: Database.cpp:1283
void urlUtf8Check()
Checks for URLs containing invalid UTF-8 characters in the current URL list.
Definition: Database.cpp:1229
void setUrlFinishedIfOk(std::uint64_t urlId, const std::string &lockTime)
Sets the URL to crawled in the database, if it is still locked by the thread.
Definition: Database.cpp:1563
void setMaxBatchSize(std::uint16_t setMaxBatchSize)
Sets the maximum number of URLs to be processed at once.
Definition: Database.cpp:58
void setRecrawl(bool isRecrawl)
Sets whether all URLs will be recrawled.
Definition: Database.cpp:70
bool addUrlIfNotExists(const std::string &urlString, bool manual)
Adds a URL to the database, if it doesn't exist already.
Definition: Database.cpp:731
Wrapper class providing the database functionality of Module::Database to its child classes...
Definition: Database.hpp:72
Class providing database functionality for crawler threads by implementing Wrapper::Database.
Definition: Database.hpp:119
constexpr auto crawlingTableAlias
Alias, used in SQL queries, for the crawling table.
Definition: Database.hpp:104
constexpr auto sqlArg1
First argument in a SQL query.
Definition: Database.hpp:89
constexpr auto nAtOnce100
Process one hundred values at once.
Definition: Database.hpp:86
bool isArchivedContentExists(std::uint64_t urlId, const std::string &timeStamp)
Checks whether archived content for a URL with a specific timestamp already exists in the database...
Definition: Database.cpp:1839
constexpr auto maxContentSize
Maximum size of database content (= 1 GiB).
Definition: Database.hpp:73
Database(Module::Database &dbThread)
Constructor setting the database connection for the thread.
Definition: Database.cpp:45
constexpr auto nAtOnce10
Process ten values at once.
Definition: Database.hpp:83
void urlHashCheck()
Checks the hash values in the current URL list.
Definition: Database.cpp:1112
std::uint64_t getUrlId(const std::string &url)
Gets the ID of a URL from the database.
Definition: Database.cpp:614
void setUrlStartupCheck(bool isUrlStartupCheck)
Sets whether to check URLs on startup.
Definition: Database.cpp:116
constexpr auto sqlArg5
Fifth argument in a SQL query.
Definition: Database.hpp:101
void prepare()
Prepares the SQL statements for the crawler.
Definition: Database.cpp:133
constexpr auto sqlArg3
Third argument in a SQL query.
Definition: Database.hpp:95
std::string lockUrlIfOk(std::uint64_t urlId, const std::string &lockTime, std::uint32_t lockTimeout)
Locks a URL if it is lockable or still locked by the current thread.
Definition: Database.cpp:1417
std::size_t addUrlsIfNotExist(std::queue< std::string > &urls, bool manual)
Adds URLs to the database, if they do not exist already.
Definition: Database.cpp:802
IdString getNextUrl(std::uint64_t currentUrlId)
Gets the ID of the next URL to crawl from the database.
Definition: Database.cpp:674
bool isUrlCrawled(std::uint64_t urlId)
Gets whether a URL has been crawled.
Definition: Database.cpp:1340
std::uint64_t getUrlPosition(std::uint64_t urlId)
Gets the position of a URL in the current URL list.
Definition: Database.cpp:928
constexpr std::uint16_t defaultMaxBatchSize
Default number of URLs to be processed in one MySQL query.
Definition: Config.hpp:75
void setUrlDebug(bool isUrlDebug)
Sets whether to enable URL debugging.
Definition: Database.cpp:102
constexpr auto urlListTableAlias
Alias, used in SQL queries, for the URL list table.
Definition: Database.hpp:107
constexpr auto sqlArg4
Fourth argument in a SQL query.
Definition: Database.hpp:98