32 #ifndef MODULE_CRAWLER_DATABASE_HPP_ 33 #define MODULE_CRAWLER_DATABASE_HPP_ 37 #include "../../Helper/CommaLocale.hpp" 38 #include "../../Helper/Utf8.hpp" 39 #include "../../Main/Exception.hpp" 40 #include "../../Wrapper/Database.hpp" 41 #include "../../Wrapper/Database.hpp" 43 #include "../../Helper/Portability/mysqlcppconn.h" 45 #include <cppconn/exception.h> 46 #include <cppconn/prepared_statement.h> 47 #include <cppconn/resultset.h> 48 #include <cppconn/statement.h> 49 #include <mysql_connection.h> 58 #include <string_view> 67 using std::string_view_literals::operator
""sv;
/// Pair of an unsigned 64-bit integer and a string (returned by getNextUrl()).
/// NOTE(review): presumably the ID of a URL together with the URL itself — confirm.
121 using IdString = std::pair<std::uint64_t, std::string>;
/// Uniquely owning pointer to a sql::ResultSet.
122 using SqlResultSetPtr = std::unique_ptr<sql::ResultSet>;
/// Gets the ID of a URL from the database.
/// @param url Constant reference to a string containing the URL to look up.
/// @return The ID of the URL; the result must not be discarded.
150 [[nodiscard]] std::uint64_t
getUrlId(
const std::string& url);
/// Gets the ID of the next URL to crawl from the database.
/// @param currentUrlId The ID of the current URL.
/// @return An IdString pair; the result must not be discarded.
/// NOTE(review): presumably the pair holds the ID and the text of the next URL — confirm.
151 [[nodiscard]] IdString
getNextUrl(std::uint64_t currentUrlId);
/// Locks a URL if it is lockable or still locked by the current thread.
/// @param urlId The ID of the URL to lock.
/// @param lockTime Constant reference to a string containing a lock time.
/// @param lockTimeout The lock timeout.
/// @return A string; the result must not be discarded.
/// NOTE(review): presumably lockTime is the previous lock time held by the thread,
///  lockTimeout is given in seconds, and the new lock time is returned — confirm
///  against the implementation.
172 [[nodiscard]] std::string
lockUrlIfOk(std::uint64_t urlId,
const std::string& lockTime, std::uint32_t lockTimeout);
/// Unlocks a URL in the database.
/// @param urlId The ID of the URL to unlock.
/// @param lockTime Constant reference to a string containing a lock time.
/// NOTE(review): presumably the URL is only unlocked if lockTime matches the lock
///  held by the thread — confirm against the implementation.
173 void unLockUrlIfOk(std::uint64_t urlId,
const std::string& lockTime);
182 std::uint32_t response,
183 const std::string& type,
184 const std::string& content
188 const std::string& timeStamp,
189 std::uint32_t response,
190 const std::string& type,
191 const std::string& content);
/// Case-sensitivity option for URLs; initialized to true.
/// NOTE(review): presumably changed via setUrlCaseSensitive() — confirm.
202 bool urlCaseSensitive{
true};
/// URL debugging option; initialized to false.
/// NOTE(review): presumably changed via setUrlDebug() — confirm.
203 bool urlDebug{
false};
/// Startup check option for URLs; initialized to true.
/// NOTE(review): presumably changed via setUrlStartupCheck() — confirm.
204 bool urlStartupCheck{
true};
/// String member for the URL list table.
/// NOTE(review): presumably the name of the table used in SQL queries — confirm.
208 std::string urlListTable;
/// String member for the crawling table.
/// NOTE(review): presumably the name of the table used in SQL queries — confirm.
209 std::string crawlingTable;
/// Value-initialized (zero) std::size_t members, one per database operation.
/// NOTE(review): the enclosing declaration is not visible here; presumably these
///  are the IDs of the prepared SQL statements created by prepare() — confirm.
213 std::size_t getUrlId{};
214 std::size_t getNextUrl{};
215 std::size_t addUrlIfNotExists{};
// NOTE(review): presumably variants adding 10, 100 or the maximum batch size of URLs at once — confirm.
216 std::size_t add10UrlsIfNotExist{};
217 std::size_t add100UrlsIfNotExist{};
218 std::size_t addMaxUrlsIfNotExist{};
219 std::size_t getUrlPosition{};
220 std::size_t getNumberOfUrls{};
221 std::size_t getUrlLockTime{};
222 std::size_t isUrlCrawled{};
223 std::size_t addUrlLockIfOk{};
224 std::size_t renewUrlLockIfOk{};
225 std::size_t unLockUrlIfOk{};
226 std::size_t setUrlFinishedIfOk{};
227 std::size_t saveContent{};
228 std::size_t saveArchivedContent{};
229 std::size_t isArchivedContentExists{};
230 std::size_t urlDuplicationCheck{};
231 std::size_t urlHashCheck{};
232 std::size_t urlHashCorrect{};
233 std::size_t urlEmptyCheck{};
234 std::size_t getUrls{};
235 std::size_t removeDuplicates{};
/// Internal helper returning a string for a given number of URLs and a hash query.
/// @param numberOfUrls The number of URLs.
/// @param hashQuery Constant reference to a string containing the hash query.
/// @return A string.
/// NOTE(review): presumably builds the SQL query that adds numberOfUrls URLs at
///  once if they do not exist yet — confirm against the implementation.
239 std::string queryAddUrlsIfNotExist(std::size_t numberOfUrls,
const std::string& hashQuery);
/// Internal helper returning a queue of strings; the result must not be discarded.
/// NOTE(review): presumably retrieves all URLs of the current URL list — confirm.
240 [[nodiscard]] std::queue<std::string> getUrls();
/// Internal helper taking a URL and returning an unsigned 32-bit integer.
/// @param url Constant reference to a string containing the URL.
/// NOTE(review): presumably removes duplicates of the given URL and returns the
///  number of removed entries — confirm against the implementation.
241 std::uint32_t removeDuplicates(
const std::string& url);
std::uint64_t getNumberOfUrls()
Gets the number of URLs in the current URL list.
Definition: Database.cpp:987
void saveContent(std::uint64_t urlId, std::uint32_t response, const std::string &type, const std::string &content)
Saves crawled content to the database.
Definition: Database.cpp:1634
void unLockUrlIfOk(std::uint64_t urlId, const std::string &lockTime)
Unlocks a URL in the database.
Definition: Database.cpp:1501
constexpr auto maxContentSizeString
Maximum size of database content as string.
Definition: Database.hpp:76
void urlEmptyCheck()
Checks for empty URLs in the current URL list.
Definition: Database.cpp:1175
constexpr auto sqlArg2
Second argument in a SQL query.
Definition: Database.hpp:92
constexpr auto numArgsAddUrl
Number of arguments for adding one URL.
Definition: Database.hpp:110
void urlDuplicationCheck()
Checks the current URL list for duplicates.
Definition: Database.cpp:1045
Class handling database access for threads.
Definition: Database.hpp:91
void setUrlCaseSensitive(bool isUrlCaseSensitive)
Sets whether the current URL list is case-sensitive.
Definition: Database.cpp:87
#define MAIN_EXCEPTION_CLASS()
Macro used to easily define classes for general exceptions.
Definition: Exception.hpp:50
Namespace for crawler classes.
Definition: Config.hpp:44
void saveArchivedContent(std::uint64_t urlId, const std::string &timeStamp, std::uint32_t response, const std::string &type, const std::string &content)
Saves archived content to the database.
Definition: Database.cpp:1737
std::string getUrlLockTime(std::uint64_t urlId)
Gets the time until which a URL has been locked.
Definition: Database.cpp:1283
void urlUtf8Check()
Checks for URLs containing invalid UTF-8 characters in the current URL list.
Definition: Database.cpp:1229
void setUrlFinishedIfOk(std::uint64_t urlId, const std::string &lockTime)
Marks the URL as crawled in the database, if it is still locked by the thread.
Definition: Database.cpp:1563
void setMaxBatchSize(std::uint16_t setMaxBatchSize)
Sets the maximum number of URLs to be processed at once.
Definition: Database.cpp:58
void setRecrawl(bool isRecrawl)
Sets whether all URLs will be recrawled.
Definition: Database.cpp:70
bool addUrlIfNotExists(const std::string &urlString, bool manual)
Adds a URL to the database, if it doesn't exist already.
Definition: Database.cpp:731
Wrapper class providing the database functionality of Module::Database to its child classes...
Definition: Database.hpp:72
Class providing database functionality for crawler threads by implementing Wrapper::Database.
Definition: Database.hpp:119
constexpr auto crawlingTableAlias
Alias, used in SQL queries, for the crawling table.
Definition: Database.hpp:104
constexpr auto sqlArg1
First argument in a SQL query.
Definition: Database.hpp:89
constexpr auto nAtOnce100
Process one hundred values at once.
Definition: Database.hpp:86
bool isArchivedContentExists(std::uint64_t urlId, const std::string &timeStamp)
Checks whether archived content for a URL with a specific timestamp already exists in the database...
Definition: Database.cpp:1839
constexpr auto maxContentSize
Maximum size of database content (= 1 GiB).
Definition: Database.hpp:73
Database(Module::Database &dbThread)
Constructor setting the database connection for the thread.
Definition: Database.cpp:45
constexpr auto nAtOnce10
Process ten values at once.
Definition: Database.hpp:83
void urlHashCheck()
Checks the hash values in the current URL list.
Definition: Database.cpp:1112
std::uint64_t getUrlId(const std::string &url)
Gets the ID of a URL from the database.
Definition: Database.cpp:614
void setUrlStartupCheck(bool isUrlStartupCheck)
Sets whether to check URLs on startup.
Definition: Database.cpp:116
constexpr auto sqlArg5
Fifth argument in a SQL query.
Definition: Database.hpp:101
void prepare()
Prepares the SQL statements for the crawler.
Definition: Database.cpp:133
constexpr auto sqlArg3
Third argument in a SQL query.
Definition: Database.hpp:95
std::string lockUrlIfOk(std::uint64_t urlId, const std::string &lockTime, std::uint32_t lockTimeout)
Locks a URL if it is lockable or still locked by the current thread.
Definition: Database.cpp:1417
std::size_t addUrlsIfNotExist(std::queue< std::string > &urls, bool manual)
Adds URLs to the database, if they do not exist already.
Definition: Database.cpp:802
IdString getNextUrl(std::uint64_t currentUrlId)
Gets the ID of the next URL to crawl from the database.
Definition: Database.cpp:674
bool isUrlCrawled(std::uint64_t urlId)
Gets whether a URL has been crawled.
Definition: Database.cpp:1340
std::uint64_t getUrlPosition(std::uint64_t urlId)
Gets the position of a URL in the current URL list.
Definition: Database.cpp:928
constexpr std::uint16_t defaultMaxBatchSize
Default number of URLs to be processed in one MySQL query.
Definition: Config.hpp:75
void setUrlDebug(bool isUrlDebug)
Sets whether to enable URL debugging.
Definition: Database.cpp:102
constexpr auto urlListTableAlias
Alias, used in SQL queries, for the URL list table.
Definition: Database.hpp:107
constexpr auto sqlArg4
Fourth argument in a SQL query.
Definition: Database.hpp:98