32 #ifndef MODULE_PARSER_DATABASE_HPP_ 33 #define MODULE_PARSER_DATABASE_HPP_ 35 #include "../../Helper/Portability/mysqlcppconn.h" 36 #include "../../Main/Exception.hpp" 37 #include "../../Struct/DataEntry.hpp" 38 #include "../../Struct/StatusSetter.hpp" 39 #include "../../Struct/TableColumn.hpp" 40 #include "../../Struct/TargetTableProperties.hpp" 41 #include "../../Wrapper/Database.hpp" 45 #include <cppconn/exception.h> 46 #include <cppconn/prepared_statement.h> 47 #include <cppconn/resultset.h> 48 #include <cppconn/statement.h> 49 #include <mysql_connection.h> 59 #include <string_view> 69 using std::string_view_literals::operator
""sv;
/// Pair of a 64-bit unsigned ID and a string.
146 using IdString = std::pair<std::uint64_t, std::string>;
/// Owning (unique) pointer to a sql::ResultSet.
147 using SqlResultSetPtr = std::unique_ptr<sql::ResultSet>;
184 std::uint64_t lastId,
185 std::queue<IdString>& cache,
186 std::uint32_t lockTimeout
/// Gets the current URL lock expiration time from the database.
///
/// @param lockTimeout Lock timeout used for the request.
///   NOTE(review): the unit is not visible in this header - confirm.
/// @return The lock expiration time as a string; the result must not be
///   discarded ([[nodiscard]]).
195 [[nodiscard]] std::string
getLockTime(std::uint32_t lockTimeout);
199 const std::string& lockTime,
200 std::uint32_t lockTimeout
/// Unlocks a URL in the database.
///
/// @param urlId ID of the URL to unlock.
/// @param lockTime Lock time associated with the URL.
///   NOTE(review): presumably the lock expiration time returned when the
///   URL was locked - confirm against the implementation.
/// @return NOTE(review): presumably whether the URL has been unlocked -
///   the exact meaning is not visible here, confirm.
202 bool unLockUrlIfOk(std::uint64_t urlId,
const std::string& lockTime);
/// Unlocks multiple URLs in the database at once.
///
/// @param urls Queue of ID/string pairs identifying the URLs. Taken by
///   non-const reference, so the queue may be modified by the call.
/// @param lockTime Lock time of the URLs. Taken by non-const reference,
///   so the string may be modified by the call.
///   NOTE(review): how both arguments are changed is not visible here.
203 void unLockUrlsIfOk(std::queue<IdString>& urls, std::string& lockTime);
213 const std::string& lastDateTime,
215 std::string& dateTimeTo
/// Gets all crawled contents stored in the database for a specific URL.
///
/// @param urlId ID of the URL whose contents are requested.
/// @return Queue of ID/string pairs, one per content; the result must not
///   be discarded ([[nodiscard]]).
///   NOTE(review): presumably content ID and content text - confirm.
217 [[nodiscard]] std::queue<IdString>
getAllContents(std::uint64_t urlId);
219 const std::string& parsedId
222 std::queue<DataEntry>& entries,
// Defaults to true.
// NOTE(review): presumably the flag set via setParseCustom(), i.e. whether
// to parse data from custom URLs - confirm.
238 bool parseCustom{
true};
// NOTE(review): presumably the name passed to setTargetTable() - confirm.
239 std::string targetTableName;
// NOTE(review): presumably the columns passed to setTargetFields() - confirm.
240 std::vector<std::string> targetFieldNames;
// Table-related state.
// NOTE(review): judging by the names, these hold the names of the URL list
// table and the parsing table, and the ID and full name of the target
// table - confirm where they are assigned.
243 std::string urlListTable;
244 std::string parsingTable;
// Value-initialized to zero.
245 std::uint64_t targetTableId{};
246 std::string targetTableFull;
// One std::size_t per database operation, all value-initialized to zero.
// NOTE(review): presumably the IDs of the SQL statements prepared by
// prepare(), with zero meaning "not prepared yet" - confirm.
// NOTE(review): the 1/10/100/Max variants presumably correspond to batch
// sizes (one, ten, one hundred, maximum batch size) - confirm.
250 std::size_t fetchUrls{};
251 std::size_t lockUrl{};
252 std::size_t lock10Urls{};
253 std::size_t lock100Urls{};
254 std::size_t lockMaxUrls{};
255 std::size_t getUrlPosition{};
256 std::size_t getNumberOfUrls{};
257 std::size_t getLockTime{};
258 std::size_t getUrlLockTime{};
259 std::size_t renewUrlLockIfOk{};
260 std::size_t unLockUrlIfOk{};
261 std::size_t checkParsingTable{};
262 std::size_t getNumberOfContents{};
263 std::size_t getLatestContent{};
264 std::size_t getAllContents{};
265 std::size_t getContentIdFromParsedId{};
266 std::size_t updateOrAddEntry{};
267 std::size_t updateOrAdd10Entries{};
268 std::size_t updateOrAdd100Entries{};
269 std::size_t updateOrAddMaxEntries{};
270 std::size_t setUrlFinishedIfLockOk{};
271 std::size_t set10UrlsFinishedIfLockOk{};
272 std::size_t set100UrlsFinishedIfLockOk{};
273 std::size_t setMaxUrlsFinishedIfLockOk{};
274 std::size_t updateTargetTable{};
// Internal helper declarations.
// Takes the entry by non-const reference, so the entry may be modified.
// NOTE(review): presumably checks the entry against the maximum content
// size - the meaning of the returned bool is not visible here, confirm.
278 bool checkEntrySize(DataEntry& entry);
// Each of the following returns a string that must not be discarded.
// NOTE(review): presumably the SQL query text for the given number of
// URLs or entries - confirm against the implementation.
279 [[nodiscard]] std::string queryLockUrls(std::size_t numberOfUrls);
280 [[nodiscard]] std::string queryUpdateOrAddEntries(std::size_t numberOfEntries);
281 [[nodiscard]] std::string querySetUrlsFinishedIfLockOk(std::size_t numberOfUrls);
282 [[nodiscard]] std::string queryUnlockUrlsIfOk(std::size_t numberOfUrls);
std::string renewUrlLockIfOk(std::uint64_t urlId, const std::string &lockTime, std::uint32_t lockTimeout)
Locks a URL in the database, if it is lockable, or extends its locking time, if it is still locked by...
Definition: Database.cpp:954
constexpr auto parsingTableAlias
Alias, used in SQL queries, for the parsing table.
Definition: Database.hpp:112
constexpr auto oneAtOnce
Process one value at once.
Definition: Database.hpp:85
std::uint64_t getNumberOfUrls()
Gets the number of URLs in the URL list.
Definition: Database.cpp:773
void updateTargetTable()
Updates the target table.
Definition: Database.cpp:1839
A data entry containing either parsed or extracted data.
Definition: DataEntry.hpp:45
void setTargetTable(const std::string &table)
Sets the name of the target table.
Definition: Database.cpp:118
constexpr auto sqlArg2
Second argument in a SQL query.
Definition: Database.hpp:97
void setMaxBatchSize(std::uint16_t setMaxBatchSize)
Sets the maximum number of URLs to be processed at once.
Definition: Database.cpp:72
constexpr auto numArgsFinishUrl
Number of arguments for setting one URL to finished.
Definition: Database.hpp:127
Target table properties containing its type, website, URL list, table names, columns, and compression.
Definition: TargetTableProperties.hpp:44
constexpr auto minArsgAddUpdateData
Minimum number of arguments to add or update a data entry.
Definition: Database.hpp:124
Class handling database access for threads.
Definition: Database.hpp:91
Namespace for parser classes.
Definition: Config.hpp:43
#define MAIN_EXCEPTION_CLASS()
Macro used to easily define classes for general exceptions.
Definition: Exception.hpp:50
Class providing database functionality for parser threads by implementing Wrapper::Database.
Definition: Database.hpp:139
constexpr auto minTargetColumns
Minimum number of columns in the target table.
Definition: Database.hpp:118
std::uint32_t checkParsingTable()
Checks the parsing table.
Definition: Database.cpp:1153
void setParseCustom(bool isParseCustom)
Sets whether to parse data from custom URLs.
Definition: Database.cpp:102
bool getLatestContent(std::uint64_t urlId, const std::string &lastDateTime, IdString &contentTo, std::string &dateTimeTo)
Gets crawled content stored in the database for a specific URL.
Definition: Database.cpp:1281
std::uint64_t getNumberOfContents(std::uint64_t urlId)
Gets the number of crawled contents stored for a specific URL from the database.
Definition: Database.cpp:1203
void setReparse(bool isReparse)
Sets whether to re-parse data from already processed URLs.
Definition: Database.cpp:87
Structure for table columns containing its name, type, reference, and indexing.
Definition: TableColumn.hpp:39
constexpr auto sqlArg6
Sixth argument in a SQL query.
Definition: Database.hpp:109
constexpr auto sqlArg3
Third argument in a SQL query.
Definition: Database.hpp:100
Structure containing all the data needed to keep the status of a thread updated.
Definition: StatusSetter.hpp:57
Wrapper class providing the database functionality of Module::Database to its child classes...
Definition: Database.hpp:72
void initTargetTable()
Creates the target table, if it does not exist, or adds target columns needed by the parser...
Definition: Database.cpp:160
constexpr auto maxDateTimeValue
The maximum value of a DATETIME in the database.
Definition: Database.hpp:130
void setTargetFields(const std::vector< std::string > &fields)
Sets the columns of the target table.
Definition: Database.cpp:136
constexpr auto nAtOnce10
Process ten values at once.
Definition: Database.hpp:88
std::string getUrlLockTime(std::uint64_t urlId)
Gets the current lock expiration time for a URL from the database.
Definition: Database.cpp:888
void unLockUrlsIfOk(std::queue< IdString > &urls, std::string &lockTime)
Unlocks multiple URLs in the database at once.
Definition: Database.cpp:1083
constexpr auto numArgsLockUrl
Number of arguments for locking one URL.
Definition: Database.hpp:121
constexpr auto sqlArg5
Fifth argument in a SQL query.
Definition: Database.hpp:106
void updateOrAddEntries(std::queue< DataEntry > &entries, StatusSetter &statusSetter)
Adds parsed data to the database, or updates data that already exists.
Definition: Database.cpp:1490
constexpr auto maxContentSize
Maximum size of database content (= 1 GiB).
Definition: Database.hpp:75
constexpr auto targetTableAlias
Alias, used in SQL queries, for the target table.
Definition: Database.hpp:115
constexpr auto sqlArg1
First argument in a SQL query.
Definition: Database.hpp:94
std::uint64_t getContentIdFromParsedId(const std::string &parsedId)
Gets the latest content ID from a parsed ID.
Definition: Database.cpp:1428
std::string getLockTime(std::uint32_t lockTimeout)
Gets the current URL lock expiration time from the database.
Definition: Database.cpp:832
constexpr std::uint16_t defaultMaxBatchSize
Default maximum number of URLs to be processed in one MySQL query.
Definition: Config.hpp:77
constexpr std::uint64_t defaultCacheSize
Default cache size.
Definition: Config.hpp:71
std::queue< IdString > getAllContents(std::uint64_t urlId)
Gets all crawled contents stored in the database for a specific URL.
Definition: Database.cpp:1363
std::string fetchUrls(std::uint64_t lastId, std::queue< IdString > &cache, std::uint32_t lockTimeout)
Fetches, locks, and adds the next URLs to the cache, i.e. to the caching queue to be processed...
Definition: Database.cpp:585
constexpr auto sqlArg4
Fourth argument in a SQL query.
Definition: Database.hpp:103
void prepare()
Prepares the SQL statements needed by the parser.
Definition: Database.cpp:225
void setCacheSize(std::uint64_t setCacheSize)
Sets the maximum cache size for URLs.
Definition: Database.cpp:62
bool unLockUrlIfOk(std::uint64_t urlId, const std::string &lockTime)
Unlocks a URL in the database.
Definition: Database.cpp:1022
std::uint64_t getUrlPosition(std::uint64_t urlId)
Gets the position of a URL in the URL list.
Definition: Database.cpp:715
constexpr auto nAtOnce100
Process one hundred values at once.
Definition: Database.hpp:91
void setUrlsFinishedIfLockOk(std::queue< IdString > &finished)
Sets URLs to finished in the database, except those locked by another thread.
Definition: Database.cpp:1744
constexpr auto maxContentSizeString
Maximum size of database content as string.
Definition: Database.hpp:78
Database(Module::Database &dbThread)
Constructor setting the database connection for the thread.
Definition: Database.cpp:45