38 #ifndef MAIN_SERVER_HPP_ 39 #define MAIN_SERVER_HPP_ 60 #define MAIN_SERVER_CMD(X, Y) if(name == (X)) { \ 69 #define MAIN_SERVER_WORKER_BEGIN try { 71 #define MAIN_SERVER_WORKER_END(X) } \ 72 catch(const std::exception& e) { \ 73 (X) = ServerCommandResponse::failed( \ 86 #include "../Data/Compression/Gzip.hpp" 87 #include "../Data/Compression/Zip.hpp" 88 #include "../Data/Compression/Zlib.hpp" 89 #include "../Data/Corpus.hpp" 90 #include "../Data/File.hpp" 91 #include "../Data/ImportExport/OpenDocument.hpp" 92 #include "../Data/ImportExport/Text.hpp" 93 #include "../Helper/CommaLocale.hpp" 94 #include "../Helper/DateTime.hpp" 95 #include "../Helper/FileSystem.hpp" 96 #include "../Helper/Json.hpp" 97 #include "../Helper/Memory.hpp" 98 #include "../Helper/Strings.hpp" 99 #include "../Module/Analyzer/Algo/All.hpp" 100 #include "../Module/Crawler/Thread.hpp" 101 #include "../Module/Extractor/Thread.hpp" 102 #include "../Module/Parser/Thread.hpp" 103 #include "../Module/Thread.hpp" 104 #include "../Query/JsonPath.hpp" 105 #include "../Query/JsonPointer.hpp" 106 #include "../Query/RegEx.hpp" 107 #include "../Query/XPath.hpp" 108 #include "../Struct/AlgoThreadProperties.hpp" 109 #include "../Struct/ConfigProperties.hpp" 110 #include "../Struct/NetworkSettings.hpp" 111 #include "../Struct/QueryProperties.hpp" 112 #include "../Struct/ServerCommandResponse.hpp" 113 #include "../Struct/ServerSettings.hpp" 114 #include "../Struct/ThreadDatabaseEntry.hpp" 115 #include "../Struct/ThreadOptions.hpp" 116 #include "../Struct/UrlListProperties.hpp" 117 #include "../Struct/WebsiteProperties.hpp" 118 #include "../Timer/SimpleHR.hpp" 119 #include "../Wrapper/Database.hpp" 121 #include "../_extern/jsoncons/include/jsoncons/json.hpp" 122 #include "../_extern/jsoncons/include/jsoncons_ext/jsonpath/json_query.hpp" 123 #include "../_extern/rapidjson/include/rapidjson/document.h" 124 #include "../_extern/rapidjson/include/rapidjson/prettywriter.h" 126 #include <boost/lexical_cast.hpp> 140 
#include <string_view> 148 using std::string_view_literals::operator
""sv;
// Alias for a pointer to a mg_connection.
// NOTE(review): presumably the connection handle of the embedded mongoose web server — confirm.
269 using ConnectionPtr = mg_connection *;
// Same as ConnectionPtr, but pointing to a const mg_connection (read-only access to the connection).
270 using ConstConnectionPtr =
const mg_connection *;
// A pair of two strings.
271 using StringString = std::pair<std::string, std::string>;
// A list of entries, each pairing one string with a list of string pairs.
// NOTE(review): the meaning of the individual components is not visible here — confirm against the users of this alias.
272 using Queries = std::vector<std::pair<std::string, std::vector<StringString>>>;
// Gets the status of the server.
// Returns a constant reference to the status string; the result must not be discarded.
289 [[nodiscard]]
const std::string&
getStatus()
const;
// Gets the up-time of the server in seconds.
// Returns the up-time as a signed 64-bit integer; the result must not be discarded.
290 [[nodiscard]] std::int64_t
getUpTime()
const;
// Time point on the steady (monotonic) clock.
// NOTE(review): presumably the reference point from which getUpTime() is calculated — confirm in the implementation.
333 std::chrono::steady_clock::time_point uptimeStart;
// Module threads, held via owning pointers in one vector per module type
// (crawler, parser, extractor, and analyzer).
337 std::vector<std::unique_ptr<Module::Crawler::Thread>> crawlers;
338 std::vector<std::unique_ptr<Module::Parser::Thread>> parsers;
339 std::vector<std::unique_ptr<Module::Extractor::Thread>> extractors;
340 std::vector<std::unique_ptr<Module::Analyzer::Thread>> analyzers;
// Worker threads.
341 std::vector<std::thread> workers;
// One boolean flag per entry.
// NOTE(review): presumably indexed in parallel with `workers` — confirm.
342 std::vector<bool> workersRunning;
// Mutex declared mutable, so that it can also be locked inside const member functions.
// NOTE(review): presumably guards `workers` and `workersRunning` — confirm.
343 mutable std::mutex workersLock;
// JSON document.
// NOTE(review): presumably holds the parsed JSON of the command currently being handled — confirm.
346 rapidjson::Document cmdJson;
// Sets the status of the server to the given status message.
358 void setStatus(
const std::string& statusMsg);
// Handler receiving the connection concerned.
// NOTE(review): presumably invoked by the web server when a connection is accepted — confirm.
361 void onAccept(ConnectionPtr connection);
363 ConnectionPtr connection,
364 const std::string& method,
365 const std::string& body,
370 [[nodiscard]] std::string cmd(
371 ConnectionPtr connection,
372 const std::string& msgBody,
373 bool& threadStartedTo,
// Command handlers sharing one signature: the connection the command arrived on,
// an index named threadIndex, and the message of the command.
// NOTE(review): presumably executed inside the worker threads, with threadIndex
// identifying the worker — confirm in the implementation.
// Import command.
438 void cmdImport(ConnectionPtr connection, std::size_t threadIndex,
const std::string& message);
// Merge command.
439 void cmdMerge(ConnectionPtr connection, std::size_t threadIndex,
const std::string& message);
// Export command.
440 void cmdExport(ConnectionPtr connection, std::size_t threadIndex,
const std::string& message);
// Command for deleting URLs.
442 void cmdDeleteUrls(ConnectionPtr connection, std::size_t threadIndex,
const std::string& message);
// Command for testing a query.
444 void cmdTestQuery(ConnectionPtr connection, std::size_t threadIndex,
const std::string& message);
// Initialization helpers.
// NOTE(review): presumably called once while the server is being constructed — confirm.
// Initializes the cache directory (static, needs no server instance).
447 static void initCacheDir();
// Initializes the given directory (static, needs no server instance).
448 static void initDir(std::string_view directory);
// Initializes the given debugging directory, depending on whether debugging is enabled.
449 static void initDebuggingDir(
bool isEnabled, std::string_view directory);
// Initializes the database.
// NOTE(review): sleepOnSqlErrorS is presumably a duration in seconds — confirm.
450 void initDatabase(std::uint16_t sleepOnSqlErrorS);
// Initializes the callbacks.
451 void initCallbacks();
// Initializes the web server using the given port, which is passed as a string.
452 void initWebServer(
const std::string& port);
// Initializes the logging at start-up.
454 void initStartLogging();
// Clean-up helpers.
// NOTE(review): presumably called while the server is being destroyed — confirm.
// Clears the module threads.
457 void clearModuleThreads();
// Clears the worker threads.
458 void clearWorkerThreads();
// NOTE(review): presumably writes the log entries concerning the shutdown — confirm.
459 void clearLogShutdown();
// Helpers named after the individual steps of a server tick.
// NOTE(review): presumably called from tick() — confirm order and conditions in the implementation.
// Polls the web server.
462 void tickPollWebServer();
// Removes module threads that have finished.
463 void tickRemoveFinishedModuleThreads();
// Removes worker threads that have finished.
464 void tickRemoveFinishedWorkerThreads();
// Reconnects if offline.
// NOTE(review): presumably refers to the database connection — confirm.
465 void tickReconnectIfOffline();
// Gets the IP address belonging to the given (read-only) connection and returns it as a string.
// NOTE(review): `function` is presumably the name of the calling function, used for error reporting — confirm.
470 static std::string getIp(ConstConnectionPtr connection, std::string_view
function);
477 const std::string& name,
481 std::string& outError
484 const std::string& name,
486 std::string& outError
489 const std::string& name,
492 std::string& outError
496 std::size_t threadIndex,
497 ConnectionPtr connection,
498 const std::string& message,
505 [[nodiscard]]
static bool workerBegin(
506 const std::string& message,
507 rapidjson::Document& json,
511 static bool getArgument(
512 const rapidjson::Document& json,
513 const std::string& name,
517 std::string& outError
519 static bool getArgument(
520 const rapidjson::Document& json,
521 const std::string& name,
523 std::string& outError
525 static bool getArgument(
526 const rapidjson::Document& json,
527 const std::string& name,
530 std::string& outError
// Corrects a domain in place: the string is passed by non-const reference and nothing is returned.
// NOTE(review): the exact corrections applied are not visible here — see the implementation.
533 static void correctDomain(std::string& inOut);
535 [[nodiscard]]
static bool checkNameSpace(
536 const std::string& name,
540 [[nodiscard]]
static std::uint32_t getAlgoFromConfig(
541 const rapidjson::Document& json
544 [[nodiscard]]
static std::string generateReply(
546 const std::string& msgBody
549 [[nodiscard]]
static std::string dateTimeTest(
550 const std::string& input,
551 const std::string& format,
552 const std::string& locale
555 static bool cmdExportGetArguments(
556 const rapidjson::Document& json,
557 std::string& dataTypeTo,
558 std::string& fileTypeTo,
559 std::string& compressionTo,
562 static bool cmdExportRetrieveAndConvert(
563 const rapidjson::Document& json,
564 const std::string& dataType,
565 const std::string& fileType,
567 std::string& contentTo,
570 static bool cmdExportCompress(
571 const std::string& dataType,
572 const std::string& compression,
573 std::string contentInOut,
576 static void cmdExportWrite(
577 const std::string& content,
580 static void cmdExportLogSuccess(
583 const std::string& timeString
585 static bool cmdExportRetrieveUrlList(
586 const rapidjson::Document& json,
588 std::queue<std::string>& urlsTo,
591 static bool cmdExportRetrieveTable(
592 std::string_view type,
593 const rapidjson::Document& json,
596 std::vector<std::vector<std::string>>& contentTo,
597 bool& isColumnNamesTo,
600 static bool cmdExportRetrieveCorpus(
601 const rapidjson::Document& json,
603 std::queue<std::string>& urlsTo,
606 static bool cmdExportGetUrlListArguments(
607 const rapidjson::Document& json,
608 std::uint64_t& websiteTo,
609 std::uint64_t& urlListTo,
612 static bool cmdExportGetTableArguments(
613 const rapidjson::Document& json,
614 std::uint64_t& websiteTo,
615 std::uint64_t& urlListTo,
616 std::uint64_t& sourceTableTo,
617 bool& isColumnNamesTo,
620 static bool cmdExportGetCorpusArguments(
621 const rapidjson::Document& json,
622 std::uint64_t& corpusTo,
626 static bool cmdExportCheckWebsiteUrlList(
628 std::uint64_t websiteId,
629 std::uint64_t urlListId,
632 static bool cmdExportCheckTargetTable(
634 std::string_view dataType,
635 std::uint64_t websiteId,
636 std::uint64_t urlListId,
637 std::uint64_t tableId,
640 static bool cmdExportCheckCorpus(
642 std::uint64_t firstId,
643 std::string_view what,
646 static void cmdExportGetTableContent(
648 std::string_view dataType,
649 std::uint64_t websiteId,
650 std::uint64_t urlListId,
651 std::uint64_t tableId,
653 std::vector<std::vector<std::string>>& contentTo,
654 bool isIncludeColumnNames
656 [[nodiscard]]
static std::string cmdExportGetCorpus(
658 std::uint64_t firstChunkId
660 [[nodiscard]]
static std::queue<std::string> cmdExportGetKeysFromCorpusMaps(
662 std::uint64_t firstChunkId,
663 std::string_view what
665 static void cmdExportGetKeysFromCorpusMap(
666 const std::string& map,
667 std::queue<std::string>& appendKeysTo
669 static void cmdExportRemoveColumnPrefixes(
670 std::string_view type,
671 std::vector<std::vector<std::string>>& content
673 static void cmdExportLog(
675 std::string_view entryType,
676 std::string_view entryTypes,
677 std::string_view listType,
678 std::uint64_t entryNum
680 static bool cmdExportListAsText(
681 const rapidjson::Document& json,
682 std::queue<std::string>& data,
683 std::string& contentTo,
686 static bool cmdExportListAsSpreadsheet(
687 const rapidjson::Document& json,
688 std::queue<std::string>& data,
689 std::string& contentTo,
692 static void cmdExportTableAsSpreadsheet(
693 const std::string& tableName,
694 const std::vector<std::vector<std::string>>& tableContent,
695 std::string& contentTo,
698 static bool cmdExportGetFirstLineHeader(
699 const rapidjson::Document& json,
700 std::optional<std::string>& optHeaderTo,
703 static bool cmdDeleteUrlsGetArguments(
704 const rapidjson::Document& json,
705 std::uint64_t& urlListTo,
706 std::uint64_t& queryTo,
709 static bool cmdDeleteUrlsGetWebsite(
711 std::uint64_t urlList,
712 std::uint64_t& websiteTo,
715 static bool cmdDeleteUrlsGetQuery(
718 std::string& regExTo,
721 static bool cmdDeleteUrlsGetUrls(
723 std::uint64_t urlList,
724 const std::string& regEx,
725 std::queue<std::uint64_t>& toDeleteTo,
730 std::uint64_t urlList,
731 std::queue<std::uint64_t>& toDelete
736 template<
typename T>
static void interruptModuleThreads(
737 std::vector<std::unique_ptr<T>>& threads
739 for(
auto& thread : threads) {
740 thread->Module::Thread::interrupt();
744 template<
typename T>
static void waitForModuleThreads(
745 std::vector<std::unique_ptr<T>>& threads,
746 std::string_view moduleName,
747 std::queue<std::string>& logEntriesTo
749 for(
auto& thread : threads) {
752 const auto id{thread->getId()};
755 thread->Module::Thread::end();
758 std::string logString{moduleName};
761 logString += std::to_string(
id);
762 logString +=
" interrupted.";
764 logEntriesTo.emplace(logString);
771 template<
typename T>
static std::size_t countModuleThreads(
772 const std::vector<std::unique_ptr<T>>& threads
774 return std::count_if(
777 [](
const auto& thread) {
778 return thread->isRunning();
783 template<
typename T>
static void removeFinishedModuleThreads(
784 std::vector<std::unique_ptr<T>>& threads
791 if(crawler->isShutdown() && crawler->isFinished()) {
792 crawler->Module::Thread::end();
804 template<
typename T>
void continueModuleThread(
806 std::vector<std::unique_ptr<T>>& to
818 to.back()->Module::Thread::start();
830 template<
typename DB>
static bool checkWebsite(
832 std::uint64_t website,
835 if(db.isWebsite(website)) {
841 + std::to_string(website)
848 template<
typename DB>
static bool checkUrlList(
850 std::uint64_t urlList,
853 if(db.isUrlList(urlList)) {
859 + std::to_string(urlList)
866 template<
typename DB>
static bool checkUrlList(
868 std::uint64_t website,
869 std::uint64_t urlList,
872 if(db.isUrlList(website, urlList)) {
878 + std::to_string(urlList)
880 + std::to_string(website)
887 template<
typename DB>
static bool checkQuery(
889 std::uint64_t queryId,
892 if(db.isQuery(queryId)) {
898 + std::to_string(queryId)
905 template<
typename DB>
static bool checkQuery(
907 std::uint64_t website,
908 std::uint64_t queryId,
911 if(db.isQuery(website, queryId)) {
917 + std::to_string(queryId)
919 + std::to_string(website)
926 template<
typename DB>
static bool checkCorpus(
928 std::uint64_t firstId,
929 bool requireArticleMap,
933 if(db.isCorpus(firstId, requireArticleMap, requireDateMap)) {
937 auto error{
"Corpus starting at #" + std::to_string(firstId)};
939 if(requireArticleMap && db.isCorpus(firstId,
false,
false)) {
940 error +=
" does not include articles.";
942 else if(requireDateMap && db.isCorpus(firstId,
false,
false)) {
943 error +=
" does not include dates.";
946 error +=
" not found.";
Class for storage engine exceptions.
Definition: Database.hpp:755
Class for XPath exceptions.
Definition: XPath.hpp:104
constexpr auto mdlDir
The name of the (sub-)directory for language models.
Definition: Server.hpp:173
Query properties containing its name, text, type, and result type(s).
Definition: QueryProperties.hpp:39
Class for JSONPointer exceptions.
Definition: JsonPointer.hpp:97
Response from the command-and-control server.
Definition: ServerCommandResponse.hpp:40
Class for insufficient privileges exceptions.
Definition: Database.hpp:758
virtual ~Server()
Destructor interrupting and waiting for all threads.
Definition: Server.cpp:91
constexpr auto minNameSpaceLengthString
The minimum length of namespaces, as string.
Definition: Server.hpp:197
Server(const ServerSettings &serverSettings, const DatabaseSettings &databaseSettings, const NetworkSettings &networkSettings)
Constructor setting server, database, and network settings.
Definition: Server.cpp:64
constexpr auto minNameSpaceLength
The minimum length of namespaces.
Definition: Server.hpp:194
Server & operator=(Server &)=delete
Deleted copy assignment operator.
ThreadOptions options
Options for the thread.
Definition: ThreadDatabaseEntry.hpp:45
constexpr auto dataTypeColumnNameSeparatorLong
The (long) separator used between data type and column name.
Definition: Server.hpp:212
constexpr auto dataTypeColumnNameSeparatorShort
The (short) separator used between data type and column name.
Definition: Server.hpp:209
bool tick()
Perform a server tick.
Definition: Server.cpp:174
constexpr auto replyHttpContentType
The HTTP content type for POST replies.
Definition: Server.hpp:188
Network settings containing the default proxy as well as host, port, and password of the TOR control ...
Definition: NetworkSettings.hpp:49
constexpr auto cookieDir
The name of the (sub-)directory for cookies.
Definition: Server.hpp:161
Class handling database access for threads.
Definition: Database.hpp:91
Properties of a URL list containing its namespace and name.
Definition: UrlListProperties.hpp:39
Class for RegEx exceptions.
Definition: RegEx.hpp:108
void log(const std::string &logEntry)
Adds a log entry to the database for the current module.
Definition: Database.cpp:809
Thread options containing the name of the module run, as well as the IDs of the website, URL list, and configuration used.
Definition: ThreadOptions.hpp:40
Class for JSON exceptions.
Definition: Json.hpp:136
Class for incorrect path exceptions.
Definition: Database.hpp:752
constexpr auto optionsHttpCode
The HTTP status code for OPTIONS replies.
Definition: Server.hpp:191
constexpr auto statusHttpContentType
The HTTP content type for GET replies indicating the status of the server.
Definition: Server.hpp:182
constexpr auto cacheDir
The name of the (sub-)directory for the file cache.
Definition: Server.hpp:158
Class handling database access for the command-and-control server and its threads.
Definition: Database.hpp:366
Server settings containing its port, as well as allowed clients, origins, and actions.
Definition: ServerSettings.hpp:51
const std::string & getStatus() const
Gets the status of the server.
Definition: Server.cpp:106
Class for generic database exceptions.
Definition: Database.hpp:746
std::size_t getActiveWorkers() const
Gets the number of active worker threads.
Definition: Server.cpp:147
constexpr auto webServerPollTimeOutMs
The timeout in milliseconds for the polling of the web server.
Definition: Server.hpp:176
Database settings containing its host, port, user, password, schema, and compression.
Definition: DatabaseSettings.hpp:48
Information about a thread as stored in the database, containing both the options for and the status of the thread.
Definition: ThreadDatabaseEntry.hpp:40
Class for wrong arguments exceptions.
Definition: Database.hpp:761
Class for date/time locale exceptions.
Definition: DateTime.hpp:337
ThreadStatus status
Status of the thread.
Definition: ThreadDatabaseEntry.hpp:48
constexpr auto replyHttpCode
The HTTP status code for POST replies.
Definition: Server.hpp:185
Class for XML exceptions.
Definition: XML.hpp:207
Embedded web server class using the mongoose library.
Definition: WebServer.hpp:142
constexpr auto downloadDir
The name of the (sub-)directory for downloads.
Definition: Server.hpp:164
Struct::AlgoThreadProperties AlgoThreadProperties
Definition: All.hpp:48
Class for JSONPath exceptions.
Definition: JsonPath.hpp:85
The command-and-control server.
Definition: Server.hpp:238
static ServerCommandResponse failed(const std::string &response)
Helper to initialize a "failed" response with text.
Definition: ServerCommandResponse.hpp:106
constexpr auto dictDir
The name of the (sub-)directory for dictionaries.
Definition: Server.hpp:170
std::size_t getActiveThreads() const
Gets the number of active module threads.
Definition: Server.cpp:130
std::string module
The name of the module run by the thread.
Definition: ThreadOptions.hpp:45
Properties of an algorithm thread.
Definition: AlgoThreadProperties.hpp:46
constexpr auto xmlWarningsDefault
The number of XML warnings by default.
Definition: Server.hpp:206
Namespace for the main classes of the program.
Definition: App.cpp:34
std::int64_t getUpTime() const
Gets the up-time of the server in seconds.
Definition: Server.cpp:116
std::uint64_t id
The ID of the thread.
Definition: ThreadStatus.hpp:59
constexpr auto httpsString
The beginning of URLs using the HTTPS protocol.
Definition: Server.hpp:203
Class for date/time exceptions.
Definition: DateTime.hpp:330
constexpr auto statusHttpCode
The HTTP status code for GET replies indicating the status of the server.
Definition: Server.hpp:179
Configuration properties containing its module, name, and JSON string.
Definition: ConfigProperties.hpp:40
constexpr auto debugDir
The name of the (sub-)directory for debugging.
Definition: Server.hpp:167
constexpr auto httpString
The beginning of URLs using the HTTP protocol.
Definition: Server.hpp:200
Website properties containing its domain, namespace, name, and data directory.
Definition: WebsiteProperties.hpp:39