crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
Server.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2023 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * Server.hpp
24  *
25  * The command-and-control server.
26  *
27  * Uses the mongoose and RapidJSON libraries to implement an HTTP server
28  * for receiving JSON-formatted commands and sending JSON-formatted
29  * replies from/to the crawlserv_frontend.
30  *
31  * Also handles all threads for the different modules as well as
32  * specific worker threads for specific server tasks.
33  *
34  * Created on: Oct 7, 2018
35  * Author: ans
36  */
37 
38 #ifndef MAIN_SERVER_HPP_
39 #define MAIN_SERVER_HPP_
40 
41 /*
42  * DEBUGGING
43  */
44 
45 // directives that allow deactivating whole components for debugging purposes ONLY
46 #ifndef NDEBUG
47  //#define MAIN_SERVER_DEBUG_NOCRAWLERS
48  //#define MAIN_SERVER_DEBUG_NOPARSERS
49  //#define MAIN_SERVER_DEBUG_NOEXTRACTORS
50  //#define MAIN_SERVER_DEBUG_NOANALYZERS
51 #endif
52 
53 /*
54  * MACROS
55  */
56 
57 // macro for server commands
//
// MAIN_SERVER_CMD(X, Y) expands to an `if` statement: if `name` compares
//  equal to X, the result of calling Y() is assigned to `response` and the
//  enclosing function returns `true`. Otherwise, execution simply continues
//  after the macro.
//
// The identifiers `name` and `response` are NOT macro arguments, i.e. both
//  need to be in scope wherever the macro is expanded, and the enclosing
//  function needs to be able to return `true`.
58 
59 //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
60 #define MAIN_SERVER_CMD(X, Y) if(name == (X)) { \
61  response = Y(); \
62  return true; \
63  }
64 
65 
66 // macros for exception handling of worker threads
//
// MAIN_SERVER_WORKER_BEGIN opens a `try` block, MAIN_SERVER_WORKER_END(X)
//  closes it and appends a handler for std::exception (caught by const
//  reference) that assigns ServerCommandResponse::failed(e.what()) to X.
//
// The two macros only form valid code when used as a pair. Exceptions that
//  are not derived from std::exception are not caught by the handler.
67 
68 //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
69 #define MAIN_SERVER_WORKER_BEGIN try {
70 //NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
71 #define MAIN_SERVER_WORKER_END(X) } \
72  catch(const std::exception& e) { \
73  (X) = ServerCommandResponse::failed( \
74  e.what() \
75  ); \
76  }
77 
78 /*
79  * INCLUDES
80  */
81 
82 #include "Database.hpp"
83 #include "Exception.hpp"
84 #include "WebServer.hpp"
85 
86 #include "../Data/Compression/Gzip.hpp"
87 #include "../Data/Compression/Zip.hpp"
88 #include "../Data/Compression/Zlib.hpp"
89 #include "../Data/Corpus.hpp"
90 #include "../Data/File.hpp"
91 #include "../Data/ImportExport/OpenDocument.hpp"
92 #include "../Data/ImportExport/Text.hpp"
93 #include "../Helper/CommaLocale.hpp"
94 #include "../Helper/DateTime.hpp"
95 #include "../Helper/FileSystem.hpp"
96 #include "../Helper/Json.hpp"
97 #include "../Helper/Memory.hpp"
98 #include "../Helper/Strings.hpp"
99 #include "../Module/Analyzer/Algo/All.hpp"
100 #include "../Module/Crawler/Thread.hpp"
101 #include "../Module/Extractor/Thread.hpp"
102 #include "../Module/Parser/Thread.hpp"
103 #include "../Module/Thread.hpp"
104 #include "../Query/JsonPath.hpp"
105 #include "../Query/JsonPointer.hpp"
106 #include "../Query/RegEx.hpp"
107 #include "../Query/XPath.hpp"
108 #include "../Struct/AlgoThreadProperties.hpp"
109 #include "../Struct/ConfigProperties.hpp"
110 #include "../Struct/NetworkSettings.hpp"
111 #include "../Struct/QueryProperties.hpp"
112 #include "../Struct/ServerCommandResponse.hpp"
113 #include "../Struct/ServerSettings.hpp"
114 #include "../Struct/ThreadDatabaseEntry.hpp"
115 #include "../Struct/ThreadOptions.hpp"
116 #include "../Struct/UrlListProperties.hpp"
117 #include "../Struct/WebsiteProperties.hpp"
118 #include "../Timer/SimpleHR.hpp"
119 #include "../Wrapper/Database.hpp"
120 
121 #include "../_extern/jsoncons/include/jsoncons/json.hpp"
122 #include "../_extern/jsoncons/include/jsoncons_ext/jsonpath/json_query.hpp"
123 #include "../_extern/rapidjson/include/rapidjson/document.h"
124 #include "../_extern/rapidjson/include/rapidjson/prettywriter.h"
125 
126 #include <boost/lexical_cast.hpp>
127 
128 #include <algorithm> // std::count, std::count_if, std::find_if, std::remove_if
129 #include <chrono> // std::chrono
130 #include <cstddef> // std::size_t
131 #include <cstdint> // std::uint16_t, std::uint32_t, std::uint64_t
132 #include <exception> // std::exception
133 #include <iostream> // std::cout, std::flush
134 #include <memory> // std::make_unique, std::unique_ptr
135 #include <mutex> // std::lock_guard, std::mutex
136 #include <optional> // std::nullopt, std::optional
137 #include <queue> // std::queue
138 #include <sstream> // std::ostringstream
139 #include <string> // std::string, std::to_string
140 #include <string_view> // std::string_view, std::string_view_literals
141 #include <thread> // std::thread
142 #include <set> // std::set
143 #include <utility> // std::pair
144 #include <vector> // std::vector
145 
146 namespace crawlservpp::Main {
147 
148  using std::string_view_literals::operator""sv;
149 
150  /*
151  * CONSTANTS
152  */
153 
156 
 //! The name of the (sub-)directory for the file cache.
158  inline constexpr auto cacheDir{"cache"sv};
159 
 //! The name of the (sub-)directory for cookies.
161  inline constexpr auto cookieDir{"cookies"sv};
162 
 //! The name of the (sub-)directory for downloads.
164  inline constexpr auto downloadDir{"dl"sv};
165 
 //! The name of the (sub-)directory for debugging.
167  inline constexpr auto debugDir{"debug"sv};
168 
 //! The name of the (sub-)directory for dictionaries.
170  inline constexpr auto dictDir{"dict"sv};
171 
 //! The name of the (sub-)directory for language models.
173  inline constexpr auto mdlDir{"mdl"sv};
174 
 //! The timeout in milliseconds for the polling of the web server.
176  inline constexpr auto webServerPollTimeOutMs{1000};
177 
 //! The HTTP status code for GET replies indicating the status of the server.
179  inline constexpr auto statusHttpCode{200};
180 
 //! The HTTP content type for GET replies indicating the status of the server.
182  inline constexpr auto statusHttpContentType{"text/plain"};
183 
 //! The HTTP status code for POST replies.
185  inline constexpr auto replyHttpCode{200};
186 
 //! The HTTP content type for POST replies.
188  inline constexpr auto replyHttpContentType{"application/json"};
189 
 //! The HTTP status code for OPTIONS replies.
191  inline constexpr auto optionsHttpCode{200};
192 
 //! The minimum length of namespaces.
194  inline constexpr auto minNameSpaceLength{3};
195 
 //! The minimum length of namespaces, as string (needs to be kept in sync with minNameSpaceLength).
197  inline constexpr auto minNameSpaceLengthString{"three"sv};
198 
 //! The beginning of URLs using the HTTP protocol.
200  inline constexpr auto httpString{"http://"sv};
201 
 //! The beginning of URLs using the HTTPS protocol.
203  inline constexpr auto httpsString{"https://"sv};
204 
 //! The number of XML warnings by default.
206  inline constexpr auto xmlWarningsDefault{25};
207 
 //! The (short) separator used between data type and column name.
209  inline constexpr auto dataTypeColumnNameSeparatorShort{"_"sv};
210 
 //! The (long) separator used between data type and column name.
212  inline constexpr auto dataTypeColumnNameSeparatorLong{"__"sv};
213 
215 
216  /*
217  * DECLARATION
218  */
219 
221 
238  class Server final {
239  // for convenience
245 
249 
251 
256 
268 
269  using ConnectionPtr = mg_connection *;
270  using ConstConnectionPtr = const mg_connection *;
271  using StringString = std::pair<std::string, std::string>;
272  using Queries = std::vector<std::pair<std::string, std::vector<StringString>>>;
273 
274  public:
277 
278  Server(
279  const ServerSettings& serverSettings,
280  const DatabaseSettings& databaseSettings,
281  const NetworkSettings& networkSettings
282  );
283  virtual ~Server();
284 
288 
289  [[nodiscard]] const std::string& getStatus() const;
290  [[nodiscard]] std::int64_t getUpTime() const;
291  [[nodiscard]] std::size_t getActiveThreads() const;
292  [[nodiscard]] std::size_t getActiveWorkers() const;
293 
297 
298  bool tick();
299 
301 
304 
307  Server(Server&) = delete;
308 
310  Server& operator=(Server&) = delete;
311 
313  Server(Server&&) = delete;
314 
316  Server& operator=(Server&&) = delete;
317 
319 
320  private:
321  // settings
322  ServerSettings settings;
323  DatabaseSettings dbSettings;
324  NetworkSettings netSettings;
325 
326  // database
327  Database database;
328 
329  // status
330  std::string status;
331  std::string allowed;
332  bool running{true};
333  std::chrono::steady_clock::time_point uptimeStart;
334  bool offline{true};
335 
336  // threads
337  std::vector<std::unique_ptr<Module::Crawler::Thread>> crawlers;
338  std::vector<std::unique_ptr<Module::Parser::Thread>> parsers;
339  std::vector<std::unique_ptr<Module::Extractor::Thread>> extractors;
340  std::vector<std::unique_ptr<Module::Analyzer::Thread>> analyzers;
341  std::vector<std::thread> workers;
342  std::vector<bool> workersRunning;
343  mutable std::mutex workersLock;
344 
345  // pointer to information for basic commands
346  rapidjson::Document cmdJson;
347  std::string cmdIp;
348 
349  /*
350  * NOTE: The web server needs to be declared after/destroyed before the database and any data,
351  * because it is doing one last poll on destruction!
352  */
353 
354  // web server
355  WebServer webServer;
356 
357  // setter
358  void setStatus(const std::string& statusMsg);
359 
360  // event handlers
361  void onAccept(ConnectionPtr connection);
362  void onRequest(
363  ConnectionPtr connection,
364  const std::string& method,
365  const std::string& body,
366  void * data
367  );
368 
369  // server commands
370  [[nodiscard]] std::string cmd(
371  ConnectionPtr connection,
372  const std::string& msgBody,
373  bool& threadStartedTo,
374  bool& fileDownloadTo
375  );
376  bool cmd(const std::string& name, ServerCommandResponse& response);
377 
378  [[nodiscard]] ServerCommandResponse cmdKill();
379  [[nodiscard]] ServerCommandResponse cmdAllow();
380  [[nodiscard]] ServerCommandResponse cmdDisallow();
381 
382  [[nodiscard]] ServerCommandResponse cmdLog();
383  [[nodiscard]] ServerCommandResponse cmdClearLogs();
384 
385  [[nodiscard]] ServerCommandResponse cmdStartCrawler();
386  [[nodiscard]] ServerCommandResponse cmdPauseCrawler();
387  [[nodiscard]] ServerCommandResponse cmdUnpauseCrawler();
388  [[nodiscard]] ServerCommandResponse cmdStopCrawler();
389 
390  [[nodiscard]] ServerCommandResponse cmdStartParser();
391  [[nodiscard]] ServerCommandResponse cmdPauseParser();
392  [[nodiscard]] ServerCommandResponse cmdUnpauseParser();
393  [[nodiscard]] ServerCommandResponse cmdStopParser();
394  [[nodiscard]] ServerCommandResponse cmdResetParsingStatus();
395 
396  [[nodiscard]] ServerCommandResponse cmdStartExtractor();
397  [[nodiscard]] ServerCommandResponse cmdPauseExtractor();
398  [[nodiscard]] ServerCommandResponse cmdUnpauseExtractor();
399  [[nodiscard]] ServerCommandResponse cmdStopExtractor();
400  [[nodiscard]] ServerCommandResponse cmdResetExtractingStatus();
401 
402  [[nodiscard]] ServerCommandResponse cmdStartAnalyzer();
403  [[nodiscard]] ServerCommandResponse cmdPauseAnalyzer();
404  [[nodiscard]] ServerCommandResponse cmdUnpauseAnalyzer();
405  [[nodiscard]] ServerCommandResponse cmdStopAnalyzer();
406  [[nodiscard]] ServerCommandResponse cmdResetAnalyzingStatus();
407 
408  [[nodiscard]] ServerCommandResponse cmdPauseAll();
409  [[nodiscard]] ServerCommandResponse cmdUnpauseAll();
410 
411  [[nodiscard]] ServerCommandResponse cmdAddWebsite();
412  [[nodiscard]] ServerCommandResponse cmdUpdateWebsite();
413  [[nodiscard]] ServerCommandResponse cmdDeleteWebsite();
414  [[nodiscard]] ServerCommandResponse cmdDuplicateWebsite();
415 
416  [[nodiscard]] ServerCommandResponse cmdAddUrlList();
417  [[nodiscard]] ServerCommandResponse cmdUpdateUrlList();
418  [[nodiscard]] ServerCommandResponse cmdDeleteUrlList();
419 
420  [[nodiscard]] ServerCommandResponse cmdAddQuery();
421  [[nodiscard]] ServerCommandResponse cmdUpdateQuery();
422  [[nodiscard]] ServerCommandResponse cmdMoveQuery();
423  [[nodiscard]] ServerCommandResponse cmdDeleteQuery();
424  [[nodiscard]] ServerCommandResponse cmdDuplicateQuery();
425 
426  [[nodiscard]] ServerCommandResponse cmdAddConfig();
427  [[nodiscard]] ServerCommandResponse cmdUpdateConfig();
428  [[nodiscard]] ServerCommandResponse cmdDeleteConfig();
429  [[nodiscard]] ServerCommandResponse cmdDuplicateConfig();
430 
431  [[nodiscard]] static ServerCommandResponse cmdListDicts();
432  [[nodiscard]] static ServerCommandResponse cmdListMdls();
433 
434  [[nodiscard]] ServerCommandResponse cmdWarp();
435 
436  [[nodiscard]] ServerCommandResponse cmdDownload();
437 
438  void cmdImport(ConnectionPtr connection, std::size_t threadIndex, const std::string& message);
439  void cmdMerge(ConnectionPtr connection, std::size_t threadIndex, const std::string& message);
440  void cmdExport(ConnectionPtr connection, std::size_t threadIndex, const std::string& message);
441 
442  void cmdDeleteUrls(ConnectionPtr connection, std::size_t threadIndex, const std::string& message);
443 
444  void cmdTestQuery(ConnectionPtr connection, std::size_t threadIndex, const std::string& message);
445 
446  // server initialization
447  static void initCacheDir();
448  static void initDir(std::string_view directory);
449  static void initDebuggingDir(bool isEnabled, std::string_view directory);
450  void initDatabase(std::uint16_t sleepOnSqlErrorS);
451  void initCallbacks();
452  void initWebServer(const std::string& port);
453  void initThreads();
454  void initStartLogging();
455 
456  // server cleanup
457  void clearModuleThreads();
458  void clearWorkerThreads();
459  void clearLogShutdown();
460 
461  // server tick
462  void tickPollWebServer();
463  void tickRemoveFinishedModuleThreads();
464  void tickRemoveFinishedWorkerThreads();
465  void tickReconnectIfOffline();
466 
467  // internal helper functions
468  void continueParserThread(const ThreadDatabaseEntry& entry);
469  void continueAnalyzerThread(const ThreadDatabaseEntry& entry);
470  static std::string getIp(ConstConnectionPtr connection, std::string_view function);
471  bool isWebsiteInUse(std::uint64_t website, ServerCommandResponse& responseTo) const;
472  bool isUrlListInUse(std::uint64_t urlList, ServerCommandResponse& responseTo) const;
473  bool checkConfig(std::uint64_t config, ServerCommandResponse& responseTo);
474  bool checkConfig(std::uint64_t website, std::uint64_t config, ServerCommandResponse& responseTo);
475 
476  bool getArgument(
477  const std::string& name,
478  std::string& out,
479  bool optional,
480  bool notEmpty,
481  std::string& outError
482  );
483  bool getArgument(
484  const std::string& name,
485  std::uint64_t& out,
486  std::string& outError
487  );
488  bool getArgument(
489  const std::string& name,
490  bool& out,
491  bool optional,
492  std::string& outError
493  );
494 
495  void workerEnd(
496  std::size_t threadIndex,
497  ConnectionPtr connection,
498  const std::string& message,
499  const ServerCommandResponse& response
500  );
501 
502  void initWorkerDatabase(Module::Database& db);
503 
504  // internal static helper functions
505  [[nodiscard]] static bool workerBegin(
506  const std::string& message,
507  rapidjson::Document& json,
508  ServerCommandResponse& response
509  );
510 
511  static bool getArgument(
512  const rapidjson::Document& json,
513  const std::string& name,
514  std::string& out,
515  bool optional,
516  bool notEmpty,
517  std::string& outError
518  );
519  static bool getArgument(
520  const rapidjson::Document& json,
521  const std::string& name,
522  std::uint64_t& out,
523  std::string& outError
524  );
525  static bool getArgument(
526  const rapidjson::Document& json,
527  const std::string& name,
528  bool& out,
529  bool optional,
530  std::string& outError
531  );
532 
533  static void correctDomain(std::string& inOut);
534 
535  [[nodiscard]] static bool checkNameSpace(
536  const std::string& name,
537  ServerCommandResponse& responseTo
538  );
539 
540  [[nodiscard]] static std::uint32_t getAlgoFromConfig(
541  const rapidjson::Document& json
542  );
543 
544  [[nodiscard]] static std::string generateReply(
545  const ServerCommandResponse& response,
546  const std::string& msgBody
547  );
548 
549  [[nodiscard]] static std::string dateTimeTest(
550  const std::string& input,
551  const std::string& format,
552  const std::string& locale
553  );
554 
555  static bool cmdExportGetArguments(
556  const rapidjson::Document& json,
557  std::string& dataTypeTo,
558  std::string& fileTypeTo,
559  std::string& compressionTo,
560  ServerCommandResponse& responseTo
561  );
562  static bool cmdExportRetrieveAndConvert(
563  const rapidjson::Document& json,
564  const std::string& dataType,
565  const std::string& fileType,
566  Module::Database& db,
567  std::string& contentTo,
568  ServerCommandResponse& responseTo
569  );
 // NOTE(review): `contentInOut` is taken BY VALUE, although its name suggests
 //  an in-out parameter (compare correctDomain(std::string& inOut) and the
 //  reference-typed `...To` parameters of the other export helpers). As
 //  declared here, no change the function makes to the content can reach
 //  the caller. If the (compressed) content is supposed to be passed back,
 //  the parameter needs to become `std::string&` both here and in the
 //  definition in Server.cpp -- TODO confirm against the definition.
570  static bool cmdExportCompress(
571  const std::string& dataType,
572  const std::string& compression,
573  std::string contentInOut,
574  ServerCommandResponse& responseTo
575  );
576  static void cmdExportWrite(
577  const std::string& content,
578  ServerCommandResponse& responseTo
579  );
580  static void cmdExportLogSuccess(
581  Module::Database& db,
582  std::size_t size,
583  const std::string& timeString
584  );
585  static bool cmdExportRetrieveUrlList(
586  const rapidjson::Document& json,
587  Module::Database& db,
588  std::queue<std::string>& urlsTo,
589  ServerCommandResponse& responseTo
590  );
591  static bool cmdExportRetrieveTable(
592  std::string_view type,
593  const rapidjson::Document& json,
594  Module::Database& db,
595  std::string& nameTo,
596  std::vector<std::vector<std::string>>& contentTo,
597  bool& isColumnNamesTo,
598  ServerCommandResponse& responseTo
599  );
600  static bool cmdExportRetrieveCorpus(
601  const rapidjson::Document& json,
602  Module::Database& db,
603  std::queue<std::string>& urlsTo,
604  ServerCommandResponse& responseTo
605  );
606  static bool cmdExportGetUrlListArguments(
607  const rapidjson::Document& json,
608  std::uint64_t& websiteTo,
609  std::uint64_t& urlListTo,
610  ServerCommandResponse& responseTo
611  );
612  static bool cmdExportGetTableArguments(
613  const rapidjson::Document& json,
614  std::uint64_t& websiteTo,
615  std::uint64_t& urlListTo,
616  std::uint64_t& sourceTableTo,
617  bool& isColumnNamesTo,
618  ServerCommandResponse& responseTo
619  );
620  static bool cmdExportGetCorpusArguments(
621  const rapidjson::Document& json,
622  std::uint64_t& corpusTo,
623  std::string& whatTo,
624  ServerCommandResponse& responseTo
625  );
626  static bool cmdExportCheckWebsiteUrlList(
627  Module::Database& db,
628  std::uint64_t websiteId,
629  std::uint64_t urlListId,
630  ServerCommandResponse& responseTo
631  );
632  static bool cmdExportCheckTargetTable(
633  Module::Database& db,
634  std::string_view dataType,
635  std::uint64_t websiteId,
636  std::uint64_t urlListId,
637  std::uint64_t tableId,
638  ServerCommandResponse& responseTo
639  );
640  static bool cmdExportCheckCorpus(
641  Module::Database& db,
642  std::uint64_t firstId,
643  std::string_view what,
644  ServerCommandResponse& responseTo
645  );
646  static void cmdExportGetTableContent(
647  Module::Database& db,
648  std::string_view dataType,
649  std::uint64_t websiteId,
650  std::uint64_t urlListId,
651  std::uint64_t tableId,
652  std::string& nameTo,
653  std::vector<std::vector<std::string>>& contentTo,
654  bool isIncludeColumnNames
655  );
656  [[nodiscard]] static std::string cmdExportGetCorpus(
657  Module::Database& db,
658  std::uint64_t firstChunkId
659  );
660  [[nodiscard]] static std::queue<std::string> cmdExportGetKeysFromCorpusMaps(
661  Module::Database& db,
662  std::uint64_t firstChunkId,
663  std::string_view what
664  );
665  static void cmdExportGetKeysFromCorpusMap(
666  const std::string& map,
667  std::queue<std::string>& appendKeysTo
668  );
669  static void cmdExportRemoveColumnPrefixes(
670  std::string_view type,
671  std::vector<std::vector<std::string>>& content
672  );
673  static void cmdExportLog(
674  Module::Database& db,
675  std::string_view entryType,
676  std::string_view entryTypes,
677  std::string_view listType,
678  std::uint64_t entryNum
679  );
680  static bool cmdExportListAsText(
681  const rapidjson::Document& json,
682  std::queue<std::string>& data,
683  std::string& contentTo,
684  ServerCommandResponse& responseTo
685  );
686  static bool cmdExportListAsSpreadsheet(
687  const rapidjson::Document& json,
688  std::queue<std::string>& data,
689  std::string& contentTo,
690  ServerCommandResponse& responseTo
691  );
692  static void cmdExportTableAsSpreadsheet(
693  const std::string& tableName,
694  const std::vector<std::vector<std::string>>& tableContent,
695  std::string& contentTo,
696  bool isColumnNames
697  );
698  static bool cmdExportGetFirstLineHeader(
699  const rapidjson::Document& json,
700  std::optional<std::string>& optHeaderTo,
701  ServerCommandResponse& responseTo
702  );
703  static bool cmdDeleteUrlsGetArguments(
704  const rapidjson::Document& json,
705  std::uint64_t& urlListTo,
706  std::uint64_t& queryTo,
707  ServerCommandResponse& responseTo
708  );
709  static bool cmdDeleteUrlsGetWebsite(
710  Module::Database& db,
711  std::uint64_t urlList,
712  std::uint64_t& websiteTo,
713  ServerCommandResponse& responseTo
714  );
715  static bool cmdDeleteUrlsGetQuery(
716  Module::Database& db,
717  std::uint64_t query,
718  std::string& regExTo,
719  ServerCommandResponse& responseTo
720  );
721  static bool cmdDeleteUrlsGetUrls(
722  Module::Database& db,
723  std::uint64_t urlList,
724  const std::string& regEx,
725  std::queue<std::uint64_t>& toDeleteTo,
726  ServerCommandResponse& responseTo
727  );
728  static ServerCommandResponse cmdDeleteUrlsDelete(
729  Module::Database& db,
730  std::uint64_t urlList,
731  std::queue<std::uint64_t>& toDelete
732  );
733  static ServerCommandResponse cmdDeleteUrlsConfirm(std::size_t number);
734 
735 // static template helper functions for different kinds of threads
736  template<typename T> static void interruptModuleThreads(
737  std::vector<std::unique_ptr<T>>& threads
738  ) {
739  for(auto& thread : threads) {
740  thread->Module::Thread::interrupt();
741  }
742  }
743 
744  template<typename T> static void waitForModuleThreads(
745  std::vector<std::unique_ptr<T>>& threads,
746  std::string_view moduleName,
747  std::queue<std::string>& logEntriesTo
748  ) {
749  for(auto& thread : threads) {
750  if(thread) {
751  // save the ID of the thread before ending it
752  const auto id{thread->getId()};
753 
754  // wait for thread
755  thread->Module::Thread::end();
756 
757  // log interruption
758  std::string logString{moduleName};
759 
760  logString += " #";
761  logString += std::to_string(id);
762  logString += " interrupted.";
763 
764  logEntriesTo.emplace(logString);
765  }
766  }
767 
768  threads.clear();
769  }
770 
//! Counts the module threads in the given container that report to be running.
/*!
 * A thread is counted if its isRunning() member function returns true.
 *  Empty (null) entries are never counted, in line with
 *  waitForModuleThreads(), which skips them as well.
 *
 * \param threads Constant reference to the vector containing the pointers
 *   to the module threads.
 *
 * \returns The number of non-empty entries for which isRunning() is true.
 */
template<typename T> static std::size_t countModuleThreads(
		const std::vector<std::unique_ptr<T>>& threads
) {
	// std::count_if returns a signed difference type that cannot be negative here
	return static_cast<std::size_t>(
			std::count_if(
					threads.cbegin(),
					threads.cend(),
					[](const auto& thread) {
						return thread && thread->isRunning();
					}
			)
	);
}
782 
783  template<typename T> static void removeFinishedModuleThreads(
784  std::vector<std::unique_ptr<T>>& threads
785  ) {
786  threads.erase(
787  std::remove_if(
788  threads.begin(),
789  threads.end(),
790  [](auto& crawler) {
791  if(crawler->isShutdown() && crawler->isFinished()) {
792  crawler->Module::Thread::end();
793 
794  return true;
795  }
796 
797  return false;
798  }
799  ),
800  threads.end()
801  );
802  }
803 
804  template<typename T> void continueModuleThread( /* (for analyzer and extractor only) */
805  const ThreadDatabaseEntry& entry,
806  std::vector<std::unique_ptr<T>>& to
807  ) {
808  to.push_back(
809  std::make_unique<T>(
810  this->database,
811  cookieDir,
812  entry.options,
813  this->netSettings,
814  entry.status
815  )
816  );
817 
818  to.back()->Module::Thread::start();
819 
820  // write to log
821  this->database.log(
822  entry.options.module
823  + " #"
824  + std::to_string(entry.status.id)
825  + " continued."
826  );
827  }
828 
829  // static template helper functions for different kinds of database connections
830  template<typename DB> static bool checkWebsite(
831  DB& db,
832  std::uint64_t website,
833  ServerCommandResponse& responseTo
834  ) {
835  if(db.isWebsite(website)) {
836  return true;
837  }
838 
839  responseTo = ServerCommandResponse::failed(
840  "Website #"
841  + std::to_string(website)
842  + " not found."
843  );
844 
845  return false;
846  }
847 
848  template<typename DB> static bool checkUrlList(
849  DB& db,
850  std::uint64_t urlList,
851  ServerCommandResponse& responseTo
852  ) {
853  if(db.isUrlList(urlList)) {
854  return true;
855  }
856 
857  responseTo = ServerCommandResponse::failed(
858  "URL list #"
859  + std::to_string(urlList)
860  + " not found."
861  );
862 
863  return false;
864  }
865 
866  template<typename DB> static bool checkUrlList(
867  DB& db,
868  std::uint64_t website,
869  std::uint64_t urlList,
870  ServerCommandResponse& responseTo
871  ) {
872  if(db.isUrlList(website, urlList)) {
873  return true;
874  }
875 
876  responseTo = ServerCommandResponse::failed(
877  "URL list #"
878  + std::to_string(urlList)
879  + " for website #"
880  + std::to_string(website)
881  + " not found."
882  );
883 
884  return false;
885  }
886 
887  template<typename DB> static bool checkQuery(
888  DB& db,
889  std::uint64_t queryId,
890  ServerCommandResponse& responseTo
891  ) {
892  if(db.isQuery(queryId)) {
893  return true;
894  }
895 
896  responseTo = ServerCommandResponse::failed(
897  "Query #"
898  + std::to_string(queryId)
899  + " not found."
900  );
901 
902  return false;
903  }
904 
905  template<typename DB> static bool checkQuery(
906  DB& db,
907  std::uint64_t website,
908  std::uint64_t queryId,
909  ServerCommandResponse& responseTo
910  ) {
911  if(db.isQuery(website, queryId)) {
912  return true;
913  }
914 
915  responseTo = ServerCommandResponse::failed(
916  "Query #"
917  + std::to_string(queryId)
918  + " for website #"
919  + std::to_string(website)
920  + " not found."
921  );
922 
923  return false;
924  }
925 
926  template<typename DB> static bool checkCorpus(
927  DB& db,
928  std::uint64_t firstId,
929  bool requireArticleMap,
930  bool requireDateMap,
931  ServerCommandResponse& responseTo
932  ) {
933  if(db.isCorpus(firstId, requireArticleMap, requireDateMap)) {
934  return true;
935  }
936 
937  auto error{"Corpus starting at #" + std::to_string(firstId)};
938 
939  if(requireArticleMap && db.isCorpus(firstId, false, false)) {
940  error += " does not include articles.";
941  }
942  else if(requireDateMap && db.isCorpus(firstId, false, false)) {
943  error += " does not include dates.";
944  }
945  else {
946  error += " not found.";
947  }
948 
949  responseTo = ServerCommandResponse::failed(error);
950 
951  return false;
952  }
953  };
954 
955 } /* namespace crawlservpp::Main */
956 
957 #endif /* MAIN_SERVER_HPP_ */
Class for storage engine exceptions.
Definition: Database.hpp:755
Class for XPath exceptions.
Definition: XPath.hpp:104
constexpr auto mdlDir
The name of the (sub-)directory for language models.
Definition: Server.hpp:173
Query properties containing its name, text, type, and result type(s).
Definition: QueryProperties.hpp:39
Class for JSONPointer exceptions.
Definition: JsonPointer.hpp:97
Response from the command-and-control server.
Definition: ServerCommandResponse.hpp:40
Class for insufficient privileges exceptions.
Definition: Database.hpp:758
virtual ~Server()
Destructor interrupting and waiting for all threads.
Definition: Server.cpp:91
constexpr auto minNameSpaceLengthString
The minimum length of namespaces, as string.
Definition: Server.hpp:197
Server(const ServerSettings &serverSettings, const DatabaseSettings &databaseSettings, const NetworkSettings &networkSettings)
Constructor setting server, database, and network settings.
Definition: Server.cpp:64
constexpr auto minNameSpaceLength
The minimum length of namespaces.
Definition: Server.hpp:194
Server & operator=(Server &)=delete
Deleted copy assignment operator.
ThreadOptions options
Options for the thread.
Definition: ThreadDatabaseEntry.hpp:45
constexpr auto dataTypeColumnNameSeparatorLong
The (long) separator used between data type and column name.
Definition: Server.hpp:212
constexpr auto dataTypeColumnNameSeparatorShort
The (short) separator used between data type and column name.
Definition: Server.hpp:209
bool tick()
Perform a server tick.
Definition: Server.cpp:174
constexpr auto replyHttpContentType
The HTTP content type for POST replies.
Definition: Server.hpp:188
Network settings containing the default proxy as well as host, port, and password of the TOR control ...
Definition: NetworkSettings.hpp:49
constexpr auto cookieDir
The name of the (sub-)directory for cookies.
Definition: Server.hpp:161
Class handling database access for threads.
Definition: Database.hpp:91
Properties of a URL list containing its namespace and name.
Definition: UrlListProperties.hpp:39
Class for RegEx exceptions.
Definition: RegEx.hpp:108
void log(const std::string &logEntry)
Adds a log entry to the database for the current module.
Definition: Database.cpp:809
Thread options containing the name of the module run, as well as the IDs of the website, URL list, and configuration used.
Definition: ThreadOptions.hpp:40
Class for JSON exceptions.
Definition: Json.hpp:136
Class for incorrect path exceptions.
Definition: Database.hpp:752
constexpr auto optionsHttpCode
The HTTP status code for OPTIONS replies.
Definition: Server.hpp:191
constexpr auto statusHttpContentType
The HTTP content type for GET replies indicating the status of the server.
Definition: Server.hpp:182
constexpr auto cacheDir
The name of the (sub-)directory for the file cache.
Definition: Server.hpp:158
Class handling database access for the command-and-control server and its threads.
Definition: Database.hpp:366
Server settings containing its port, as well as allowed clients, origins, and actions.
Definition: ServerSettings.hpp:51
const std::string & getStatus() const
Gets the status of the server.
Definition: Server.cpp:106
Class for generic database exceptions.
Definition: Database.hpp:746
std::size_t getActiveWorkers() const
Gets the number of active worker threads.
Definition: Server.cpp:147
constexpr auto webServerPollTimeOutMs
The timeout in milliseconds for the polling of the web server.
Definition: Server.hpp:176
Database settings containing its host, port, user, password, schema, and compression.
Definition: DatabaseSettings.hpp:48
Information about a thread as stored in the database, containing both the options for and the status ...
Definition: ThreadDatabaseEntry.hpp:40
Class for wrong arguments exceptions.
Definition: Database.hpp:761
Class for date/time locale exceptions.
Definition: DateTime.hpp:337
ThreadStatus status
Status of the thread.
Definition: ThreadDatabaseEntry.hpp:48
constexpr auto replyHttpCode
The HTTP status code for POST replies.
Definition: Server.hpp:185
Class for XML exceptions.
Definition: XML.hpp:207
Embedded web server class using the mongoose library.
Definition: WebServer.hpp:142
constexpr auto downloadDir
The name of the (sub-)directory for downloads.
Definition: Server.hpp:164
Struct::AlgoThreadProperties AlgoThreadProperties
Definition: All.hpp:48
Class for JSONPath exceptions.
Definition: JsonPath.hpp:85
The command-and-control server.
Definition: Server.hpp:238
static ServerCommandResponse failed(const std::string &response)
Helper to initialize a "failed" response with text.
Definition: ServerCommandResponse.hpp:106
constexpr auto dictDir
The name of the (sub-)directory for dictionaries.
Definition: Server.hpp:170
std::size_t getActiveThreads() const
Gets the number of active module threads.
Definition: Server.cpp:130
std::string module
The name of the module run by the thread.
Definition: ThreadOptions.hpp:45
Properties of an algorithm thread.
Definition: AlgoThreadProperties.hpp:46
constexpr auto xmlWarningsDefault
The number of XML warnings by default.
Definition: Server.hpp:206
Namespace for the main classes of the program.
Definition: App.cpp:34
std::int64_t getUpTime() const
Gets the up-time of the server in seconds.
Definition: Server.cpp:116
std::uint64_t id
The ID of the thread.
Definition: ThreadStatus.hpp:59
constexpr auto httpsString
The beginning of URLs using the HTTPS protocol.
Definition: Server.hpp:203
Class for date/time exceptions.
Definition: DateTime.hpp:330
constexpr auto statusHttpCode
The HTTP status code for GET replies indicating the status of the server.
Definition: Server.hpp:179
Configuration properties containing its module, name, and JSON string.
Definition: ConfigProperties.hpp:40
constexpr auto debugDir
The name of the (sub-)directory for debugging.
Definition: Server.hpp:167
constexpr auto httpString
The beginning of URLs using the HTTP protocol.
Definition: Server.hpp:200
Website properties containing its domain, namespace, name, and data directory.
Definition: WebsiteProperties.hpp:39