crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
Config.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2022 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * Config.hpp
24  *
25  * Crawling configuration.
26  *
27  * Created on: Oct 25, 2018
28  * Author: ans
29  */
30 
31 #ifndef MODULE_CRAWLER_CONFIG_HPP_
32 #define MODULE_CRAWLER_CONFIG_HPP_
33 
34 #include "../../Main/Exception.hpp"
35 #include "../../Network/Config.hpp"
36 
37 #include <algorithm> // std::min
38 #include <cstddef> // std::size_t
39 #include <cstdint> // std::int32_t, std::int64_t, std::uint8_t, std::uint16_t, std::uint32_t, std::uint64_t
40 #include <string> // std::string
41 #include <vector> // std::vector
42 
45 
46  /*
47  * CONSTANTS
48  */
49 
52 
//! Logging is disabled.
54  inline constexpr std::uint8_t crawlerLoggingSilent{0};
55 
//! Default logging is enabled.
57  inline constexpr std::uint8_t crawlerLoggingDefault{1};
58 
//! Extended logging is enabled (level between default and verbose logging).
60  inline constexpr std::uint8_t crawlerLoggingExtended{2};
61 
//! Verbose logging is enabled.
63  inline constexpr std::uint8_t crawlerLoggingVerbose{3};
64 
//! Performing a query on the URL of a crawled web page to determine whether to redirect.
66  inline constexpr std::uint8_t redirectSourceUrl{0};
67 
//! Performing a query on the content of a crawled web page to determine whether to redirect.
69  inline constexpr std::uint8_t redirectSourceContent{1};
70 
//! Default time to lock URLs that are being processed, in seconds.
72  inline constexpr std::uint32_t defaultCrawlerLockS{300};
73 
//! Default maximum batch size (presumably the default of the 'crawler.max.batch.size' option -- TODO confirm).
75  inline constexpr std::uint16_t defaultMaxBatchSize{500};
76 
//! Default time (in s) after which to re-crawl custom URLs once crawling has been completed (-1 = deactivated).
78  inline constexpr std::int32_t defaultRestartAfter{-1};
79 
//! Default number of re-tries on connection errors.
81  inline constexpr std::int64_t defaultReTries{720};
82 
//! HTTP errors that will be handled like connection errors by default.
// NOTE(review): std::array is used here, but <array> is not among the includes
//  visible in this file -- confirm that it is provided by one of the project headers.
84  inline constexpr std::array defaultRetryHttp{429, 502, 503, 504, 521, 522, 524};
85 
//! Default sleeping time on connection errors, in milliseconds.
87  inline constexpr std::uint64_t defaultSleepErrorMs{10000};
88 
//! Default time that will be waited between HTTP requests, in milliseconds.
90  inline constexpr std::uint64_t defaultSleepHttpMs{0};
91 
//! Default time that will be waited before checking for new URLs when all URLs have been crawled, in milliseconds.
93  inline constexpr std::uint64_t defaultSleepIdleMs{5000};
94 
//! Default time to wait before trying to re-connect to the MySQL server, in seconds.
96  inline constexpr std::uint64_t defaultSleepMySqlS{60};
97 
//! Default number of crawled URLs to be processed at once without possible interruption.
99  inline constexpr std::uint64_t defaultUrlChunks{5000};
100 
//! Default maximum length of URLs to add.
102  inline constexpr std::uint16_t defaultUrlMaxLength{2000};
103 
105 
106  /*
107  * DECLARATION
108  */
109 
//! Configuration for crawlers.
/*!
 * Extends the network-specific thread configuration (Network::Config) by the
 *  crawler-specific options that are read in parseOption() and validated in
 *  checkOptions().
 */
111  class Config : protected Network::Config {
112  public:
113 
116 
//! Sets whether the website to crawl is cross-domain (see the use of the flag in parseOption()).
117  void setCrossDomain(bool isCrossDomain);
118 
120 
123 
125 
//! Configuration entries for crawler threads.
// NOTE(review): parseOption() and checkOptions() also access members that are not
//  declared in this listing (e.g. crawlerLock, crawlerLogging, crawlerUrlChunks,
//  customCountersGlobal, expectedErrorIfLarger) -- their declarations appear to
//  have been lost and need to be restored from the original header.
130  struct Entries {
133 
//! Specifies whether to crawl archived pages.
135  bool crawlerArchives{false};
136 
//! Names of the archives; checkOptions() pairs them by index with the memento and timemap URI templates.
138  std::vector<std::string> crawlerArchivesNames{"archives.org"};
139 
141 
//! Specifies whether to crawl archived pages ONLY.
145  bool crawlerArchivesOnly{false};
146 
148 
//! Memento URI templates for archives to crawl.
152  std::vector<std::string> crawlerArchivesUrlsMemento{"http://web.archive.org/web/"};
153 
//! Memento URIs that will always be skipped.
155  std::vector<std::string> crawlerArchivesUrlsSkip;
156 
158 
//! Timemap URI templates for archives to crawl.
162  std::vector<std::string> crawlerArchivesUrlsTimemap{"http://web.archive.org/web/timemap/link/"};
163 
166 
168 
173 
176 
178 
//! URL parameters that will be added shortly before retrieving the content.
182  std::vector<std::string> crawlerParamsAdd;
183 
185 
//! Parameters in URLs that will be ignored.
189  std::vector<std::string> crawlerParamsBlackList;
190 
192 
//! Parameters in URLs that will not be ignored.
196  std::vector<std::string> crawlerParamsWhiteList;
197 
199 
//! Content matching one of these queries will not be crawled.
204  std::vector<std::uint64_t> crawlerQueriesBlackListContent;
205 
207 
//! Content types matching one of these queries will not be crawled.
212  std::vector<std::uint64_t> crawlerQueriesBlackListTypes;
213 
215 
//! URLs matching one of these queries will not be crawled.
220  std::vector<std::uint64_t> crawlerQueriesBlackListUrls;
221 
//! Queries on content to find URLs (checkOptions() throws if none is specified).
223  std::vector<std::uint64_t> crawlerQueriesLinks;
224 
226 
//! Content matching one of these queries will not be used for link extraction.
231  std::vector<std::uint64_t> crawlerQueriesLinksBlackListContent;
232 
234 
//! Content types matching one of these queries will not be used for link extraction.
239  std::vector<std::uint64_t> crawlerQueriesLinksBlackListTypes;
240 
242 
//! URLs matching one of these queries will not be used for link extraction.
247  std::vector<std::uint64_t> crawlerQueriesLinksBlackListUrls;
248 
250 
//! If not empty, only content matching one of these queries will be used for link extraction.
255  std::vector<std::uint64_t> crawlerQueriesLinksWhiteListContent;
256 
258 
//! If not empty, only content types matching one of these queries will be used for link extraction.
263  std::vector<std::uint64_t> crawlerQueriesLinksWhiteListTypes;
264 
266 
//! URL whitelist for link extraction (presumably analogous to the content and type whitelists -- TODO confirm).
271  std::vector<std::uint64_t> crawlerQueriesLinksWhiteListUrls;
272 
274 
//! If not empty, only content matching one of these queries will be crawled.
279  std::vector<std::uint64_t> crawlerQueriesWhiteListContent;
280 
282 
//! If not empty, only content types matching one of these queries will be crawled.
287  std::vector<std::uint64_t> crawlerQueriesWhiteListTypes;
288 
290 
//! URL whitelist for crawling (presumably analogous to the content and type whitelists -- TODO confirm).
295  std::vector<std::uint64_t> crawlerQueriesWhiteListUrls;
296 
//! Value of the 'crawler.recrawl' option.
298  bool crawlerReCrawl{false};
299 
//! List of URLs that will always be re-crawled.
301  std::vector<std::string> crawlerReCrawlAlways;
302 
305 
//! Specifies whether to (try to) repair CData when parsing HTML/XML.
307  bool crawlerRepairCData{true};
308 
311 
314 
317 
320 
323 
//! Value of the 'crawler.retry.empty' option.
325  bool crawlerRetryEmpty{true};
326 
//! HTTP errors that will be handled like connection errors (initialized from defaultRetryHttp).
328  std::vector<std::uint32_t> crawlerRetryHttp{defaultRetryHttp.cbegin(), defaultRetryHttp.cend()};
329 
332 
335 
338 
340 
345 
//! Starting point for crawling (should start with / except for cross-domain websites).
347  std::string crawlerStart{"/"};
348 
//! Specifies whether to not crawl the start page.
350  bool crawlerStartIgnore{false};
351 
//! Number of tidyhtml errors to log (if logging is enabled).
353  std::uint32_t crawlerTidyErrors{};
354 
//! Specifies whether to log tidyhtml warnings (if logging is enabled).
356  bool crawlerTidyWarnings{false};
357 
//! Specifies whether to calculate timing statistics for the crawler.
359  bool crawlerTiming{false};
360 
362 
366 
369 
371 
//! Specifies whether to perform additional check for duplicates after URL insertion.
374  bool crawlerUrlDebug{false};
375 
378 
381 
//! Value of the 'crawler.warnings.file' option.
383  bool crawlerWarningsFile{false};
384 
//! Specifies whether to always save crawled content as cleaned XML.
386  bool crawlerXml{false};
387 
391 
//! Counter variable names; checkOptions() pairs them by index with start, end, step, alias and alias summand.
393  std::vector<std::string> customCounters;
394 
//! Alias for the counter variable with the same array index.
396  std::vector<std::string> customCountersAlias;
397 
//! Alias summand for the counter with the same array index (checkOptions() uses zero where none is specified).
399  std::vector<std::uint64_t> customCountersAliasAdd;
400 
//! End value for the counter with the same array index.
402  std::vector<std::int64_t> customCountersEnd;
403 
405 
409 
//! Start value for the counter with the same array index.
411  std::vector<std::int64_t> customCountersStart;
412 
//! Step value for the counter with the same array index (checkOptions() uses one where none is specified).
414  std::vector<std::int64_t> customCountersStep;
415 
//! Specifies whether to always re-crawl custom URLs.
417  bool customReCrawl{true};
418 
//! Specifies whether to add the sitemaps specified in robots.txt as custom URLs.
420  bool customRobots{false};
421 
//! Custom HTTP headers to be used for ALL tokens.
423  std::vector<std::string> customTokenHeaders;
424 
//! List of token variables to be replaced in custom URLs.
426  std::vector<std::string> customTokens;
427 
//! Custom HTTP Cookie header for the token with the same array index.
429  std::vector<std::string> customTokensCookies;
430 
//! Time until the token with the same array index gets invalid, in seconds.
432  std::vector<std::uint32_t> customTokensKeep;
433 
//! Query to extract the token with the same array index.
435  std::vector<std::uint64_t> customTokensQuery;
436 
//! Determines whether an error occurs if the token with the same array index is empty.
438  std::vector<bool> customTokensRequired;
439 
//! Source of the token with the same array index.
441  std::vector<std::string> customTokensSource;
442 
//! Use HTTP POST instead of GET for the token with the same array index.
444  std::vector<bool> customTokensUsePost;
445 
//! Custom URLs for crawling (should all start with / except for cross-domain websites).
447  std::vector<std::string> customUrls;
448 
450 
//! Value of the 'custom.use.post' option.
455  bool customUsePost{false};
456 
458 
461 
464 
467 
//! Value of the 'expected.query' option (a query ID, judging by the option name -- TODO confirm).
469  std::uint64_t expectedQuery{};
470 
474 
//! Custom HTTP Cookie header on dynamic redirect.
476  std::string redirectCookies;
477 
//! Value of the 'redirect.headers' option.
479  std::vector<std::string> redirectHeaders;
480 
//! Query on content that specifies whether to redirect to different URL.
482  std::uint64_t redirectQueryContent{};
483 
//! Query on URL that specifies whether to redirect to different URL.
485  std::uint64_t redirectQueryUrl{};
486 
//! Sub-URL (for cross-domain websites: URL without protocol) to redirect to.
488  std::string redirectTo;
489 
//! Specifies whether to use HTTP POST instead of HTTP GET to retrieve a URL after redirect.
491  bool redirectUsePost{false};
492 
494 
//! Names of the redirect variables; checkOptions() pairs them by index with their queries and sources.
500  std::vector<std::string> redirectVarNames;
501 
//! Query on variable source to retrieve the value of the variable with the same index.
503  std::vector<std::uint64_t> redirectVarQueries;
504 
506 
//! Source type of the variable with the same index (see redirectSourceUrl and redirectSourceContent).
510  std::vector<std::uint8_t> redirectVarSources;
511 
513  }
514 
//! Configuration of the crawler.
516  config;
517 
519 
522 
523  protected:
526 
//! Parses a crawler-specific configuration option.
527  void parseOption() override;
//! Checks the crawler-specific configuration options.
528  void checkOptions() override;
//! Resets the crawler-specific configuration options.
529  void reset() override;
530 
532 
533  private:
// set via setCrossDomain(); selects URL instead of SubURL parsing
//  for 'crawler.start' and 'custom.urls' in parseOption()
534  bool crossDomain{false};
535  };
536 
537  /*
538  * IMPLEMENTATION
539  */
540 
541  /*
542  * SETTER
543  */
544 
546 
550  inline void Config::setCrossDomain(bool isCrossDomain) {
551  this->crossDomain = isCrossDomain;
552  }
553 
554  /*
555  * CRAWLER-SPECIFIC CONFIGURATION PARSING
556  */
557 
559  inline void Config::parseOption() {
560  // crawler options
561  this->category("crawler");
562  this->option("archives", this->config.crawlerArchives);
563  this->option("archives.names", this->config.crawlerArchivesNames);
564  this->option("archives.only", this->config.crawlerArchivesOnly);
565  this->option(
566  "archives.urls.memento",
568  StringParsingOption::Trim
569  );
570  this->option("archives.urls.skip", this->config.crawlerArchivesUrlsSkip);
571  this->option("archives.urls.timemap", this->config.crawlerArchivesUrlsTimemap);
572  this->option("lock", this->config.crawlerLock);
573  this->option("logging", this->config.crawlerLogging);
574  this->option("max.batch.size", this->config.crawlerMaxBatchSize);
575  this->option("params.add", this->config.crawlerParamsAdd);
576  this->option("params.blacklist", this->config.crawlerParamsBlackList);
577  this->option("params.whitelist", this->config.crawlerParamsWhiteList);
578  this->option("queries.blacklist.cont", this->config.crawlerQueriesBlackListContent);
579  this->option("queries.blacklist.types", this->config.crawlerQueriesBlackListTypes);
580  this->option("queries.blacklist.urls", this->config.crawlerQueriesBlackListUrls);
581  this->option("queries.links", this->config.crawlerQueriesLinks);
582  this->option("queries.links.blacklist.cont", this->config.crawlerQueriesLinksBlackListContent);
583  this->option("queries.links.blacklist.types", this->config.crawlerQueriesLinksBlackListTypes);
584  this->option("queries.links.blacklist.urls", this->config.crawlerQueriesLinksBlackListUrls);
585  this->option("queries.links.whitelist.cont", this->config.crawlerQueriesLinksWhiteListContent);
586  this->option("queries.links.whitelist.types", this->config.crawlerQueriesLinksWhiteListTypes);
587  this->option("queries.links.whitelist.urls", this->config.crawlerQueriesLinksWhiteListUrls);
588  this->option("queries.whitelist.cont", this->config.crawlerQueriesWhiteListContent);
589  this->option("queries.whitelist.types", this->config.crawlerQueriesWhiteListTypes);
590  this->option("queries.whitelist.urls", this->config.crawlerQueriesWhiteListUrls);
591  this->option("recrawl", this->config.crawlerReCrawl);
592  this->option("recrawl.always", this->config.crawlerReCrawlAlways);
593  this->option("recrawl.start", this->config.crawlerReCrawlStart);
594  this->option("remove.xml.instructions", this->config.crawlerRemoveXmlInstructions);
595  this->option("repair.cdata", this->config.crawlerRepairCData);
596  this->option("repair.comments", this->config.crawlerRepairComments);
597  this->option("restart.after", this->config.crawlerRestartAfter);
598  this->option("retries", this->config.crawlerReTries);
599  this->option("retry.archive", this->config.crawlerRetryArchive);
600  this->option("retry.empty", this->config.crawlerRetryEmpty);
601  this->option("retry.http", this->config.crawlerRetryHttp);
602  this->option("sleep.error", this->config.crawlerSleepError);
603  this->option("sleep.http", this->config.crawlerSleepHttp);
604  this->option("sleep.idle", this->config.crawlerSleepIdle);
605  this->option("sleep.mysql", this->config.crawlerSleepMySql);
606  this->option(
607  "start",
608  this->config.crawlerStart,
609  this->crossDomain ?
610  StringParsingOption::URL : StringParsingOption::SubURL
611  );
612  this->option("start.ignore", this->config.crawlerStartIgnore);
613  this->option("tidy.errors", this->config.crawlerTidyErrors);
614  this->option("tidy.warnings", this->config.crawlerTidyWarnings);
615  this->option("timing", this->config.crawlerTiming);
616  this->option("url.case.sensitive", this->config.crawlerUrlCaseSensitive);
617  this->option("url.chunks", this->config.crawlerUrlChunks);
618  this->option("url.debug", this->config.crawlerUrlDebug);
619  this->option("url.max.length", this->config.crawlerUrlMaxLength);
620  this->option("url.startup.check", this->config.crawlerUrlStartupCheck);
621  this->option("xml", this->config.crawlerXml);
622  this->option("warnings.file", this->config.crawlerWarningsFile);
623 
624  // custom URLs options
625  this->category("custom");
626  this->option("counters", this->config.customCounters);
627  this->option("counters.alias", this->config.customCountersAlias);
628  this->option("counters.alias.add", this->config.customCountersAliasAdd);
629  this->option("counters.end", this->config.customCountersEnd);
630  this->option("counters.global", this->config.customCountersGlobal);
631  this->option("counters.start", this->config.customCountersStart);
632  this->option("counters.step", this->config.customCountersStep);
633  this->option("recrawl", this->config.customReCrawl);
634  this->option("robots", this->config.customRobots);
635  this->option("tokens", this->config.customTokens);
636  this->option("tokens.cookies", this->config.customTokensCookies);
637  this->option("tokens.keep", this->config.customTokensKeep);
638  this->option("tokens.query", this->config.customTokensQuery);
639  this->option("tokens.required", this->config.customTokensRequired);
640  this->option("tokens.source", this->config.customTokensSource);
641  this->option("tokens.use.post", this->config.customTokensUsePost);
642  this->option("token.headers", this->config.customTokenHeaders); // NOTE: to be used for ALL tokens
643  this->option(
644  "urls",
645  this->config.customUrls,
646  this->crossDomain ?
647  StringParsingOption::URL : StringParsingOption::SubURL
648  );
649  this->option("use.post", this->config.customUsePost);
650 
651  // dynamic redirect options
652  this->category("redirect");
653  this->option("cookies", this->config.redirectCookies);
654  this->option("headers", this->config.redirectHeaders);
655  this->option("query.content", this->config.redirectQueryContent);
656  this->option("query.url", this->config.redirectQueryUrl);
657  this->option("to", this->config.redirectTo);
658  this->option("use.post", this->config.redirectUsePost);
659  this->option("var.names", this->config.redirectVarNames);
660  this->option("var.queries", this->config.redirectVarQueries);
661  this->option("var.sources", this->config.redirectVarSources);
662 
663  // expected number of results options
664  this->category("expected");
665  this->option("query", this->config.expectedQuery);
666  this->option("error.if.larger", this->config.expectedErrorIfLarger);
667  this->option("error.if.smaller", this->config.expectedErrorIfSmaller);
668  }
669 
671 
676  inline void Config::checkOptions() {
677  // check for link extraction query
678  if(this->config.crawlerQueriesLinks.empty()) {
679  throw Exception(
680  "Crawler::Config::checkOptions():"
681  " No link extraction query has been specified"
682  );
683  }
684 
685  // check number of URLs to crawl at once
686  if(this->config.crawlerUrlChunks == 0) {
688 
689  this->warning(
690  "Invalid value for 'url.chunks' ignored (was zero),"
691  "default value used"
692  );
693  }
694 
695  // check properties of archives
696  bool incompleteArchives{false};
697 
698  const auto completeArchives{
699  std::min({ // number of complete archives (= min. size of all arrays)
700  this->config.crawlerArchivesNames.size(),
701  this->config.crawlerArchivesUrlsMemento.size(),
703  })
704  };
705 
706  // remove names that are not used
707  if(this->config.crawlerArchivesNames.size() > completeArchives) {
708  this->config.crawlerArchivesNames.resize(completeArchives);
709 
710  incompleteArchives = true;
711  }
712 
713  // remove memento URL templates that are not used
714  if(this->config.crawlerArchivesUrlsMemento.size() > completeArchives) {
715  this->config.crawlerArchivesUrlsMemento.resize(completeArchives);
716 
717  incompleteArchives = true;
718  }
719 
720  // remove timemap URL templates that are not used
721  if(this->config.crawlerArchivesUrlsTimemap.size() > completeArchives) {
722  this->config.crawlerArchivesUrlsTimemap.resize(completeArchives);
723 
724  incompleteArchives = true;
725  }
726 
727  // warn about incomplete archives
728  if(incompleteArchives) {
729  this->warning(
730  "'archives.names', '.urls.memento' and '.urls.timemap'"
731  " should have the same number of elements."
732  );
733 
734  this->warning("Incomplete archive(s) removed from configuration.");
735  }
736 
737  // check properties of counters
738  bool incompleteCounters{false};
739 
740  const auto completeCounters{
741  std::min({ // number of complete counters (= min. size of arrays)
742  this->config.customCounters.size(),
743  this->config.customCountersStart.size(),
744  this->config.customCountersEnd.size()
745  })
746  };
747 
748  // remove counter variable names that are not used
749  if(this->config.customCounters.size() > completeCounters) {
750  // remove counter variables of incomplete counters
751  this->config.customCounters.resize(completeCounters);
752 
753  incompleteCounters = true;
754  }
755 
756  // remove starting values that are not used
757  if(this->config.customCountersStart.size() > completeCounters) {
758  this->config.customCountersStart.resize(completeCounters);
759 
760  incompleteCounters = true;
761  }
762 
763  // remove ending values that are not used
764  if(this->config.customCountersEnd.size() > completeCounters) {
765  this->config.customCountersEnd.resize(completeCounters);
766 
767  incompleteCounters = true;
768  }
769 
770  // warn about incomplete counters
771  if(incompleteCounters) {
772  this->warning(
773  "'custom.counters', '.counters.start',"
774  " '.counters.end' and '.counters.step'"
775  " should have the same number of elements."
776  );
777 
778  this->warning("Incomplete counter(s) removed from configuration.");
779 
780  incompleteCounters = false;
781  }
782 
783  // remove step values that are not used, add one as step value where none is specified
784  if(this->config.customCountersStep.size() > completeCounters) {
785  incompleteCounters = true;
786  }
787 
788  this->config.customCountersStep.resize(completeCounters, 1);
789 
790  // remove aliases that are not used, add empty aliases where none exist
791  if(this->config.customCountersAlias.size() > completeCounters) {
792  incompleteCounters = true;
793  }
794 
795  this->config.customCountersAlias.resize(completeCounters);
796 
797  // remove alias summands that are not used, add zero as summand where none is specified
798  if(this->config.customCountersAliasAdd.size() > completeCounters) {
799  incompleteCounters = true;
800  }
801 
802  this->config.customCountersAliasAdd.resize(completeCounters, 0);
803 
804  // warn about unused properties
805  if(incompleteCounters) {
806  this->warning("Unused counter properties removed from configuration.");
807  }
808 
809  // check validity of counters
810  // (infinite counters are invalid, therefore the need to check for counter termination)
811  for(std::size_t n{1}; n <= this->config.customCounters.size(); ++n) {
812  const auto index{n - 1};
813 
814  if(
815  (
816  this->config.customCountersStep.at(index) <= 0
817  && this->config.customCountersStart.at(index)
818  < this->config.customCountersEnd.at(index)
819  )
820  ||
821  (
822  this->config.customCountersStep.at(index) >= 0
823  && this->config.customCountersStart.at(index)
824  > this->config.customCountersEnd.at(index)
825  )
826  ) {
827  const std::string counterName(this->config.customCounters.at(index));
828 
829  // delete the invalid counter
830  this->config.customCounters.erase(this->config.customCounters.begin() + index);
831  this->config.customCountersStart.erase(this->config.customCountersStart.begin() + index);
832  this->config.customCountersEnd.erase(this->config.customCountersEnd.begin() + index);
833  this->config.customCountersStep.erase(this->config.customCountersStep.begin() + index);
834  this->config.customCountersAlias.erase(this->config.customCountersAlias.begin() + index);
835  this->config.customCountersAliasAdd.erase(this->config.customCountersAliasAdd.begin() + index);
836 
837  --n;
838 
839  this->warning(
840  "Loop of counter '"
841  + counterName
842  + "' would be infinite, counter removed."
843  );
844  }
845  }
846 
847  // check properties of tokens
848  bool incompleteTokens{false};
849 
850  const auto completeTokens{
851  std::min({ // number of complete tokens (= min. size of arrays)
852  this->config.customTokens.size(),
853  this->config.customTokensSource.size(),
854  this->config.customTokensQuery.size()
855  })
856  };
857 
858  // remove token variable names that are not used
859  if(this->config.customTokens.size() > completeTokens) {
860  this->config.customTokens.resize(completeTokens);
861 
862  incompleteTokens = true;
863  }
864 
865  // remove token sources that are not used
866  if(this->config.customTokensSource.size() > completeTokens) {
867  this->config.customTokensSource.resize(completeTokens);
868 
869  incompleteTokens = true;
870  }
871 
872  // remove token queries that are not used
873  if(this->config.customTokensQuery.size() > completeTokens) {
874  this->config.customTokensQuery.resize(completeTokens);
875 
876  incompleteTokens = true;
877  }
878 
879  // warn about incomplete counters
880  if(incompleteTokens) {
881  this->warning(
882  "'custom.tokens', '.tokens.source' and '.tokens.query'"
883  " should have the same number of elements."
884  );
885 
886  this->warning("Incomplete token(s) removed from configuration.");
887 
888  incompleteTokens = false;
889  }
890 
891  // remove cookie headers that are not used, set to empty string where none is specified
892  if(this->config.customTokensCookies.size() > completeTokens) {
893  incompleteTokens = true;
894  }
895 
896  this->config.customTokensCookies.resize(completeTokens);
897 
898  // remove token expiration times that are not used, set to '0' where none is specified
899  if(this->config.customTokensKeep.size() > completeTokens) {
900  incompleteTokens = true;
901  }
902 
903  this->config.customTokensKeep.resize(completeTokens, 0);
904 
905  // remove token POST options that are not used, set to 'false' where none is specified
906  if(this->config.customTokensUsePost.size() > completeTokens) {
907  incompleteTokens = true;
908  }
909 
910  this->config.customTokensUsePost.resize(completeTokens, false);
911 
912  // remove token requirements that are not used, set to 'false' where none is specified
913  if(this->config.customTokensRequired.size() > completeTokens) {
914  incompleteTokens = true;
915  }
916 
917  this->config.customTokensRequired.resize(completeTokens, false);
918 
919  // warn about unused property
920  if(incompleteTokens) {
921  this->warning("Unused token properties removed from configuration.");
922  }
923 
924  // check properties of variables for dynamic redirect
925  bool incompleteVars{false};
926 
927  const auto completeVars{
928  std::min({ // number of complete variables (= min. size of all arrays)
929  this->config.redirectVarNames.size(),
930  this->config.redirectVarQueries.size(),
931  this->config.redirectVarSources.size()
932  })
933  };
934 
935  // remove redirect variable names that are not used
936  if(this->config.redirectVarNames.size() > completeVars) {
937  this->config.redirectVarNames.resize(completeVars);
938 
939  incompleteVars = true;
940  }
941 
942  // remove redirect queries that are not used
943  if(this->config.redirectVarQueries.size() > completeVars) {
944  this->config.redirectVarQueries.resize(completeVars);
945 
946  incompleteVars = true;
947  }
948 
949  // remove redirect sources that are not used
950  if(this->config.redirectVarSources.size() > completeVars) {
951  this->config.redirectVarSources.resize(completeVars);
952 
953  incompleteVars = true;
954  }
955 
956  // warn about incomplete counters
957  if(incompleteVars) {
958  this->warning(
959  "'redirect.var.names', '.var.sources' and '.var.queries'"
960  " should have the same number of elements."
961  );
962 
963  this->warning("Incomplete variable(s) removed form configuration.");
964  }
965  }
966 
968  inline void Config::reset() {
969  this->config = {};
970  }
971 
972 } /* namespace crawlservpp::Module::Crawler */
973 
974 #endif /* MODULE_CRAWLER_CONFIG_HPP_ */
bool customCountersGlobal
Specifies whether to use every counter for all custom URLs.
Definition: Config.hpp:408
std::vector< std::uint64_t > crawlerQueriesWhiteListTypes
If not empty, only content types matching one of these queries will be crawled.
Definition: Config.hpp:287
std::uint64_t crawlerSleepError
Sleeping time on connection errors, in milliseconds.
Definition: Config.hpp:331
std::vector< std::int64_t > customCountersStart
Start value for the counter with the same array index.
Definition: Config.hpp:411
bool crawlerUrlStartupCheck
Specifies whether to check the URL list before starting to crawl.
Definition: Config.hpp:380
std::vector< std::string > crawlerReCrawlAlways
List of URLs that will always be re-crawled.
Definition: Config.hpp:301
void option(const std::string &name, bool &target)
Checks for a configuration option of type bool.
Definition: Config.hpp:573
Configuration entries for crawler threads.
Definition: Config.hpp:130
std::vector< std::uint64_t > crawlerQueriesLinksWhiteListTypes
If not empty, only content types matching one of these queries will be used for link extraction...
Definition: Config.hpp:263
std::vector< std::string > customTokenHeaders
Custom HTTP headers to be used for ALL tokens.
Definition: Config.hpp:423
std::vector< std::uint64_t > customTokensQuery
Query to extract the token with the same array index.
Definition: Config.hpp:435
constexpr std::uint16_t defaultUrlMaxLength
Default maximum length of URLs to add.
Definition: Config.hpp:102
std::vector< bool > customTokensRequired
Determines whether an error occurs if the token with the same array index is empty.
Definition: Config.hpp:438
std::vector< std::string > crawlerParamsAdd
URL parameters that will be added shortly before retrieving the content.
Definition: Config.hpp:182
std::vector< std::string > crawlerParamsBlackList
Parameters in URLs that will be ignored.
Definition: Config.hpp:189
bool crawlerUrlDebug
Specifies whether to perform additional check for duplicates after URL insertion. ...
Definition: Config.hpp:374
std::vector< std::uint64_t > crawlerQueriesLinks
Queries on content to find URLs.
Definition: Config.hpp:223
Class for crawler configuration exceptions.
Definition: Config.hpp:521
std::vector< std::string > crawlerArchivesUrlsMemento
Memento URI templates for archives to crawl.
Definition: Config.hpp:152
std::vector< std::string > crawlerArchivesUrlsSkip
Memento URIs that will always be skipped.
Definition: Config.hpp:155
constexpr std::uint32_t defaultCrawlerLockS
Default time to lock URLs that are being processed, in seconds.
Definition: Config.hpp:72
std::vector< std::string > crawlerParamsWhiteList
Parameters in URLs that will not be ignored.
Definition: Config.hpp:196
std::vector< std::uint64_t > crawlerQueriesLinksWhiteListContent
If not empty, only content matching one of these queries will be used for link extraction.
Definition: Config.hpp:255
bool crawlerArchivesOnly
Specifies whether to crawl archived pages ONLY.
Definition: Config.hpp:145
bool customRobots
Specifies whether to add the sitemaps specified in robots.txt as custom URLs.
Definition: Config.hpp:420
std::vector< std::uint64_t > crawlerQueriesBlackListTypes
Content types matching one of these queries will not be crawled.
Definition: Config.hpp:212
std::vector< std::string > customTokensCookies
Custom HTTP Cookie header for the token with the same array index.
Definition: Config.hpp:429
Abstract class containing the network-specific configuration for threads.
Definition: Config.hpp:121
std::uint32_t crawlerTidyErrors
Number of tidyhtml errors to log (if logging is enabled).
Definition: Config.hpp:353
std::uint64_t redirectQueryUrl
Query on URL that specifies whether to redirect to different URL.
Definition: Config.hpp:485
std::uint8_t crawlerLogging
Level of logging activity.
Definition: Config.hpp:172
std::vector< bool > customTokensUsePost
Use HTTP POST instead of GET for the token with the same array index.
Definition: Config.hpp:444
bool crawlerStartIgnore
Specifies whether to not crawl the start page.
Definition: Config.hpp:350
#define MAIN_EXCEPTION_CLASS()
Macro used to easily define classes for general exceptions.
Definition: Exception.hpp:50
constexpr std::array defaultRetryHttp
HTTP errors that will be handled like connection errors by default.
Definition: Config.hpp:84
Namespace for crawler classes.
Definition: Config.hpp:44
constexpr std::uint8_t crawlerLoggingSilent
Logging is disabled.
Definition: Config.hpp:54
std::vector< std::string > customTokens
List of token variables to be replaced in custom URLs.
Definition: Config.hpp:426
void reset() override
Resets the crawler-specific configuration options.
Definition: Config.hpp:968
std::uint64_t crawlerSleepMySql
Time to wait before trying to re-connect to the MySQL server, in seconds.
Definition: Config.hpp:344
bool crawlerRepairCData
Specifies whether to (try to) repair CData when parsing HTML/XML.
Definition: Config.hpp:307
std::vector< std::string > crawlerArchivesUrlsTimemap
Timemap URI template for archives to crawl.
Definition: Config.hpp:162
constexpr std::uint64_t defaultUrlChunks
Default number of crawled URLs to be processed at once without possible interruption.
Definition: Config.hpp:99
std::vector< std::uint64_t > crawlerQueriesWhiteListContent
If not empty, only content matching one of these queries will be crawled.
Definition: Config.hpp:279
std::string redirectTo
Sub-URL (for cross-domain websites: URL without protocol) to redirect to.
Definition: Config.hpp:488
std::vector< std::uint64_t > crawlerQueriesBlackListUrls
URLs matching one of these queries will not be crawled.
Definition: Config.hpp:220
bool crawlerRemoveXmlInstructions
Specifies whether to remove XML processing instructions (<?xml:...>) before parsing HTML content...
Definition: Config.hpp:313
std::vector< std::string > customUrls
Custom URLs for crawling (should all start with / except for cross-domain websites).
Definition: Config.hpp:447
std::vector< std::int64_t > customCountersStep
Step value for the counter with the same array index.
Definition: Config.hpp:414
constexpr std::uint8_t crawlerLoggingDefault
Default logging is enabled.
Definition: Config.hpp:57
constexpr std::uint8_t redirectSourceContent
Performing a query on the content of a crawled web page to determine whether to redirect.
Definition: Config.hpp:69
std::vector< std::uint8_t > redirectVarSources
Source type of the variable with the same index.
Definition: Config.hpp:510
bool crawlerRetryArchive
Specifies whether to re-try when retrieving the archived pages fails.
Definition: Config.hpp:322
bool redirectUsePost
Specifies whether to use HTTP POST instead of HTTP GET to retrieve a URL after redirect.
Definition: Config.hpp:491
constexpr std::int32_t defaultRestartAfter
Default time (in s) after which to re-crawl custom URLs once crawling has been completed (-1=deactiva...
Definition: Config.hpp:78
std::uint16_t crawlerUrlMaxLength
Maximum length of URLs to add.
Definition: Config.hpp:377
std::vector< std::uint64_t > crawlerQueriesLinksBlackListUrls
URLs matching one of these queries will not be used for link extraction.
Definition: Config.hpp:247
void parseOption() override
Parses a crawler-specific configuration option.
Definition: Config.hpp:559
bool crawlerUrlCaseSensitive
Specifies whether URLs are case-sensitive.
Definition: Config.hpp:365
std::string crawlerStart
Starting point for crawling (should start with / except for cross-domain websites).
Definition: Config.hpp:347
std::uint64_t redirectQueryContent
Query on content that specifies whether to redirect to different URL.
Definition: Config.hpp:482
void category(const std::string &category)
Sets the category of the subsequent configuration items to be checked for.
Definition: Config.hpp:527
std::uint64_t crawlerSleepIdle
Time that will be waited before checking for new URLs when all URLs have been crawled, in milliseconds.
Definition: Config.hpp:337
std::vector< std::uint64_t > crawlerQueriesLinksBlackListContent
Content matching one of these queries will not be used for link extraction.
Definition: Config.hpp:231
std::vector< std::uint64_t > crawlerQueriesLinksBlackListTypes
Content types matching one of these queries will not be used for link extraction. ...
Definition: Config.hpp:239
std::vector< std::uint32_t > customTokensKeep
Time until the token with the same array index gets invalid, in seconds.
Definition: Config.hpp:432
std::uint32_t crawlerLock
Time for which to lock URLs that are currently being processed, in seconds.
Definition: Config.hpp:165
std::uint64_t crawlerSleepHttp
Time that will be waited between HTTP requests, in milliseconds.
Definition: Config.hpp:334
bool crawlerTidyWarnings
Specifies whether to log tidyhtml warnings (if logging is enabled).
Definition: Config.hpp:356
std::string redirectCookies
Custom HTTP Cookie header on dynamic redirect.
Definition: Config.hpp:476
std::uint64_t crawlerUrlChunks
Number of crawled URLs to be processed at once without possible interruption.
Definition: Config.hpp:368
constexpr std::uint64_t defaultSleepErrorMs
Default sleeping time on connection errors, in milliseconds.
Definition: Config.hpp:87
bool crawlerTiming
Specifies whether to calculate timing statistics for the crawler.
Definition: Config.hpp:359
std::vector< std::string > customCountersAlias
Alias for the counter variable with the same array index.
Definition: Config.hpp:396
std::vector< std::uint64_t > redirectVarQueries
Query on variable source to retrieve the value of the variable with the same index.
Definition: Config.hpp:503
bool customReCrawl
Specifies whether to always re-crawl custom URLs.
Definition: Config.hpp:417
constexpr std::int64_t defaultReTries
Default number of re-tries on connection errors.
Definition: Config.hpp:81
std::vector< std::uint64_t > crawlerQueriesBlackListContent
Content matching one of these queries will not be crawled.
Definition: Config.hpp:204
bool crawlerXml
Specifies whether to always save crawled content as cleaned XML.
Definition: Config.hpp:386
constexpr std::uint8_t redirectSourceUrl
Performing a query on the URL of a crawled web page to determine whether to redirect.
Definition: Config.hpp:66
bool crawlerArchives
Specifies whether to crawl archived pages.
Definition: Config.hpp:135
std::vector< std::uint32_t > crawlerRetryHttp
HTTP errors that will be handled like connection errors.
Definition: Config.hpp:328
constexpr std::uint8_t crawlerLoggingVerbose
Verbose logging is enabled.
Definition: Config.hpp:63
struct crawlservpp::Module::Crawler::Config::Entries config
Configuration of the crawler.
constexpr std::uint64_t defaultSleepHttpMs
Default time that will be waited between HTTP requests, in milliseconds.
Definition: Config.hpp:90
bool expectedErrorIfSmaller
Specifies whether to throw an exception when the number of URLs falls short of the expected number.
Definition: Config.hpp:466
bool crawlerReCrawlStart
Specifies whether to re-crawl the start page every time to keep the URL list up-to-date.
Definition: Config.hpp:304
std::vector< std::string > redirectHeaders
Custom HTTP headers on dynamic redirect.
Definition: Config.hpp:479
bool customUsePost
Specifies whether to use HTTP POST instead of HTTP GET to retrieve custom URLs.
Definition: Config.hpp:455
bool crawlerRetryEmpty
Specifies whether empty responses will be handled like connection errors.
Definition: Config.hpp:325
bool expectedErrorIfLarger
Specifies whether to throw an exception when number of expected URLs is exceeded. ...
Definition: Config.hpp:463
std::uint64_t expectedQuery
Query to be performed on content to retrieve the expected number of URLs.
Definition: Config.hpp:469
bool crawlerRepairComments
Specifies whether to (try to) repair broken HTML/XML comments.
Definition: Config.hpp:310
bool crawlerReCrawl
Specifies whether to re-crawl already crawled URLs.
Definition: Config.hpp:298
std::vector< std::string > redirectVarNames
Variable names to be replaced on redirect.
Definition: Config.hpp:500
void setCrossDomain(bool isCrossDomain)
Sets whether the corresponding website is cross-domain.
Definition: Config.hpp:550
std::vector< std::int64_t > customCountersEnd
End value for the counter with the same array index.
Definition: Config.hpp:402
std::vector< std::uint64_t > crawlerQueriesLinksWhiteListUrls
If not empty, only URLs matching one of these queries will be used for link extraction.
Definition: Config.hpp:271
std::vector< std::uint64_t > customCountersAliasAdd
Value to add to the counter alias with the same array index.
Definition: Config.hpp:399
Configuration for crawlers.
Definition: Config.hpp:111
void checkOptions() override
Checks the crawler-specific configuration options.
Definition: Config.hpp:676
constexpr std::uint16_t defaultMaxBatchSize
Default number of URLs to be processed in one MySQL query.
Definition: Config.hpp:75
std::int64_t crawlerReTries
Number of re-tries on connection errors (-1=infinite).
Definition: Config.hpp:319
bool crawlerWarningsFile
Specifies whether to warn when files are found (as opposed to folders).
Definition: Config.hpp:383
std::uint16_t crawlerMaxBatchSize
Maximum number of URLs processed in one MySQL query.
Definition: Config.hpp:175
std::vector< std::string > customTokensSource
Source URL for the token with the same array index (absolute link without protocol).
Definition: Config.hpp:441
void warning(const std::string &warning)
Adds a warning to the logging queue.
Definition: Config.hpp:2427
std::vector< std::string > customCounters
List of counter variables to be replaced in custom URLs.
Definition: Config.hpp:393
constexpr std::uint64_t defaultSleepIdleMs
Default time that will be waited before checking for new URLs when all URLs have been crawled, in milliseconds.
Definition: Config.hpp:93
std::vector< std::uint64_t > crawlerQueriesWhiteListUrls
If not empty, only URLs matching one of these queries will be crawled.
Definition: Config.hpp:295
std::int32_t crawlerRestartAfter
Time (in s) after which to re-crawl custom URLs once crawling has been completed (-1=deactivated).
Definition: Config.hpp:316
constexpr std::uint64_t defaultSleepMySqlS
Default time to wait before the first try to re-connect to the MySQL server, in seconds.
Definition: Config.hpp:96
constexpr std::uint8_t crawlerLoggingExtended
Extended logging is enabled.
Definition: Config.hpp:60
std::vector< std::string > crawlerArchivesNames
Names of archives to crawl.
Definition: Config.hpp:138