crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
Config.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2020 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * Config.hpp
24  *
25  * Network configuration. This class is used by both the crawler and the extractor.
26  *
27  * WARNING: Changing the configuration requires updating 'json/include/network.json'
28  * in crawlserv_frontend! See there for details on the specific configuration entries.
29  *
30  * Created on: Jan 8, 2019
31  * Author: ans
32  */
33 
34 #ifndef NETWORK_CONFIG_HPP_
35 #define NETWORK_CONFIG_HPP_
36 
37 #include "../Module/Config.hpp"
38 
39 #include <cstdint> // std::int64_t, std::uint16_t, std::uint64_t
40 #include <string> // std::string
41 #include <string_view> // std::string_view_literals
42 #include <vector> // std::vector
43 
//! Namespace for networking classes.
45  namespace crawlservpp::Network {
46 
47  /*
48  * CONSTANTS
49  */
50 
    // Makes the ""sv literal suffix available for the string-view constant below.
51  using std::string_view_literals::operator""sv;
52 
55 
    //! Use any available HTTP version.
57  inline constexpr std::uint16_t httpVersionAny{0};
58 
    //! Use HTTP/1 only.
60  inline constexpr std::uint16_t httpVersion1{1};
61 
    //! Use HTTP/1.1 only.
63  inline constexpr std::uint16_t httpVersion11{2};
64 
    //! Attempt to use HTTP/2, fall back to HTTP/1.1.
66  inline constexpr std::uint16_t httpVersion2{3};
67 
    //! Use non-TLS HTTP/2, even if HTTPS is not available.
69  inline constexpr std::uint16_t httpVersion2Only{4};
70 
    //! Attempt to use HTTP/2 over TLS, fall back to HTTP/1.1.
72  inline constexpr std::uint16_t httpVersion2Tls{5};
73 
75 
    //! Use HTTP/3 only.
78  inline constexpr std::uint16_t httpVersion3Only{6};
79 
    //! Default maximum number of connections.
81  inline constexpr std::uint16_t defaultConnectionsMax{5};
82 
    //! Default lifetime of DNS cache entries.
84  inline constexpr std::int64_t defaultDnsCacheTimeOut{60};
85 
    //! Default maximum number of automatic redirects.
87  inline constexpr std::uint64_t defaultRedirectMax{20};
88 
    //! Default number of seconds that need to have passed before requesting a new TOR identity.
90  inline constexpr std::uint64_t defaultResetTorOnlyAfter{60};
91 
    //! Default delay that will be waited before sending keep-alive probes, in seconds.
93  inline constexpr std::uint64_t defaultTcpKeepAliveIdle{60};
94 
    //! Default interval for TCP keep-alive probing, in seconds.
96  inline constexpr std::uint64_t defaultTcpKeepAliveInterval{60};
97 
    //! Default connecting time-out, in seconds.
99  inline constexpr std::uint64_t defaultTimeOut{300};
100 
    //! Default request time-out, in seconds.
102  inline constexpr std::uint64_t defaultTimeOutRequest{300};
103 
    //! Default protocol.
105  inline constexpr auto defaultProtocol{"https://"sv};
106 
108 
109  /*
110  * DECLARATION
111  */
112 
114 
121  class Config : protected Module::Config {
122  public:
125 
127 
132  struct Entries {
135 
138 
140  bool contentLengthIgnore{false};
141 
143 
155  bool cookies{false};
156 
158 
164  std::string cookiesLoad;
165 
167 
171  std::vector<std::string> cookiesOverwrite;
172 
174 
180  std::string cookiesSave;
181 
183 
187  bool cookiesSession{true};
188 
190 
197  std::string cookiesSet;
198 
200 
204 
206 
209  std::string dnsDoH;
210 
212 
215  std::string dnsInterface;
216 
218 
229  std::vector<std::string> dnsResolves;
230 
232  std::vector<std::string> dnsServers;
233 
235  bool dnsShuffle{false};
236 
238 
247  bool encodingBr{true};
248 
250 
259  bool encodingDeflate{true};
260 
262 
271  bool encodingGZip{true};
272 
274 
283  bool encodingIdentity{true};
284 
286  bool encodingTransfer{false};
287 
289 
298  bool encodingZstd{false};
299 
301  std::vector<std::string> headers;
302 
304  std::vector<std::string> http200Aliases;
305 
307 
313  std::uint16_t httpVersion{httpVersion2Tls};
314 
316 
319  std::string localInterface;
320 
322 
327  std::uint16_t localPort{};
328 
330 
340  std::uint16_t localPortRange{1};
341 
343 
348  bool noReUse{false};
349 
351 
356  std::string proxy;
357 
359 
367  std::string proxyAuth;
368 
370 
373  std::vector<std::string> proxyHeaders;
374 
376 
379  std::string proxyPre;
380 
382 
387  std::string proxyTlsSrpPassword;
388 
390 
396  std::string proxyTlsSrpUser;
397 
399  bool proxyTunnelling{false};
400 
402 
405  bool redirect{true};
406 
408 
414 
416 
419  bool redirectPost301{false};
420 
422 
425  bool redirectPost302{false};
426 
428 
431  bool redirectPost303{false};
432 
434 
440  std::string referer;
441 
443 
446  bool refererAutomatic{false};
447 
449 
452  bool resetTor{true};
453 
455 
464  std::uint64_t resetTorAfter{};
465 
467 
471 
473 
476  std::uint64_t speedDownLimit{};
477 
479 
484  std::uint64_t speedLowLimit{};
485 
487 
493  std::uint64_t speedLowTime{};
494 
496 
499  std::uint64_t speedUpLimit{};
500 
502 
506  bool sslVerifyHost{true};
507 
509 
513  bool sslVerifyPeer{true};
514 
516 
520  bool sslVerifyProxyHost{true};
521 
523 
527  bool sslVerifyProxyPeer{true};
528 
530 
542  bool sslVerifyStatus{false};
543 
545  bool tcpFastOpen{false};
546 
548  bool tcpKeepAlive{false};
549 
551 
555 
557 
561 
563 
568  bool tcpNagle{false};
569 
571 
577  std::uint64_t timeOut{defaultTimeOut};
578 
580 
588  std::uint16_t timeOutHappyEyeballs{};
589 
591 
598 
600 
606  std::string tlsSrpUser;
607 
609 
615  std::string tlsSrpPassword;
616 
618 
621  std::string userAgent;
622 
624 
630  bool verbose{false};
631 
633 
637  std::string protocol{defaultProtocol};
638  }
639 
642 
646 
647  void parseBasicOption() override;
648  void resetBase() override;
649 
651 
658  void parseOption() override = 0;
659 
661 
668  void reset() override = 0;
669 
673 
674  [[nodiscard]] const std::string& getProtocol() const;
675 
677  };
678 
679  /*
680  * IMPLEMENTATION
681  */
682 
683  /*
684  * PARSING (NETWORK CONFIGURATION)
685  */
686 
//! Parses basic network configuration options.
/*
 * Selects the "network" category and binds every option name of that category
 * to the corresponding field of networkConfig. The "insecure" option is read
 * into a local flag instead of a field. Finishes by calling parseOption().
 */
688  inline void Config::parseBasicOption() {
     // all option names below are looked up inside the "network" category
689  this->category("network");
690 
691  this->option("connections.max", this->networkConfig.connectionsMax);
692  this->option("contentlength.ignore", this->networkConfig.contentLengthIgnore);
693  this->option("cookies", this->networkConfig.cookies);
694  this->option("cookies.load", this->networkConfig.cookiesLoad);
695  this->option("cookies.overwrite", this->networkConfig.cookiesOverwrite);
696  this->option("cookies.save", this->networkConfig.cookiesSave);
697  this->option("cookies.session", this->networkConfig.cookiesSession);
698  this->option("cookies.set", this->networkConfig.cookiesSet);
699  this->option("dns.cachetimeout", this->networkConfig.dnsCacheTimeOut);
700  this->option("dns.doh", this->networkConfig.dnsDoH);
701  this->option("dns.interface", this->networkConfig.dnsInterface);
702  this->option("dns.resolves", this->networkConfig.dnsResolves);
703  this->option("dns.servers", this->networkConfig.dnsServers);
704  this->option("dns.shuffle", this->networkConfig.dnsShuffle);
705  this->option("encoding.br", this->networkConfig.encodingBr);
706  this->option("encoding.deflate", this->networkConfig.encodingDeflate);
707  this->option("encoding.gzip", this->networkConfig.encodingGZip);
708  this->option("encoding.identity", this->networkConfig.encodingIdentity);
709  this->option("encoding.transfer", this->networkConfig.encodingTransfer);
710  this->option("encoding.zstd", this->networkConfig.encodingZstd);
711  this->option("headers", this->networkConfig.headers);
712  this->option("http.200aliases", this->networkConfig.http200Aliases);
713  this->option("http.version", this->networkConfig.httpVersion);
714  this->option("local.interface", this->networkConfig.localInterface);
715  this->option("local.port", this->networkConfig.localPort);
716  this->option("local.portrange", this->networkConfig.localPortRange);
717  this->option("no.reuse", this->networkConfig.noReUse);
718  this->option("proxy", this->networkConfig.proxy);
719  this->option("proxy.auth", this->networkConfig.proxyAuth);
720  this->option("proxy.headers", this->networkConfig.proxyHeaders);
721  this->option("proxy.pre", this->networkConfig.proxyPre);
722  this->option("proxy.tlssrp.password", this->networkConfig.proxyTlsSrpPassword);
723  this->option("proxy.tlssrp.user", this->networkConfig.proxyTlsSrpUser);
724  this->option("proxy.tunnelling", this->networkConfig.proxyTunnelling);
725  this->option("redirect", this->networkConfig.redirect);
726  this->option("redirect.max", this->networkConfig.redirectMax);
727  this->option("redirect.post301", this->networkConfig.redirectPost301);
728  this->option("redirect.post302", this->networkConfig.redirectPost302);
729  this->option("redirect.post303", this->networkConfig.redirectPost303);
730  this->option("referer", this->networkConfig.referer);
731  this->option("referer.automatic", this->networkConfig.refererAutomatic);
732  this->option("reset.tor", this->networkConfig.resetTor);
733  this->option("reset.tor.after", this->networkConfig.resetTorAfter);
734  this->option("reset.tor.only.after", this->networkConfig.resetTorOnlyAfter);
735  this->option("speed.downlimit", this->networkConfig.speedDownLimit);
736  this->option("speed.lowlimit", this->networkConfig.speedLowLimit);
737  this->option("speed.lowtime", this->networkConfig.speedLowTime);
738  this->option("speed.uplimit", this->networkConfig.speedUpLimit);
739  this->option("ssl.verify.host", this->networkConfig.sslVerifyHost);
740  this->option("ssl.verify.peer", this->networkConfig.sslVerifyPeer);
741  this->option("ssl.verify.proxy.host", this->networkConfig.sslVerifyProxyHost);
742  this->option("ssl.verify.proxy.peer", this->networkConfig.sslVerifyProxyPeer);
743  this->option("ssl.verify.status", this->networkConfig.sslVerifyStatus);
744  this->option("tcp.fastopen", this->networkConfig.tcpFastOpen);
745  this->option("tcp.keepalive", this->networkConfig.tcpKeepAlive);
746  this->option("tcp.keepalive.idle", this->networkConfig.tcpKeepAliveIdle);
747  this->option("tcp.keepalive.interval", this->networkConfig.tcpKeepAliveInterval);
748  this->option("tcp.nagle", this->networkConfig.tcpNagle);
749  this->option("timeout", this->networkConfig.timeOut);
750  this->option("timeout.happyeyeballs", this->networkConfig.timeOutHappyEyeballs);
751  this->option("timeout.request", this->networkConfig.timeOutRequest);
752  this->option("tlssrp.password", this->networkConfig.tlsSrpPassword);
753  this->option("tlssrp.user", this->networkConfig.tlsSrpUser);
754  this->option("useragent", this->networkConfig.userAgent);
755  this->option("verbose", this->networkConfig.verbose);
756 
     // "insecure" is not stored as a field: it is read into a local flag that
     // only decides which protocol prefix ends up in networkConfig.protocol
757  bool insecure{false};
758 
759  this->option("insecure", insecure);
760 
761  if(insecure) {
762  this->warning("Using INSECURE connections.");
763 
     // switch the protocol prefix to plain HTTP; otherwise protocol is left untouched
764  this->networkConfig.protocol = "http://";
765  }
766 
     // hand over to the derived class for its additional options
767  this->parseOption();
768  }
769 
//! Resets basic network configuration options.
/*
 * Overwrites networkConfig with a freshly value-initialized Entries object, so
 * every field falls back to its default member initializer, then calls reset()
 * so that the derived class can reset its additional options.
 */
771  inline void Config::resetBase() {
772  this->networkConfig = {};
773 
774  this->reset();
775  }
776 
777  /*
778  * HELPER (NETWORK CONFIGURATION)
779  */
780 
782 
//! Gets the protocol to be used for networking.
/*
 * Returns a constant reference to networkConfig.protocol; the reference refers
 * to a member of this configuration object.
 */
787  inline const std::string& Config::getProtocol() const {
788  return this->networkConfig.protocol;
789  }
790 
791 } /* namespace crawlservpp::Network */
792 
793 #endif /* NETWORK_CONFIG_HPP_ */
constexpr std::uint64_t defaultTimeOutRequest
Default request time-out, in seconds.
Definition: Config.hpp:102
std::uint64_t resetTorAfter
Number of seconds until automatically using the TOR control server to request a new identity...
Definition: Config.hpp:464
bool encodingIdentity
Specifies whether to (also) request non-compressed encoding for requested content.
Definition: Config.hpp:283
std::vector< std::string > headers
Custom HTTP headers to be sent with every request.
Definition: Config.hpp:301
std::vector< std::string > http200Aliases
Aliases that will be treated like HTTP/1.0 200 OK.
Definition: Config.hpp:304
bool cookiesSession
Specifies whether to ignore obsolete session cookies.
Definition: Config.hpp:187
constexpr std::uint16_t httpVersion2
Attempt to use HTTP/2, fall back to HTTP/1.1.
Definition: Config.hpp:66
void option(const std::string &name, bool &target)
Checks for a configuration option of type bool.
Definition: Config.hpp:573
std::uint64_t speedUpLimit
Maximum upload speed in bytes per second.
Definition: Config.hpp:499
std::string proxyAuth
Authentication for the proxy server used.
Definition: Config.hpp:367
bool sslVerifyProxyHost
Specifies whether to verify that the SSL certificate is for the proxy server it is known as...
Definition: Config.hpp:520
std::string tlsSrpPassword
Password used for TLS-SRP authentication.
Definition: Config.hpp:615
std::uint16_t httpVersion
HTTP version(s) to be used.
Definition: Config.hpp:313
std::string protocol
The protocol to be used for HTTP requests.
Definition: Config.hpp:637
std::string proxy
Proxy server used.
Definition: Config.hpp:356
std::uint64_t speedDownLimit
Maximum download speed in bytes per second.
Definition: Config.hpp:476
std::int64_t dnsCacheTimeOut
The lifetime of DNS cache entries.
Definition: Config.hpp:203
constexpr std::uint16_t httpVersion3Only
Use HTTP/3 only.
Definition: Config.hpp:78
constexpr std::uint16_t httpVersion2Tls
Attempt to use HTTP/2 over TLS, fall back to HTTP/1.1.
Definition: Config.hpp:72
std::string localInterface
Interface to be used for outgoing traffic.
Definition: Config.hpp:319
bool dnsShuffle
Specifies whether to shuffle addresses when a host name returns more than one.
Definition: Config.hpp:235
std::string dnsDoH
The URL of a custom DNS-over-HTTPS (DoH) server.
Definition: Config.hpp:209
constexpr std::uint64_t defaultRedirectMax
Default maximum number of automatic redirects.
Definition: Config.hpp:87
std::uint16_t localPort
Port to be used for outgoing traffic.
Definition: Config.hpp:327
struct crawlservpp::Network::Config::Entries networkConfig
Configuration for networking.
const std::string & getProtocol() const
Gets the protocol to be used for networking.
Definition: Config.hpp:787
std::string tlsSrpUser
User name used for TLS-SRP authentication.
Definition: Config.hpp:606
std::string referer
The HTTP Referer header to be set.
Definition: Config.hpp:440
Abstract class as base for module-specific configurations.
Definition: Config.hpp:122
Abstract class containing the network-specific configuration for threads.
Definition: Config.hpp:121
bool contentLengthIgnore
Specifies whether the Content-Length header in HTTP responses will be ignored.
Definition: Config.hpp:140
bool encodingZstd
Specifies whether to request Zstandard encoding for requested content.
Definition: Config.hpp:298
bool tcpNagle
Specifies whether TCP's Nagle algorithm is enabled on this connection.
Definition: Config.hpp:568
bool sslVerifyHost
Specifies whether to verify that the SSL certificate is for the server it is known as...
Definition: Config.hpp:506
bool sslVerifyPeer
Specifies whether to verify the authenticity of the server's SSL certificate.
Definition: Config.hpp:513
bool redirect
Specifies whether to follow HTTP Location headers for automatic redirects.
Definition: Config.hpp:405
constexpr std::uint16_t httpVersion2Only
Use non-TLS HTTP/2, even if HTTPS is not available.
Definition: Config.hpp:69
std::uint64_t speedLowLimit
Low speed limit in bytes per second.
Definition: Config.hpp:484
std::string proxyPre
Pre-proxy server to be used.
Definition: Config.hpp:379
void reset() override=0
Resets additional configuration options.
bool tcpFastOpen
Specifies whether TCP Fast Open will be enabled.
Definition: Config.hpp:545
std::uint16_t localPortRange
Number of ports to be tried for outgoing traffic.
Definition: Config.hpp:340
std::uint64_t timeOutRequest
The maximum amount of time a request is allowed to take, in seconds.
Definition: Config.hpp:597
constexpr std::uint16_t httpVersion11
Use HTTP/1.1 only.
Definition: Config.hpp:63
bool cookies
Specifies whether the internal cookie engine will be enabled.
Definition: Config.hpp:155
constexpr std::uint64_t defaultTimeOut
Default connecting time-out, in seconds.
Definition: Config.hpp:99
bool encodingGZip
Specifies whether to request gzip encoding for requested content.
Definition: Config.hpp:271
constexpr std::uint16_t httpVersion1
Use HTTP/1 only.
Definition: Config.hpp:60
std::uint64_t speedLowTime
Number of seconds before a timeout occurs while the transfer speed is below the low speed limit...
Definition: Config.hpp:493
std::string cookiesLoad
The file from which cookies will be read.
Definition: Config.hpp:164
constexpr std::int64_t defaultDnsCacheTimeOut
Default lifetime of DNS cache entries.
Definition: Config.hpp:84
constexpr std::uint16_t defaultConnectionsMax
Default maximum number of connections.
Definition: Config.hpp:81
bool sslVerifyProxyPeer
Specifies whether to verify the authenticity of the proxy's SSL certificate.
Definition: Config.hpp:527
bool verbose
Specifies whether verbose output should be produced for network transfers.
Definition: Config.hpp:630
bool redirectPost303
Specifies whether to NOT convert POST to GET requests when following 303 redirects.
Definition: Config.hpp:431
void parseBasicOption() override
Parses basic network configuration options.
Definition: Config.hpp:688
bool refererAutomatic
Specifies whether to send an updated HTTP Referer header when automatically redirected.
Definition: Config.hpp:446
void category(const std::string &category)
Sets the category of the subsequent configuration items to be checked for.
Definition: Config.hpp:527
bool redirectPost301
Specifies whether to NOT convert POST to GET requests when following 301 redirects.
Definition: Config.hpp:419
Configuration entries for networking.
Definition: Config.hpp:132
bool resetTor
Specifies whether to use the TOR control server to request a new identity on connection resets...
Definition: Config.hpp:452
bool sslVerifyStatus
Specifies whether to verify the status of the server's SSL certificate.
Definition: Config.hpp:542
constexpr auto defaultProtocol
Default protocol.
Definition: Config.hpp:105
bool encodingDeflate
Specifies whether to request DEFLATE encoding for requested content.
Definition: Config.hpp:259
std::string proxyTlsSrpPassword
TLS-SRP password for the proxy server used.
Definition: Config.hpp:387
bool proxyTunnelling
Specifies whether to enable proxy tunnelling.
Definition: Config.hpp:399
bool redirectPost302
Specifies whether to NOT convert POST to GET requests when following 302 redirects.
Definition: Config.hpp:425
std::vector< std::string > proxyHeaders
Custom HTTP headers to be sent to the proxy server.
Definition: Config.hpp:373
std::uint64_t tcpKeepAliveIdle
The delay that will be waited before sending keep-alive probes, in seconds.
Definition: Config.hpp:554
bool encodingTransfer
Specifies whether to request HTTP Transfer Encoding.
Definition: Config.hpp:286
std::string userAgent
Custom HTTP User-Agent header to be sent with all HTTP requests.
Definition: Config.hpp:621
std::vector< std::string > dnsResolves
DNS name resolves to be overwritten.
Definition: Config.hpp:229
void resetBase() override
Resets basic network configuration options.
Definition: Config.hpp:771
std::uint64_t tcpKeepAliveInterval
The interval between keep-alive probes to be sent, in seconds.
Definition: Config.hpp:560
std::uint16_t timeOutHappyEyeballs
Number of milliseconds to try to connect only via IPv6 using the Happy Eyeballs algorithm.
Definition: Config.hpp:588
constexpr std::uint16_t httpVersionAny
Use any available HTTP version.
Definition: Config.hpp:57
std::string cookiesSet
Custom HTTP Cookie header independent from the internal cookie engine.
Definition: Config.hpp:197
std::uint16_t connectionsMax
The maximum number of parallel connections.
Definition: Config.hpp:137
std::string proxyTlsSrpUser
TLS-SRP user for the proxy server used.
Definition: Config.hpp:396
bool tcpKeepAlive
Specifies whether TCP keep-alive probing will be enabled.
Definition: Config.hpp:548
std::string dnsInterface
The interface that DNS name resolves should be bound to.
Definition: Config.hpp:215
bool noReUse
Specifies whether to prevent connections from re-using previous ones.
Definition: Config.hpp:348
std::uint64_t timeOut
The maximum amount of time a connection is allowed to take, in seconds.
Definition: Config.hpp:577
std::vector< std::string > dnsServers
DNS servers to be preferred.
Definition: Config.hpp:232
std::uint64_t resetTorOnlyAfter
Number of seconds that need to have passed before a new identity will be requested from the TOR control s...
Definition: Config.hpp:470
constexpr std::uint64_t defaultResetTorOnlyAfter
Default number of seconds that need to have been passed before requesting a new TOR identity...
Definition: Config.hpp:90
std::uint64_t redirectMax
The maximum number of automatic redirects.
Definition: Config.hpp:413
bool encodingBr
Specifies whether to request brotli encoding for requested content.
Definition: Config.hpp:247
std::vector< std::string > cookiesOverwrite
Cookies to be overwritten.
Definition: Config.hpp:171
void parseOption() override=0
Parses additional configuration options.
void warning(const std::string &warning)
Adds a warning to the logging queue.
Definition: Config.hpp:2427
Namespace for networking classes.
Definition: Config.hpp:45
constexpr std::uint64_t defaultTcpKeepAliveInterval
Default interval for TCP Keep-alive probing, in seconds.
Definition: Config.hpp:96
std::string cookiesSave
The file to which cookies will be saved.
Definition: Config.hpp:180
constexpr std::uint64_t defaultTcpKeepAliveIdle
Default delay that will be waited before sending keep-alive probes, in seconds.
Definition: Config.hpp:93