crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
Config.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2020 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * Config.hpp
24  *
25  * Extracting configuration.
26  *
27  * Created on: May 9, 2019
28  * Author: ans
29  */
30 
31 #ifndef MODULE_EXTRACTOR_CONFIG_HPP_
32 #define MODULE_EXTRACTOR_CONFIG_HPP_
33 
34 #include "../../Network/Config.hpp"
35 
36 #include <algorithm> // std::min std::replace_if
37 #include <array> // std::array
38 #include <cstdint> // std::int64_t, std::uint8_t, std::uint32_t, std::uint64_t
39 #include <string> // std::string
40 #include <string_view> // std::string_view, std::string_view_literals
41 #include <vector> // std::vector
42 
45 
46  /*
47  * CONSTANTS
48  */
49 
 // Brings the ""sv literal suffix into scope; it is used by the string view constants below.
50  using std::string_view_literals::operator""sv;
51 
54 
 // NOTE(review): value 0 directly precedes the generalLogging* levels 1-3, but is named
 //  'crawlerLogging...Verbose' -- presumably the lowest (silent) logging level; confirm the name.
56  inline constexpr std::uint8_t crawlerLoggingVerbose{0};
57 
 // Logging level 1 (default, as per its name).
59  inline constexpr std::uint8_t generalLoggingDefault{1};
60 
 // Extended logging is enabled.
62  inline constexpr std::uint8_t generalLoggingExtended{2};
63 
 // Logging level 3 (verbose, as per its name).
65  inline constexpr std::uint8_t generalLoggingVerbose{3};
66 
 // Variable source 0 -- presumably: extract variable value from parsed data; TODO confirm.
68  inline constexpr std::uint8_t variablesSourcesParsed{0};
69 
 // Extract variable value from the content of a crawled web page.
71  inline constexpr std::uint8_t variablesSourcesContent{1};
72 
 // Variable source 2 -- presumably: extract variable value from the URL; TODO confirm.
74  inline constexpr std::uint8_t variablesSourcesUrl{2};
75 
 // Expected-number source 0 -- presumably: use the data being extracted; TODO confirm.
77  inline constexpr std::uint8_t expectedSourceExtracting{0};
78 
 // Extract data from parsed data.
80  inline constexpr std::uint8_t expectedSourceParsed{1};
81 
 // Expected-number source 2 -- presumably: use the crawled content; TODO confirm.
83  inline constexpr std::uint8_t expectedSourceContent{2};
84 
 // HTTP status codes 429, 502, 503 and 504 (as per its name, the defaults for retrying).
86  inline constexpr std::array defaultRetryHttpStatusCodes{429, 502, 503, 504};
87 
 // Protocols that Config::removeProtocolsFromUrl() strips from the beginning of URLs.
89  inline constexpr std::array protocolsToRemove{"http://"sv, "https://"sv};
90 
 // Default cache size (as per its name).
92  inline constexpr std::uint64_t defaultCacheSize{2500};
93 
 // Default value of Entries::generalLock -- presumably in seconds ('S' suffix); TODO confirm.
95  inline constexpr std::uint32_t defaultLockS{300};
96 
 // Default maximum batch size (as per its name).
98  inline constexpr std::uint16_t defaultMaxBatchSize{500};
99 
 // Default number of re-tries (as per its name).
101  inline constexpr std::int64_t defaultReTries{720};
102 
 // Default sleeping time on errors -- presumably in milliseconds ('Ms' suffix); TODO confirm.
104  inline constexpr std::uint64_t defaultSleepErrorMs{10000};
105 
 // Default sleeping time for HTTP requests -- presumably in milliseconds; TODO confirm.
107  inline constexpr std::uint64_t defaultSleepHttpMs{0};
108 
 // Default time to wait before checking for new URLs when all URLs have been processed,
 //  in milliseconds.
110  inline constexpr std::uint64_t defaultSleepIdleMs{5000};
111 
 // Default time to wait before last try to re-connect to MySQL server, in seconds.
113  inline constexpr std::uint64_t defaultSleepMySqlS{60};
114 
116 
 // Default name of the paging variable (as per its name).
125  inline constexpr auto defaultPagingVariable{"$p"sv};
126 
 // Default maximum depth of recursion (as per its name).
128  inline constexpr std::uint64_t defaultRecursiveMaxDepth{100};
129 
131 
132  /*
133  * DECLARATION
134  */
135 
 // Configuration for extractors.
 //
 // Derives (protected) from Network::Config, the abstract class containing the
 //  network-specific configuration for threads, and overrides its parseOption(),
 //  checkOptions() and reset() hooks.
 //
 // NOTE(review): This listing is incomplete. Some lines of the original header are not
 //  shown, including members that parseOption() refers to (e.g. generalCacheSize,
 //  generalLogging, generalMinimizeMemory, generalReTries, generalSleepHttp,
 //  extractingOverwrite, extractingRemoveDuplicates, extractingRepairCData,
 //  expectedSource) and the class for extractor configuration exceptions.
137  class Config : protected Network::Config {
138  // for convenience
140 
141  public:
144 
146 
 // Configuration entries for extractor threads, grouped by option category
 //  (general, variables, paging, source, extracting, linked, expected).
151  struct Entries {
154 
156 
160 
 // --- general options ---
162  bool generalExtractCustom{false};
163 
 // Defaults to defaultLockS.
165  std::uint32_t generalLock{defaultLockS};
166 
169 
172 
175 
177  bool generalReExtract{false};
178 
 // Name of table to save extracted data to (checkOptions() throws if it is empty).
180  std::string generalTargetTable;
181 
183 
189 
 // NOTE(review): the initializer lines are missing from this listing --
 //  presumably initialized from defaultRetryHttpStatusCodes; TODO confirm.
191  std::vector<std::uint32_t> generalRetryHttp{
194  };
195 
198 
201 
204 
207 
209 
213  std::uint32_t generalTidyErrors{};
214 
216 
220  bool generalTidyWarnings{false};
221 
223  bool generalTiming{false};
224 
228 
230 
 // --- variables (lists are matched by array index; checkOptions() brings them to the same size) ---
 // Alias for the variable with same array index.
239  std::vector<std::string> variablesAlias;
240 
242 
248  std::vector<std::int64_t> variablesAliasAdd;
249 
251 
 // Date/time format to be used for the variable with the same array index.
267  std::vector<std::string> variablesDateTimeFormat;
268 
270 
278  std::vector<std::string> variablesDateTimeLocale;
279 
281  std::vector<std::uint64_t> variablesSkipQuery;
282 
284 
293  std::vector<std::string> variablesName;
294 
296 
302  std::vector<std::string> variablesParsedColumn;
303 
305 
312  std::vector<std::string> variablesParsedTable;
313 
315 
 // Query on the content or URL for the variable with the same array index.
322  std::vector<std::uint64_t> variablesQuery;
323 
325 
 // presumably one of the variablesSources* constants per variable; TODO confirm
340  std::vector<std::uint8_t> variablesSource;
341 
343 
355  std::vector<std::string> variablesTokens;
356 
358 
361  std::vector<std::string> variablesTokensCookies;
362 
364 
 // Query to extract token variable with the same array index.
367  std::vector<std::uint64_t> variablesTokensQuery;
368 
370 
 // checkOptions() removes "http://" and "https://" from the beginning of these sources.
398  std::vector<std::string> variablesTokensSource;
399 
401 
404  std::vector<bool> variablesTokensUsePost;
405 
407 
 // Set via the option "token.headers" (not resized together with the token lists).
410  std::vector<std::string> variablesTokenHeaders;
411 
415 
417 
 // --- paging ---
426  std::string pagingAlias;
427 
429 
435  std::int64_t pagingAliasAdd{};
436 
438 
441  std::int64_t pagingFirst{};
442 
444 
452  std::string pagingFirstString;
453 
455 
459  std::uint64_t pagingIsNextFrom{};
460 
462 
 // Query on page content to find the number(s) or name(s) of additional pages.
471  std::uint64_t pagingNextFrom{};
472 
474 
 // Query to determine the total number of pages from the content of the first page.
484  std::uint64_t pagingNumberFrom{};
485 
487 
491  std::int64_t pagingStep{1};
492 
494 
504 
508 
 // --- source ---
510  std::string sourceCookies;
511 
513  std::vector<std::string> sourceHeaders;
514 
516 
 // URL to retrieve data from (checkOptions() strips a leading "http://" or "https://").
521  std::string sourceUrl;
522 
524 
 // checkOptions() strips a leading "http://" or "https://" from this URL as well.
531  std::string sourceUrlFirst;
532 
534 
541  bool sourceUsePost{false};
542 
546 
548 
 // --- extracting ---
 // Queries to extract datasets.
552  std::vector<std::uint64_t> extractingDatasetQueries;
553 
555 
 // Format of date/time to be extracted by the date/time query with the same array index
 //  (checkOptions() replaces empty formats with "%F %T").
572  std::vector<std::string> extractingDateTimeFormats;
573 
575 
580  std::vector<std::string> extractingDateTimeLocales;
581 
583 
590  std::vector<std::uint64_t> extractingDateTimeQueries;
591 
593 
597  std::vector<std::uint64_t> extractingErrorFail;
598 
600 
604  std::vector<std::uint64_t> extractingErrorRetry;
605 
607 
623  std::vector<std::string> extractingFieldDateTimeFormats;
624 
626 
 // Locale used when converting the field with the same array index to a date/time.
634  std::vector<std::string> extractingFieldDateTimeLocales;
635 
637 
 // checkOptions() replaces missing or empty ('\0') delimiters with '\n'.
643  std::vector<char> extractingFieldDelimiters;
644 
646 
 // Specifies whether to ignore empty values when parsing multiple results for a field
 //  (checkOptions() sets it to true where none is specified).
651  std::vector<bool> extractingFieldIgnoreEmpty;
652 
654 
657  std::vector<bool> extractingFieldJSON;
658 
660 
696  std::vector<std::string> extractingFieldNames;
697 
699 
702  std::vector<std::uint64_t> extractingFieldQueries;
703 
705 
708  std::vector<bool> extractingFieldTidyTexts;
709 
711 
717  std::vector<bool> extractingFieldWarningsEmpty;
718 
720  std::vector<std::string> extractingIdIgnore;
721 
723 
728  std::vector<std::uint64_t> extractingIdQueries;
729 
732 
734 
 // Queries for extracting more datasets from a dataset.
738  std::vector<std::uint64_t> extractingRecursive;
739 
742 
745 
748 
751 
754 
756  std::uint64_t extractingSkipQuery{};
757 
761 
763 
 // --- linked data ---
769  std::vector<std::uint64_t> linkedDatasetQueries;
770 
772 
788  std::vector<std::string> linkedDateTimeFormats;
789 
791 
799  std::vector<std::string> linkedDateTimeLocales;
800 
802 
 // checkOptions() replaces missing or empty ('\0') delimiters with '\n'.
808  std::vector<char> linkedDelimiters;
809 
811 
863  std::vector<std::string> linkedFieldNames;
864 
866 
 // Query used to extract the custom field with the same array index from the dataset.
869  std::vector<std::uint64_t> linkedFieldQueries;
870 
872 
 // IDs of linked data to be ignored.
875  std::vector<std::string> linkedIdIgnore;
876 
878 
885  std::vector<std::uint64_t> linkedIdQueries;
886 
888 
893  std::vector<bool> linkedIgnoreEmpty;
894 
896 
899  std::vector<bool> linkedJSON;
900 
902 
905  std::string linkedLink;
906 
908  bool linkedOverwrite{true};
909 
911  std::string linkedTargetTable;
912 
914 
917  std::vector<bool> linkedTidyTexts;
918 
920 
926  std::vector<bool> linkedWarningsEmpty;
927 
931 
934 
937 
939 
 // --- expected number of results ---
 // Parsed column containing the expected number of datasets.
946  std::string expectedParsedColumn;
947 
949 
 // Name of the table containing the expected number of datasets.
956  std::string expectedParsedTable;
957 
959 
966  std::uint64_t expectedQuery{};
967 
969 
978 
980  }
981 
 // The configuration entries themselves (reset() restores their default values).
983  config;
984 
986 
989 
990  protected:
993 
 // Overridden hooks: option parsing, checking of the parsed options, and resetting.
994  void parseOption() override;
995  void checkOptions() override;
996  void reset() override;
997 
999 
1000  private:
1001  // internal helper function
1002  static void removeProtocolsFromUrl(std::string& inOut);
1003  };
1004 
1005  /*
1006  * IMPLEMENTATION
1007  */
1008 
1009  /*
1010  * EXTRACTOR-SPECIFIC CONFIGURATION PARSING
1011  */
1012 
 // Parses the extractor-specific configuration options.
 //
 // Selects each category via category() and then calls option() once for every option
 //  name in that category, passing the member of 'config' belonging to that option
 //  (option() checks for a configuration option of the target's type).
1014  inline void Config::parseOption() {
1015  // general options
1016  this->category("general");
1017  this->option("cache.size", this->config.generalCacheSize);
1018  this->option("extract.custom", this->config.generalExtractCustom);
1019  this->option("lock", this->config.generalLock);
1020  this->option("logging", this->config.generalLogging);
1021  this->option("max.batch.size", this->config.generalMaxBatchSize);
1022  this->option("minimize.memory", this->config.generalMinimizeMemory);
1023  this->option("reextract", this->config.generalReExtract);
1024  this->option("retries", this->config.generalReTries);
1025  this->option("retry.http", this->config.generalRetryHttp);
1026  this->option("sleep.error", this->config.generalSleepError);
1027  this->option("sleep.http", this->config.generalSleepHttp);
1028  this->option("sleep.idle", this->config.generalSleepIdle);
1029  this->option("sleep.mysql", this->config.generalSleepMySql);
1030  this->option("target.table", this->config.generalTargetTable);
1031  this->option("tidy.errors", this->config.generalTidyErrors);
1032  this->option("tidy.warnings", this->config.generalTidyWarnings);
1033  this->option("timing", this->config.generalTiming);
1034 
1035  // variables
1036  this->category("variables");
1037  this->option("alias", this->config.variablesAlias);
1038  this->option("alias.add", this->config.variablesAliasAdd);
1039  this->option("datetime.format", this->config.variablesDateTimeFormat);
1040  this->option("datetime.locale", this->config.variablesDateTimeLocale);
1041  this->option("skip.query", this->config.variablesSkipQuery);
1042  this->option("name", this->config.variablesName);
1043  this->option("parsed.column", this->config.variablesParsedColumn);
1044  this->option("parsed.table", this->config.variablesParsedTable);
1045  this->option("query", this->config.variablesQuery);
1046  this->option("source", this->config.variablesSource);
1047  this->option("tokens", this->config.variablesTokens);
1048  this->option("tokens.cookies", this->config.variablesTokensCookies);
1049  this->option("tokens.query", this->config.variablesTokensQuery);
1050  this->option("tokens.source", this->config.variablesTokensSource);
1051  this->option("tokens.use.post", this->config.variablesTokensUsePost);
 // NOTE(review): 'token.headers' is spelled with a singular 'token', unlike the 'tokens.*' options above.
1052  this->option("token.headers", this->config.variablesTokenHeaders);
1053 
1054  // paging
1055  this->category("paging");
1056  this->option("alias", this->config.pagingAlias);
1057  this->option("alias.add", this->config.pagingAliasAdd);
1058  this->option("first", this->config.pagingFirst);
1059  this->option("first.string", this->config.pagingFirstString);
1060  this->option("is.next.from", this->config.pagingIsNextFrom);
1061  this->option("next.from", this->config.pagingNextFrom);
1062  this->option("number.from", this->config.pagingNumberFrom);
1063  this->option("step", this->config.pagingStep);
1064  this->option("variable", this->config.pagingVariable);
1065 
1066  // source
1067  this->category("source");
1068  this->option("cookies", this->config.sourceCookies);
1069  this->option("headers", this->config.sourceHeaders);
1070  this->option("url", this->config.sourceUrl);
1071  this->option("url.first", this->config.sourceUrlFirst);
1072  this->option("use.post", this->config.sourceUsePost);
1073 
1074  // extracting
1075  this->category("extracting");
1076  this->option("dataset.queries", this->config.extractingDatasetQueries);
1077  this->option("datetime.formats", this->config.extractingDateTimeFormats);
1078  this->option("datetime.locales", this->config.extractingDateTimeLocales);
1079  this->option("datetime.queries", this->config.extractingDateTimeQueries);
1080  this->option("error.fail", this->config.extractingErrorFail);
1081  this->option("error.retry", this->config.extractingErrorRetry);
1082  this->option("field.datetime.formats", this->config.extractingFieldDateTimeFormats);
1083  this->option("field.datetime.locales", this->config.extractingFieldDateTimeLocales);
 // NOTE(review): the line with the target argument (1086) is missing from this listing --
 //  presumably this->config.extractingFieldDelimiters, analogous to "delimiters" in "linked".
1084  this->option(
1085  "field.delimiters",
1087  CharParsingOption::FromString
1088  );
1089  this->option("field.ignore.empty", this->config.extractingFieldIgnoreEmpty);
1090  this->option("field.json", this->config.extractingFieldJSON);
1091  this->option("field.names", this->config.extractingFieldNames);
1092  this->option("field.queries", this->config.extractingFieldQueries);
1093  this->option("field.tidy.texts", this->config.extractingFieldTidyTexts);
1094  this->option("field.warnings.empty", this->config.extractingFieldWarningsEmpty);
1095  this->option("id.ignore", this->config.extractingIdIgnore);
1096  this->option("id.queries", this->config.extractingIdQueries);
1097  this->option("overwrite", this->config.extractingOverwrite);
1098  this->option("recursive", this->config.extractingRecursive);
1099  this->option("recursive.max.depth", this->config.extractingRecursiveMaxDepth);
1100  this->option("remove.duplicates", this->config.extractingRemoveDuplicates);
1101  this->option("remove.xml.instructions", this->config.extractingRemoveXmlInstructions);
1102  this->option("repair.cdata", this->config.extractingRepairCData);
1103  this->option("repair.comments", this->config.extractingRepairComments);
1104  this->option("skip.query", this->config.extractingSkipQuery);
1105 
1106  // linked data
1107  this->category("linked");
1108  this->option("dataset.queries", this->config.linkedDatasetQueries);
1109  this->option("datetime.formats", this->config.linkedDateTimeFormats);
1110  this->option("datetime.locales", this->config.linkedDateTimeLocales);
1111  this->option("delimiters", this->config.linkedDelimiters, CharParsingOption::FromString);
1112  this->option("field.names", this->config.linkedFieldNames);
1113  this->option("field.queries", this->config.linkedFieldQueries);
1114  this->option("id.ignore", this->config.linkedIdIgnore);
1115  this->option("id.queries", this->config.linkedIdQueries);
1116  this->option("ignore.empty", this->config.linkedIgnoreEmpty);
1117  this->option("json", this->config.linkedJSON);
1118  this->option("link", this->config.linkedLink);
1119  this->option("overwrite", this->config.linkedOverwrite);
1120  this->option("target.table", this->config.linkedTargetTable);
1121  this->option("tidy.texts", this->config.linkedTidyTexts);
1122  this->option("warnings.empty", this->config.linkedWarningsEmpty);
1123 
1124  // expected number of results
1125  this->category("expected");
1126  this->option("error.if.larger", this->config.expectedErrorIfLarger);
1127  this->option("error.if.smaller", this->config.expectedErrorIfSmaller);
1128  this->option("parsed.column", this->config.expectedParsedColumn);
1129  this->option("parsed.table", this->config.expectedParsedTable);
1130  this->option("query", this->config.expectedQuery);
1131  this->option("source", this->config.expectedSource);
1132  }
1133 
1135 
1139  inline void Config::checkOptions() {
1140  // check for target table
1141  if(this->config.generalTargetTable.empty()) {
1142  throw Exception(
1143  "Parser::Config::checkOptions():"
1144  " No target table has been specified."
1145  );
1146  }
1147 
1148  // remove obvious protocols from given URLs
1149  for(auto& url : this->config.variablesTokensSource) {
1150  Config::removeProtocolsFromUrl(url);
1151  }
1152 
1153  Config::removeProtocolsFromUrl(this->config.sourceUrl);
1154  Config::removeProtocolsFromUrl(this->config.sourceUrlFirst);
1155 
1156  // check properties of variables
1157  bool incompleteVariables{false};
1158 
1159  const auto completeVariables{
1160  std::min({
1161  /* number of complete variables (= min. size of name and source arrays) */
1162  this->config.variablesName.size(),
1163  this->config.variablesSource.size()
1164  })
1165  };
1166 
1167  // remove variable names that are not used
1168  if(this->config.variablesName.size() > completeVariables) {
1169  this->config.variablesName.resize(completeVariables);
1170 
1171  incompleteVariables = true;
1172  }
1173 
1174  // remove variable sources that are not used
1175  if(this->config.variablesSource.size() > completeVariables) {
1176  this->config.variablesSource.resize(completeVariables);
1177 
1178  incompleteVariables = true;
1179  }
1180 
1181  // warn about incomplete variables
1182  if(incompleteVariables) {
1183  this->warning(
1184  "'variables.name' and '.source'"
1185  " should have the same number of elements."
1186  );
1187 
1188  this->warning("Incomplete variable(s) removed from configuration.");
1189 
1190  incompleteVariables = false;
1191  }
1192 
1193  // remove variable queries that are not used, add empty query where none is specified
1194  if(this->config.variablesQuery.size() > completeVariables) {
1195  incompleteVariables = true;
1196  }
1197 
1198  this->config.variablesQuery.resize(completeVariables);
1199 
1200  if(this->config.variablesSkipQuery.size() > completeVariables) {
1201  incompleteVariables = true;
1202  }
1203 
1204  this->config.variablesSkipQuery.resize(completeVariables);
1205 
1206  // remove variable tables that are not used, add empty table where none is specified
1207  if(this->config.variablesParsedTable.size() > completeVariables) {
1208  incompleteVariables = true;
1209  }
1210 
1211  this->config.variablesParsedTable.resize(completeVariables);
1212 
1213  // remove variable columns that are not used, add empty column where none is specified
1214  if(this->config.variablesParsedColumn.size() > completeVariables) {
1215  incompleteVariables = true;
1216  }
1217 
1218  this->config.variablesParsedColumn.resize(completeVariables);
1219 
1220  // remove variable date/time formats that are not used, add empty format where none is specified
1221  if(this->config.variablesDateTimeFormat.size() > completeVariables) {
1222  incompleteVariables = true;
1223  }
1224 
1225  this->config.variablesDateTimeFormat.resize(completeVariables);
1226 
1227  // remove variable date/time locales that are not used, add empty locale where none is specified
1228  if(this->config.variablesDateTimeLocale.size() > completeVariables) {
1229  incompleteVariables = true;
1230  }
1231 
1232  this->config.variablesDateTimeLocale.resize(completeVariables);
1233 
1234  // remove variable aliases that are not used, add empty alias where none is specified
1235  if(this->config.variablesAlias.size() > completeVariables) {
1236  incompleteVariables = true;
1237  }
1238 
1239  this->config.variablesAlias.resize(completeVariables);
1240 
1241  // remove variable alias values that are not used, add empty alias value where none is specified
1242  if(this->config.variablesAliasAdd.size() > completeVariables) {
1243  incompleteVariables = true;
1244  }
1245 
1246  this->config.variablesAliasAdd.resize(completeVariables);
1247 
1248  // warn about unused properties
1249  if(incompleteVariables) {
1250  this->warning("Unused variable properties removed from configuration.");
1251  }
1252 
1253  // check properties of tokens
1254  bool incompleteTokens{false};
1255 
1256  const auto completeTokens{
1257  std::min({
1258  /* number of complete tokens (= min. size of arrays) */
1259  this->config.variablesTokens.size(),
1260  this->config.variablesTokensSource.size(),
1261  this->config.variablesTokensQuery.size()
1262  })
1263  };
1264 
1265  // remove token variable names that are not used
1266  if(this->config.variablesTokens.size() > completeTokens) {
1267  this->config.variablesTokens.resize(completeTokens);
1268 
1269  incompleteTokens = true;
1270  }
1271 
1272  // remove token sources that are not used
1273  if(this->config.variablesTokensSource.size() > completeTokens) {
1274  this->config.variablesTokensSource.resize(completeTokens);
1275 
1276  incompleteTokens = true;
1277  }
1278 
1279  // remove token queries that are not used
1280  if(this->config.variablesTokensQuery.size() > completeTokens) {
1281  this->config.variablesTokensQuery.resize(completeTokens);
1282 
1283  incompleteTokens = true;
1284  }
1285 
1286  // warn about incomplete tokens
1287  if(incompleteTokens) {
1288  this->warning(
1289  "'variables.tokens', '.tokens.source' and '.tokens.query'"
1290  " should have the same number of elements."
1291  );
1292 
1293  this->warning("Incomplete token(s) removed from configuration.");
1294 
1295  incompleteTokens = false;
1296  }
1297 
1298  // remove cookie headers that are not used, set to empty string where none is specified
1299  if(this->config.variablesTokensCookies.size() > completeTokens) {
1300  incompleteTokens = true;
1301  }
1302 
1303  this->config.variablesTokensCookies.resize(completeTokens);
1304 
1305  // remove token POST options that are not used, set to 'false' where none is specified
1306  if(this->config.variablesTokensUsePost.size() > completeTokens) {
1307  incompleteTokens = true;
1308  }
1309 
1310  this->config.variablesTokensUsePost.resize(completeTokens, false);
1311 
1312  // warn about unused property
1313  if(incompleteTokens) {
1314  this->warning(
1315  "Unused token properties removed from configuration."
1316  );
1317  }
1318 
1319  // check properties of date/time queries
1320  bool incompleteDateTimes{false};
1321 
1322  // remove date/time formats that are not used, add empty format where none is specified
1323  if(
1324  this->config.extractingDateTimeFormats.size()
1325  > this->config.extractingDateTimeQueries.size()
1326  ) {
1327  incompleteDateTimes = true;
1328  }
1329 
1330  this->config.extractingDateTimeFormats.resize(
1331  this->config.extractingDateTimeQueries.size()
1332  );
1333 
1334  // remove date/time locales that are not used, add empty locale where none is specified
1335  if(
1336  this->config.extractingDateTimeLocales.size()
1337  > this->config.extractingDateTimeQueries.size()
1338  ) {
1339  incompleteDateTimes = true;
1340  }
1341 
1342  this->config.extractingDateTimeLocales.resize(
1343  this->config.extractingDateTimeQueries.size()
1344  );
1345 
1346  // warn about unused properties
1347  if(incompleteDateTimes) {
1348  this->warning(
1349  "Unused date/time properties removed from configuration."
1350  );
1351  }
1352 
1353  // replace empty date/time formats with "%F %T"
1354  std::replace_if(
1355  this->config.extractingDateTimeFormats.begin(),
1356  this->config.extractingDateTimeFormats.end(),
1357  [](const auto& str) {
1358  return str.empty();
1359  },
1360  "%F %T"
1361  );
1362 
1363  // check properties of fields
1364  const auto completeFields{
1365  std::min(
1366  this->config.extractingFieldNames.size(),
1367  this->config.extractingFieldQueries.size()
1368  )
1369  };
1370 
1371  bool incompleteFields{false};
1372 
1373  // remove field names or queries that are not used
1374  if(this->config.extractingFieldNames.size() > completeFields) {
1375  incompleteFields = true;
1376 
1377  this->config.extractingFieldNames.resize(completeFields);
1378  }
1379  else if(this->config.extractingFieldQueries.size() > completeFields) {
1380  incompleteFields = true;
1381 
1382  this->config.extractingFieldQueries.resize(completeFields);
1383  }
1384 
1385  // warn about incomplete fields
1386  if(incompleteFields) {
1387  this->warning(
1388  "'variables.field.names' and '.field.queries'"
1389  " should have the same number of elements."
1390  );
1391 
1392  this->warning("Incomplete field(s) removed from configuration.");
1393 
1394  incompleteFields = false;
1395  }
1396 
1397  // remove date/time formats that are not used, add empty format where none is specified
1398  if(this->config.extractingFieldDateTimeFormats.size() > completeFields) {
1399  incompleteFields = true;
1400  }
1401 
1402  this->config.extractingFieldDateTimeFormats.resize(completeFields);
1403 
1404  // remove date/time locales that are not used, add empty locale where none is specified
1405  if(this->config.extractingFieldDateTimeLocales.size() > completeFields) {
1406  incompleteFields = true;
1407  }
1408 
1409  this->config.extractingFieldDateTimeLocales.resize(completeFields);
1410 
1411  // remove field delimiters that are not used, add empty delimiter (\0) where none is specified
1412  if(this->config.extractingFieldDelimiters.size() > completeFields) {
1413  incompleteFields = true;
1414  }
1415 
1416  this->config.extractingFieldDelimiters.resize(completeFields, '\0');
1417 
1418  // replace all empty field delimiters with '\n'
1419  std::replace_if(
1420  this->config.extractingFieldDelimiters.begin(),
1421  this->config.extractingFieldDelimiters.end(),
1422  [](char c) {
1423  return c == '\0';
1424  },
1425  '\n'
1426  );
1427 
1428  // remove 'ignore empty values' properties that are not used, set to 'true' where none is specified
1429  if(this->config.extractingFieldIgnoreEmpty.size() > completeFields) {
1430  incompleteFields = true;
1431  }
1432 
1433  this->config.extractingFieldIgnoreEmpty.resize(completeFields, true);
1434 
1435  // remove 'save field as JSON' properties that are not used, set to 'false' where none is specified
1436  if(this->config.extractingFieldJSON.size() > completeFields) {
1437  incompleteFields = true;
1438  }
1439 
1440  this->config.extractingFieldJSON.resize(completeFields, false);
1441 
1442  // remove 'tidy text' properties that are not used, set to 'false' where none is specified
1443  if(this->config.extractingFieldTidyTexts.size() > completeFields) {
1444  incompleteFields = true;
1445  }
1446 
1447  this->config.extractingFieldTidyTexts.resize(completeFields, false);
1448 
1449  // remove 'warning if empty' properties that are not used, set to 'false' where none is specified
1450  if(this->config.extractingFieldWarningsEmpty.size() > completeFields) {
1451  incompleteFields = true;
1452  }
1453 
1454  this->config.extractingFieldWarningsEmpty.resize(completeFields, false);
1455 
1456  // warn about unused properties
1457  if(incompleteFields) {
1458  this->warning(
1459  "Unused field properties for extraction removed from configuration."
1460  );
1461  }
1462 
1463  // check properties of linked fields
1464  const auto completeLinkedFields{
1465  std::min(
1466  this->config.linkedFieldNames.size(),
1467  this->config.linkedFieldQueries.size()
1468  )
1469  };
1470 
1471  incompleteFields = false;
1472 
1473  // remove field names or queries that are not used
1474  if(this->config.linkedFieldNames.size() > completeLinkedFields) {
1475  incompleteFields = true;
1476 
1477  this->config.linkedFieldNames.resize(completeLinkedFields);
1478  }
1479  else if(this->config.linkedFieldQueries.size() > completeLinkedFields) {
1480  incompleteFields = true;
1481 
1482  this->config.linkedFieldQueries.resize(completeLinkedFields);
1483  }
1484 
1485  // warn about incomplete fields
1486  if(incompleteFields) {
1487  this->warning(
1488  "'linked.field.names' and '.field.queries'"
1489  " should have the same number of elements."
1490  );
1491 
1492  this->warning("Incomplete field(s) removed from configuration.");
1493 
1494  incompleteFields = false;
1495  }
1496 
1497  // remove date/time formats that are not used, add empty format where none is specified
1498  if(this->config.linkedDateTimeFormats.size() > completeLinkedFields) {
1499  incompleteFields = true;
1500  }
1501 
1502  this->config.linkedDateTimeFormats.resize(completeFields);
1503 
1504  // remove date/time locales that are not used, add empty locale where none is specified
1505  if(this->config.linkedDateTimeLocales.size() > completeLinkedFields) {
1506  incompleteFields = true;
1507  }
1508 
1509  this->config.linkedDateTimeLocales.resize(completeLinkedFields);
1510 
1511  // remove field delimiters that are not used, add empty delimiter (\0) where none is specified
1512  if(this->config.linkedDelimiters.size() > completeLinkedFields) {
1513  incompleteFields = true;
1514  }
1515 
1516  this->config.linkedDelimiters.resize(completeLinkedFields, '\0');
1517 
1518  // replace all empty field delimiters with '\n'
1519  std::replace_if(
1520  this->config.linkedDelimiters.begin(),
1521  this->config.linkedDelimiters.end(),
1522  [](char c) {
1523  return c == '\0';
1524  },
1525  '\n'
1526  );
1527 
1528  // remove 'ignore empty values' properties that are not used, set to 'true' where none is specified
1529  if(this->config.linkedIgnoreEmpty.size() > completeLinkedFields) {
1530  incompleteFields = true;
1531  }
1532 
1533  this->config.linkedIgnoreEmpty.resize(completeLinkedFields, true);
1534 
1535  // remove 'save field as JSON' properties that are not used, set to 'false' where none is specified
1536  if(this->config.linkedJSON.size() > completeLinkedFields) {
1537  incompleteFields = true;
1538  }
1539 
1540  this->config.linkedJSON.resize(completeLinkedFields, false);
1541 
1542  // remove 'tidy text' properties that are not used, set to 'false' where none is specified
1543  if(this->config.linkedTidyTexts.size() > completeLinkedFields) {
1544  incompleteFields = true;
1545  }
1546 
1547  this->config.linkedTidyTexts.resize(completeLinkedFields, false);
1548 
1549  // remove 'warning if empty' properties that are not used, set to 'false' where none is specified
1550  if(this->config.linkedWarningsEmpty.size() > completeLinkedFields) {
1551  incompleteFields = true;
1552  }
1553 
1554  this->config.linkedWarningsEmpty.resize(completeLinkedFields, false);
1555 
1556  // warn about unused properties
1557  if(incompleteFields) {
1558  this->warning(
1559  "Unused field properties for linked data removed from configuration."
1560  );
1561  }
1562  }
1563 
1565  inline void Config::reset() {
1566  this->config = {};
1567  }
1568 
1569  /*
1570  * INTERNAL HELPER FUNCTION (private)
1571  */
1572 
1573  // remove obvious protocol(s) from beginning of URL
1574  inline void Config::removeProtocolsFromUrl(std::string& inOut) {
1575  for(const auto& protocol : protocolsToRemove) {
1576  while(
1577  inOut.length() >= protocol.length() &&
1578  inOut.substr(0, protocol.length()) == protocol.data()
1579  ) {
1580  inOut = inOut.substr(protocol.length());
1581  }
1582  }
1583  }
1584 
1585 } /* namespace crawlservpp::Module::Extractor */
1586 
1587 #endif /* MODULE_EXTRACTOR_CONFIG_HPP_ */
std::vector< std::uint64_t > variablesTokensQuery
Query to extract token variable with the same array index.
Definition: Config.hpp:367
std::vector< std::string > variablesAlias
Alias for the variable with same array index.
Definition: Config.hpp:239
bool extractingRepairCData
Specifies whether to (try to) repair CData when parsing HTML/XML.
Definition: Config.hpp:747
std::uint8_t generalLogging
Level of logging activity.
Definition: Config.hpp:168
Class for extractor configuration exceptions.
Definition: Config.hpp:988
void option(const std::string &name, bool &target)
Checks for a configuration option of type bool.
Definition: Config.hpp:573
void checkOptions() override
Checks the extractor-specific configuration options.
Definition: Config.hpp:1139
std::vector< std::uint64_t > variablesQuery
Query on the content or URL for the variable with the same array index.
Definition: Config.hpp:322
std::vector< bool > extractingFieldIgnoreEmpty
Specifies whether to ignore empty values when parsing multiple results for the field with the same ar...
Definition: Config.hpp:651
bool generalMinimizeMemory
Specifies whether to free small amounts of unused memory more often, at the expense of performance...
Definition: Config.hpp:174
std::uint64_t generalCacheSize
Number of URLs fetched and extracted from before saving results.
Definition: Config.hpp:159
std::string expectedParsedTable
Name of the table containing the expected number of datasets.
Definition: Config.hpp:956
std::vector< std::uint64_t > linkedFieldQueries
Query used to extract the custom field with the same array index from the dataset.
Definition: Config.hpp:869
std::string expectedParsedColumn
Parsed column containing the expected number of datasets.
Definition: Config.hpp:946
std::uint64_t pagingNumberFrom
Query to determine the total number of pages from the content of the first page.
Definition: Config.hpp:484
std::vector< std::uint64_t > extractingRecursive
Queries for extracting more datasets from a dataset.
Definition: Config.hpp:738
std::uint64_t generalSleepHttp
Time that will be waited between HTTP requests, in milliseconds.
Definition: Config.hpp:200
std::int64_t generalReTries
Number of re-tries on connection errors.
Definition: Config.hpp:188
constexpr std::uint8_t expectedSourceParsed
Extract data from parsed data.
Definition: Config.hpp:80
constexpr std::uint64_t defaultSleepIdleMs
Default time to wait before checking for new URLs when all URLs have been processed, in milliseconds.
Definition: Config.hpp:110
std::vector< std::string > extractingDateTimeFormats
Format of date/time to be extracted by the date/time query with the same array index.
Definition: Config.hpp:572
bool extractingRemoveXmlInstructions
Specifies whether to remove XML processing instructions (<?xml:...>) before parsing HTML content...
Definition: Config.hpp:753
constexpr std::uint64_t defaultSleepMySqlS
Default time to wait before last try to re-connect to MySQL server, in seconds.
Definition: Config.hpp:113
std::string sourceUrl
URL to retrieve data from.
Definition: Config.hpp:521
std::uint8_t expectedSource
Source of the query to retrieve the expected number of datasets.
Definition: Config.hpp:977
std::vector< std::uint64_t > extractingDatasetQueries
Queries to extract datasets.
Definition: Config.hpp:552
std::uint64_t pagingNextFrom
Query on page content to find the number(s) or name(s) of additional pages.
Definition: Config.hpp:471
constexpr std::uint8_t variablesSourcesContent
Extract variable value from the content of a crawled web page.
Definition: Config.hpp:71
bool extractingRemoveDuplicates
Specifies whether to remove duplicate datasets over multiple pages before checking the expected numbe...
Definition: Config.hpp:744
bool extractingOverwrite
Specifies whether, if a dataset with the same ID already exists, it will be overwritten.
Definition: Config.hpp:731
Abstract class containing the network-specific configuration for threads.
Definition: Config.hpp:121
std::vector< std::string > extractingFieldDateTimeLocales
Locale used when converting the field with the same array index to a date/time.
Definition: Config.hpp:634
std::vector< std::string > variablesDateTimeFormat
Date/time format to be used for the variable with the same array index.
Definition: Config.hpp:267
constexpr std::uint8_t generalLoggingExtended
Extended logging is enabled.
Definition: Config.hpp:62
#define MAIN_EXCEPTION_CLASS()
Macro used to easily define classes for general exceptions.
Definition: Exception.hpp:50
std::string generalTargetTable
Name of table to save extracted data to.
Definition: Config.hpp:180
std::vector< std::string > linkedIdIgnore
IDs of linked data to be ignored.
Definition: Config.hpp:875
std::uint64_t pagingIsNextFrom
Query on page content to determine whether there is another page.
Definition: Config.hpp:459
std::vector< std::uint64_t > extractingDateTimeQueries
Queries used for extracting date/time from the dataset.
Definition: Config.hpp:590
constexpr std::array protocolsToRemove
Protocols to remove from URLs.
Definition: Config.hpp:89
std::int64_t pagingFirst
Number of the first page.
Definition: Config.hpp:441
std::vector< std::string > variablesTokensCookies
Custom HTTP Cookie header for the token variable with the same array index.
Definition: Config.hpp:361
std::int64_t pagingAliasAdd
Value to add to the alias for the paging variable.
Definition: Config.hpp:435
constexpr std::uint8_t expectedSourceContent
Extract data from the content of a crawled web page.
Definition: Config.hpp:83
std::vector< std::uint64_t > extractingFieldQueries
The query used to extract the custom field with the same array index from the data.
Definition: Config.hpp:702
bool generalReExtract
Specifies whether to re-extract data from already processed URLs.
Definition: Config.hpp:177
std::vector< std::string > extractingFieldDateTimeFormats
Date/time format of the field with the same array index.
Definition: Config.hpp:623
std::uint64_t extractingRecursiveMaxDepth
Maximum depth of recursive extracting.
Definition: Config.hpp:741
std::vector< bool > variablesTokensUsePost
Specifies whether to use HTTP POST instead of GET for the token variable with the same array index...
Definition: Config.hpp:404
void parseOption() override
Parses an extractor-specific configuration option.
Definition: Config.hpp:1014
std::vector< bool > linkedWarningsEmpty
Specifies whether to write a warning to the log when the field with the same array index is empty...
Definition: Config.hpp:926
std::vector< char > extractingFieldDelimiters
Delimiter between multiple results for the field with the same array index, if not saved as JSON...
Definition: Config.hpp:643
std::vector< std::string > linkedDateTimeFormats
Date/time format of the linked field with the same array index.
Definition: Config.hpp:788
std::vector< bool > linkedJSON
Specifies whether to save the value of the field with the same array index as a JSON array...
Definition: Config.hpp:899
bool expectedErrorIfSmaller
Specifies whether to throw an exception when the number of expected datasets is not reached.
Definition: Config.hpp:936
std::vector< std::uint64_t > extractingErrorRetry
Queries to detect temporary errors in the data.
Definition: Config.hpp:604
std::string sourceUrlFirst
URL of the first page to retrieve data from.
Definition: Config.hpp:531
constexpr std::uint64_t defaultSleepErrorMs
Default sleeping time on connection errors, in milliseconds.
Definition: Config.hpp:104
std::vector< std::string > linkedDateTimeLocales
Date/time locale of the linked field with the same array index.
Definition: Config.hpp:799
std::uint64_t generalSleepMySql
Time to wait before last try to re-connect to MySQL server, in seconds.
Definition: Config.hpp:206
std::uint64_t generalSleepIdle
Time to wait before checking for new URLs when all URLs have been processed, in milliseconds.
Definition: Config.hpp:203
void reset() override
Resets the extractor-specific configuration options.
Definition: Config.hpp:1565
bool generalTidyWarnings
Specifies whether to write tidyhtml warnings to the log.
Definition: Config.hpp:220
std::vector< std::uint64_t > extractingErrorFail
Queries to detect fatal errors in the data.
Definition: Config.hpp:597
std::vector< std::string > variablesTokenHeaders
Custom HTTP headers to be used for ALL token variables.
Definition: Config.hpp:410
constexpr std::int64_t defaultReTries
Default re-tries on connection error.
Definition: Config.hpp:101
void category(const std::string &category)
Sets the category of the subsequent configuration items to be checked for.
Definition: Config.hpp:527
std::vector< bool > extractingFieldJSON
Save the value of the field with the same array index as a JSON array.
Definition: Config.hpp:657
std::vector< char > linkedDelimiters
Delimiter between multiple results for the field with the same array index, if not saved as JSON...
Definition: Config.hpp:808
bool extractingRepairComments
Specifies whether to (try to) repair broken HTML/XML comments.
Definition: Config.hpp:750
std::vector< std::uint8_t > variablesSource
Source of the variable with the same array index.
Definition: Config.hpp:340
bool sourceUsePost
Specifies whether to use HTTP POST instead of HTTP GET for extracting data.
Definition: Config.hpp:541
std::vector< std::uint64_t > linkedIdQueries
Queries to extract the linked ID from the dataset.
Definition: Config.hpp:885
std::vector< bool > linkedTidyTexts
Specifies whether to remove line breaks and unnecessary whitespaces when extracting the linked field ...
Definition: Config.hpp:917
constexpr std::uint8_t variablesSourcesParsed
Extract variable value from parsed data.
Definition: Config.hpp:68
std::string sourceCookies
Custom HTTP Cookie header used when retrieving data.
Definition: Config.hpp:510
std::vector< std::string > variablesParsedTable
Name of the table containing the parsed data for the variable with the same array index...
Definition: Config.hpp:312
bool linkedOverwrite
Specifies whether, if a linked dataset with the same ID already exists, it will be overwritten...
Definition: Config.hpp:908
constexpr std::uint64_t defaultSleepHttpMs
Default time that will be waited between HTTP requests, in milliseconds.
Definition: Config.hpp:107
std::vector< std::string > variablesTokens
List of token variables.
Definition: Config.hpp:355
constexpr std::uint8_t crawlerLoggingVerbose
Logging is disabled.
Definition: Config.hpp:56
constexpr std::uint8_t generalLoggingVerbose
Verbose logging is enabled.
Definition: Config.hpp:65
bool generalExtractCustom
Specifies whether to include custom URLs when extracting.
Definition: Config.hpp:162
constexpr std::uint8_t variablesSourcesUrl
Extract variable value from the URL of a crawled web page.
Definition: Config.hpp:74
std::vector< bool > linkedIgnoreEmpty
Specifies whether to ignore empty values when parsing multiple results for the field with the same ar...
Definition: Config.hpp:893
std::uint16_t generalMaxBatchSize
Maximum number of URLs and results processed in one MySQL query.
Definition: Config.hpp:171
std::uint64_t generalSleepError
Sleeping time on connection errors, in milliseconds.
Definition: Config.hpp:197
std::vector< std::uint32_t > generalRetryHttp
HTTP errors that will be handled like connection errors.
Definition: Config.hpp:191
Configuration entries for extractor threads.
Definition: Config.hpp:151
std::vector< bool > extractingFieldTidyTexts
Specifies whether to remove line breaks and unnecessary whitespaces when extracting the field with th...
Definition: Config.hpp:708
std::string linkedTargetTable
Name of the table to save linked data to.
Definition: Config.hpp:911
std::vector< std::string > variablesParsedColumn
Parsed column for the value of the variable with the same array index.
Definition: Config.hpp:302
std::vector< std::string > variablesTokensSource
Source URL for the token variable with the same array index.
Definition: Config.hpp:398
struct crawlservpp::Module::Extractor::Config::Entries config
Configuration of the extractor.
constexpr std::uint64_t defaultRecursiveMaxDepth
Default maximum depth of recursive extracting.
Definition: Config.hpp:128
constexpr std::uint16_t defaultMaxBatchSize
Default number of URLs and results to be processed in one MySQL query.
Definition: Config.hpp:98
std::uint32_t generalLock
URL locking time, in seconds.
Definition: Config.hpp:165
Configuration for extractors.
Definition: Config.hpp:137
constexpr std::uint64_t defaultCacheSize
Default cache size.
Definition: Config.hpp:92
std::string pagingFirstString
Name of the first page.
Definition: Config.hpp:452
constexpr std::array defaultRetryHttpStatusCodes
HTTP status codes to retry by default.
Definition: Config.hpp:86
constexpr auto defaultPagingVariable
Default name of the paging variable.
Definition: Config.hpp:125
std::vector< std::uint64_t > variablesSkipQuery
Queries to be used on the value of the variable with the same array index to determine whether to ski...
Definition: Config.hpp:281
std::vector< std::string > linkedFieldNames
Names of the linked data fields.
Definition: Config.hpp:863
std::vector< std::uint64_t > extractingIdQueries
Queries to extract the ID from the dataset.
Definition: Config.hpp:728
constexpr std::uint8_t expectedSourceExtracting
Extract data from other extracted data.
Definition: Config.hpp:77
Namespace for extractor classes.
Definition: Config.hpp:44
std::vector< std::string > sourceHeaders
Custom HTTP headers used when retrieving data.
Definition: Config.hpp:513
std::vector< std::int64_t > variablesAliasAdd
Value to add to the variable alias with the same array index.
Definition: Config.hpp:248
std::vector< std::string > variablesDateTimeLocale
Date/time locale to be used for the variable with the same array index.
Definition: Config.hpp:278
std::uint64_t extractingSkipQuery
Extracting will proceed to the next URL if the current page fulfills this query.
Definition: Config.hpp:756
constexpr std::uint32_t defaultLockS
Default locking time, in seconds.
Definition: Config.hpp:95
bool expectedErrorIfLarger
Specifies whether to throw an exception when the number of expected datasets is exceeded.
Definition: Config.hpp:933
std::string pagingAlias
Alias for the paging variable.
Definition: Config.hpp:426
std::vector< std::string > extractingDateTimeLocales
Locale used by the date/time query with the same array index for extracting date and time...
Definition: Config.hpp:580
std::vector< std::string > variablesName
Variable names.
Definition: Config.hpp:293
std::uint32_t generalTidyErrors
Number of tidyhtml errors to write to the log.
Definition: Config.hpp:213
bool generalTiming
Specifies whether to calculate timing statistics for the extractor.
Definition: Config.hpp:223
constexpr std::uint8_t generalLoggingDefault
Default logging is enabled.
Definition: Config.hpp:59
std::vector< std::uint64_t > linkedDatasetQueries
Queries to extract linked datasets.
Definition: Config.hpp:769
void warning(const std::string &warning)
Adds a warning to the logging queue.
Definition: Config.hpp:2427
std::int64_t pagingStep
Number to add to page variable for retrieving the next page, if a page number is used.
Definition: Config.hpp:491
std::string linkedLink
Name of the extracted field that links an extracted dataset to the ID of a linked dataset...
Definition: Config.hpp:905
std::uint64_t expectedQuery
Query to be performed to retrieve the expected number of datasets.
Definition: Config.hpp:966
std::vector< std::string > extractingFieldNames
The names of the custom fields to extract.
Definition: Config.hpp:696
Configuration item containing its category, name, and JSON value.
Definition: ConfigItem.hpp:41
std::vector< bool > extractingFieldWarningsEmpty
Specifies whether to write a warning to the log when the field with the same array index is empty...
Definition: Config.hpp:717
std::vector< std::string > extractingIdIgnore
Extracted IDs to be ignored.
Definition: Config.hpp:720
std::string pagingVariable
Name of the paging variable.
Definition: Config.hpp:503